In [47]:
import pandas as pd
import numpy as np
import re

# Load the raw NALSA 2010 responses into a DataFrame. skiprows=1 makes pandas
# skip the first row of the sheet before reading the header row.
# NOTE(review): presumably the skipped row is a title/banner line - confirm against the workbook.
df = pd.read_excel("NALSA 2010 response.xlsx", skiprows=1)

In [48]:
# Replace the headers of the NALSA 2010 sheet with the short names listed in
# the COL_NAME column of the data dictionary.
df_colnames = pd.read_excel("DataDict_NALSA2010.xlsx").fillna(0)
unique_names = list(df_colnames['COL_NAME'].unique())
# every unique name except the one at position 6 is used as a header
colnames = unique_names[:6] + unique_names[7:]

df.columns = colnames
# keep only the columns whose new name does not begin with "REM"
kept_columns = [name for name in df.columns if not name.startswith("REM")]
df = df[kept_columns]

In [49]:
# shape of the dataset as a (number of rows, number of columns) tuple,
# measured after the columns starting with "REM" were removed
df.shape

(246, 27)

In [50]:
# top 5 rows of the renamed dataset, still holding the raw free-text answers
df.head()

Unnamed: 0,STATE,DISTRICTS,NUM_LWYR_PANEL,NUM_LWYR_RETAINER,NUM_LWYR_PANEL_All,SEP_PANELS,NUM_APLCTN_PRISON,NUM_APLCTN_COURT,NUM_LWYR_APNTD_PRISON,NUM_LWYR_APNTD_COURT,...,NUM_ACQTL,NUM_DSPSD,NUM_CMPLT_REPORT,NUM_WTDRW,MNTRNG_COMTT,NUM_MNTRNG_COMTT_STAFF,NUM_RPRT_PNL_TO_MNTRNG,NUM_RPRT_MNTRNG,NUM_CASE_CMPLN_PNL,NUM_LWYR_REM_PNL
0,West Bengal,BALURGHAT,102,0,"S-70, L-0,JO-0.M-7,LP-0",Yes\n,322,212,322,212,...,Data not collated,167,174,0,Yes,0,,0,No complaints received,Nil
1,HIMACHAL PRADESH,BILASPUR,Not Provided,Not provided,7,Yes,1190,1190,IN ALL THE CASES,All,...,NO SUCH DATA MAINTAINED,Not Provided,Not Provided,Not provided,Yes,0,,Not provided,No complaints received,Nil
2,HIMACHAL PRADESH,MANDI,Not responded,Not responded,Not Responded,Not Provided,Not Responded,Not Responded,Not Responded,Not Responded,...,Not Responded,Not Responded,Not responded,Not responded,Not Responded,Not responded,Not Responded,Not Responded,Not Responded,Not Responded
3,HIMACHAL PRADESH,To clarify the name,96,15,S-36,No,9,26,6,26,...,2,5,2,4,Yes,Not responded,21,12,No complaints received,Nil
4,HIMACHAL PRADESH,SIRMAUR,22,Not responded,Not Responded,No,0,17,0,13,...,Not Responded,1,Not responded,Not responded,Yes,Not responded,Not Responded,Not Responded,No complaints received,Nil


In [51]:
# list of columns that remain after the renaming and the removal of the "REM" columns
df.columns

Index(['STATE', 'DISTRICTS', 'NUM_LWYR_PANEL', 'NUM_LWYR_RETAINER',
       'NUM_LWYR_PANEL_All', 'SEP_PANELS', 'NUM_APLCTN_PRISON',
       'NUM_APLCTN_COURT', 'NUM_LWYR_APNTD_PRISON', 'NUM_LWYR_APNTD_COURT',
       'NUM_LWYR_APNTD_PRISON_NOT1', 'NUM_LWYR_APNTD_COURT_NOT1',
       'NUM_LWYR_APNTD_PRISON_NOT2', 'NUM_LWYR_APNTD_COURT_NOT2',
       'NUM_ACSD_RPSNTD_PANEL', 'NUM_ACSD_RPSNTD_RETAINER', 'NUM_BAIL_REQ',
       'NUM_ACQTL', 'NUM_DSPSD', 'NUM_CMPLT_REPORT', 'NUM_WTDRW',
       'MNTRNG_COMTT', 'NUM_MNTRNG_COMTT_STAFF', 'NUM_RPRT_PNL_TO_MNTRNG',
       'NUM_RPRT_MNTRNG', 'NUM_CASE_CMPLN_PNL', 'NUM_LWYR_REM_PNL'],
      dtype='object')

In [52]:
# Build a dictionary: column name -> distinct cell values of that column that
# are not plain digit strings (free-text answers, NaN, ...). These are the
# candidates for recoding into numbers below.
str_values_by_columns = {}

for cols in df.columns:
    str_values_by_columns[cols] = [val for val in df[cols].unique() if not str(val).isdigit()]

# Case-insensitive sub-strings that decide how a free-text answer is recoded:
#   NA-like -> 0 / yes -> 1 / monthly -> 12 / bi-monthly -> 6 / quarterly -> 4
# The spelling variants ('RECIEVE', 'RECVIED', 'CONSTIUTED', 'QUATER') are
# matched against the data, so they must stay exactly as written.
nvals = ['NOT','N.A','NO','NIL','NAN','ATTACH','CHECK','PENDING','PERTAIN','CUMM']
yvals = ['YES']
monthvals = ['MONTH', 'REGULAR', 'RECIEVE', 'MAINTAIN', 'RECVIED', 'CONSTIUTED']
quartvals = ['QUART','QUATER']
bimonvals = ['BI']
# Full cell values (exactly as they occur in the data) that contain one of the
# sub-strings above; these are what actually gets replaced.
na_str_val = []
yes_str_val = []
mon_str_val = []
quart_str_val = []
bimon_str_val = []


def _values_containing(values, needles):
    """Return the entries of `values` whose text contains any of `needles`, ignoring case."""
    lowered = [needle.lower() for needle in needles]
    return [x for x in values if any(needle in str(x).lower() for needle in lowered)]


# Gather the strings to be replaced from every column except the excluded ones.
# NOTE(review): 'NUM_LWYR_PANEL_SR' matches none of the columns listed by
# df.columns in this notebook, so that entry currently excludes nothing -
# confirm whether 'NUM_LWYR_PANEL_All' was intended. Left unchanged here.
for k in str_values_by_columns.keys():
    if k not in ['STATE','DISTRICTS','NUM_LWYR_PANEL_SR','NUM_MNTRNG_COMTT_STAFF']:
        na_str_val = na_str_val + _values_containing(str_values_by_columns[k], nvals)
        yes_str_val = yes_str_val + _values_containing(str_values_by_columns[k], yvals)
        mon_str_val = mon_str_val + _values_containing(str_values_by_columns[k], monthvals)
        bimon_str_val = bimon_str_val + _values_containing(str_values_by_columns[k], bimonvals)
        quart_str_val = quart_str_val + _values_containing(str_values_by_columns[k], quartvals)

# df was produced by selecting a subset of columns, so take an explicit copy
# before assigning into it; this avoids SettingWithCopyWarning on the writes below.
df = df.copy()

# Replace the collected strings with their numeric codes.
# The order matters: a value present in several lists gets the code of the
# first list that matches (NA first), because once it has been replaced by a
# number it no longer matches the later lists.
# Writes go through df.loc[mask, column]; the chained form df[col][mask] = v
# writes to a temporary object and is not guaranteed to modify df.
replacement_codes = [
    (list(set(na_str_val)), 0),      # NA-like answers
    (list(set(yes_str_val)), 1),     # yes
    (list(set(mon_str_val)), 12),    # monthly
    (list(set(bimon_str_val)), 6),   # bi-monthly
    (list(set(quart_str_val)), 4),   # quarterly
]
# Every column is scanned (including the ones excluded from the collection
# step above), but only cells equal to one of the collected strings change.
for cols in df.columns:
    for matched_values, code in replacement_codes:
        df.loc[df[cols].isin(matched_values), cols] = code

# Treating a few outlier cases separately.
# "All" / "IN ALL THE CASES": the number of lawyers appointed is taken from the
# number of applications in the same row.
all_court = df['NUM_LWYR_APNTD_COURT'].isin(['All'])
df.loc[all_court, 'NUM_LWYR_APNTD_COURT'] = df.loc[all_court, 'NUM_APLCTN_COURT']
all_prison = df['NUM_LWYR_APNTD_PRISON'].isin(['IN ALL THE CASES'])
df.loc[all_prison, 'NUM_LWYR_APNTD_PRISON'] = df.loc[all_prison, 'NUM_APLCTN_PRISON']
# The remaining one-off answers are recoded to 0.
for outlier_col, outlier_text in [
    ('NUM_LWYR_APNTD_COURT_NOT1', 'IN 9 CASES LEGAL AID PROVIDED'),
    ('NUM_LWYR_APNTD_PRISON_NOT1', 'IN 9 CASES LEGAL AID PROVIDED'),
    ('NUM_ACQTL', 'R11 (1) OF NALSA 2010-(267)'),
]:
    df.loc[df[outlier_col].isin([outlier_text]), outlier_col] = 0

In [53]:
# function calculates the sum of all numbers in a string (i.e. composition of Panel/Monitoring committee)
def calcNum(panelVal):
    """Return the total of all standalone numbers contained in ``panelVal``.

    Parameters
    ----------
    panelVal : int, numpy integer, float, None or str
        Either an already numeric count or a free-text composition such as
        ``"S-70, L-0,JO-0.M-7,LP-0"``.

    Returns
    -------
    int
        * integers (Python or NumPy) are returned as a plain ``int``;
        * ``None`` and NaN count as 0;
        * whole-number floats (e.g. ``12.0``) are converted to ``int``;
        * for strings, the sum of every run of digits delimited by word
          boundaries (``"S-70"`` contributes 70, whereas digits glued to
          letters such as ``"S70"`` are ignored); 0 if there is none.

    Raises
    ------
    TypeError
        For a float that is not a whole number, and for any other non-string
        value (raised by ``re.findall``).
    """
    # isinstance (rather than an exact type check) so that NumPy integers,
    # which pandas may hand to apply(), are accepted as well.
    if isinstance(panelVal, (int, np.integer)):
        return int(panelVal)
    if panelVal is None:
        return 0
    if isinstance(panelVal, (float, np.floating)):
        if np.isnan(panelVal):
            return 0
        if float(panelVal).is_integer():
            return int(panelVal)
        raise TypeError("cannot interpret %r as a count" % (panelVal,))
    return sum(int(s) for s in re.findall(r'\b\d+\b', panelVal))

In [54]:
# NUM_LWYR_PANEL_TOT (new column) and NUM_MNTRNG_COMTT_STAFF (overwritten) hold
# the total number of members on the panel / monitoring committee, obtained by
# passing every cell through calcNum
df['NUM_LWYR_PANEL_TOT'] = df['NUM_LWYR_PANEL_All'].apply(calcNum)
df['NUM_MNTRNG_COMTT_STAFF'] = df['NUM_MNTRNG_COMTT_STAFF'].apply(calcNum)

In [55]:
# Sanity check: list, per column, the distinct values that are still not plain
# digit strings after the cleaning; an empty list means nothing is left untreated.
str_values_by_columns_check = {
    col_name: [val for val in df[col_name].unique() if not str(val).isdigit()]
    for col_name in df.columns
}
# these columns are expected to keep their text and are not reported
not_reported = ['STATE', 'DISTRICTS', 'NUM_LWYR_PANEL_All']
for col_name, leftovers in str_values_by_columns_check.items():
    if col_name in not_reported:
        continue
    print(col_name, leftovers)

NUM_LWYR_PANEL []
NUM_LWYR_RETAINER []
SEP_PANELS []
NUM_APLCTN_PRISON []
NUM_APLCTN_COURT []
NUM_LWYR_APNTD_PRISON []
NUM_LWYR_APNTD_COURT []
NUM_LWYR_APNTD_PRISON_NOT1 []
NUM_LWYR_APNTD_COURT_NOT1 []
NUM_LWYR_APNTD_PRISON_NOT2 []
NUM_LWYR_APNTD_COURT_NOT2 []
NUM_ACSD_RPSNTD_PANEL []
NUM_ACSD_RPSNTD_RETAINER []
NUM_BAIL_REQ []
NUM_ACQTL []
NUM_DSPSD []
NUM_CMPLT_REPORT []
NUM_WTDRW []
MNTRNG_COMTT []
NUM_MNTRNG_COMTT_STAFF []
NUM_RPRT_PNL_TO_MNTRNG []
NUM_RPRT_MNTRNG []
NUM_CASE_CMPLN_PNL []
NUM_LWYR_REM_PNL []
NUM_LWYR_PANEL_TOT []


In [56]:
# Drop the raw panel-composition column before exporting.
# The result is re-bound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run: an in-place drop raises KeyError the second time because
# the column is already gone.
df = df.drop(columns='NUM_LWYR_PANEL_All', errors='ignore')
# exporting to csv (the row index is written as the first, unnamed column)
df.to_csv("NALSA_2010_cleaned.csv")
df.head()

Unnamed: 0,STATE,DISTRICTS,NUM_LWYR_PANEL,NUM_LWYR_RETAINER,SEP_PANELS,NUM_APLCTN_PRISON,NUM_APLCTN_COURT,NUM_LWYR_APNTD_PRISON,NUM_LWYR_APNTD_COURT,NUM_LWYR_APNTD_PRISON_NOT1,...,NUM_DSPSD,NUM_CMPLT_REPORT,NUM_WTDRW,MNTRNG_COMTT,NUM_MNTRNG_COMTT_STAFF,NUM_RPRT_PNL_TO_MNTRNG,NUM_RPRT_MNTRNG,NUM_CASE_CMPLN_PNL,NUM_LWYR_REM_PNL,NUM_LWYR_PANEL_TOT
0,West Bengal,BALURGHAT,102,0,1,322,212,322,212,0,...,167,174,0,1,0,0,0,0,0,77
1,HIMACHAL PRADESH,BILASPUR,0,0,1,1190,1190,1190,1190,0,...,0,0,0,1,0,0,0,0,0,7
2,HIMACHAL PRADESH,MANDI,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HIMACHAL PRADESH,To clarify the name,96,15,0,9,26,6,26,0,...,5,2,4,1,0,21,12,0,0,36
4,HIMACHAL PRADESH,SIRMAUR,22,0,0,0,17,0,13,0,...,1,0,0,1,0,0,0,0,0,0
