In [10]:
import pandas as pd
import numpy as np
import re

# Load the raw NALSA remand & bail responses workbook; skiprows=1 skips the first row of the sheet.
source_workbook = "Copy of DK- NALSA Remand & Bail Responses.xlsx"
df = pd.read_excel(source_workbook, skiprows=1)

In [11]:
# Replace the spreadsheet headers with short, upper-snake-case column names
# (one name per column, in sheet order).
colnames = [
    'STATE',
    'DISTRICT',
    'LEGAL_AID_MANDATORY',
    'NUM_MAGISTRATE_COURTS_TOTAL',
    'NUM_MAGISTRATE_COURTS_REMAND',
    'NUM_REMAND_BAIL_LAWYERS',
    'TENURE_LWYRS',
    'LWYRS_APNTD_FOR_REMAND_COURTS',
    'LWYRS_TRAINED',
    'NUM_ACCSD_RPRSNTD_POLSTATN',
    'NUM_ACCSD_RPRSNTD_COURT',
    'NUM_GRANTED_BAIL_POLSTATN',
    'NUM_GRANTED_BAIL_COURT',
    'NUM_RELEASED_POLSTATN',
    'NUM_RELEASED_COURT',
    'NUM_DISCHARGED',
    'DLSA_TO_SLSA',
    'NUM_ATTNDNC_CERTIFICATE',
    'NUM_REPORTS_LWYRS_SUBMITTED',
    'NUM_CASES_FEE_COMPLAINT',
    'NUM_LWYRS_REM',
]

df.columns = colnames

# Keep only the columns whose name does not begin with "REM"
# (none of the names assigned above do, so every column is retained).
kept_columns = [name for name in df.columns if not name.startswith("REM")]
df = df[kept_columns]

In [12]:
# shape of the dataset as a (rows, columns) tuple
df.shape

(206, 21)

In [13]:
# preview the first 5 rows (the default for head())
df.head()

Unnamed: 0,STATE,DISTRICT,LEGAL_AID_MANDATORY,NUM_MAGISTRATE_COURTS_TOTAL,NUM_MAGISTRATE_COURTS_REMAND,NUM_REMAND_BAIL_LAWYERS,TENURE_LWYRS,LWYRS_APNTD_FOR_REMAND_COURTS,LWYRS_TRAINED,NUM_ACCSD_RPRSNTD_POLSTATN,...,NUM_GRANTED_BAIL_POLSTATN,NUM_GRANTED_BAIL_COURT,NUM_RELEASED_POLSTATN,NUM_RELEASED_COURT,NUM_DISCHARGED,DLSA_TO_SLSA,NUM_ATTNDNC_CERTIFICATE,NUM_REPORTS_LWYRS_SUBMITTED,NUM_CASES_FEE_COMPLAINT,NUM_LWYRS_REM
0,DELHI,NORTH- EAST,No,5,5,5,1,Yes,Yes,Not Responded,...,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,Yes,Not Provided,Not provided,Not Provided,0
1,DELHI,NORTH,Not provided,11,11,13,3,Yes,Yes,Not Responded,...,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,Not Provided
2,DELHI,EAST,Not provided,12,12,8,2,Yes,Yes,0,...,0,0,Not provided,0,0,Not provided,2,Not provided,45,0
3,DELHI,WEST,No,17,16,13,Not Provided,Yes,Yes,Not Responded,...,Not Responded,Not Responded,Not Responded,Not Responded,Not Responded,No,Not provided,Not Responded,40,0\n8 Resigned
4,DELHI,SOUTH,No,12,12,9,6,YES,Yes,0,...,0,66,0,0,0,Yes,22,20,0,0


In [14]:
# list of column labels currently on the frame
df.columns

Index(['STATE', 'DISTRICT', 'LEGAL_AID_MANDATORY',
       'NUM_MAGISTRATE_COURTS_TOTAL', 'NUM_MAGISTRATE_COURTS_REMAND',
       'NUM_REMAND_BAIL_LAWYERS', 'TENURE_LWYRS',
       'LWYRS_APNTD_FOR_REMAND_COURTS', 'LWYRS_TRAINED',
       'NUM_ACCSD_RPRSNTD_POLSTATN', 'NUM_ACCSD_RPRSNTD_COURT',
       'NUM_GRANTED_BAIL_POLSTATN', 'NUM_GRANTED_BAIL_COURT',
       'NUM_RELEASED_POLSTATN', 'NUM_RELEASED_COURT', 'NUM_DISCHARGED',
       'DLSA_TO_SLSA', 'NUM_ATTNDNC_CERTIFICATE',
       'NUM_REPORTS_LWYRS_SUBMITTED', 'NUM_CASES_FEE_COMPLAINT',
       'NUM_LWYRS_REM'],
      dtype='object')

In [15]:
# Clean STATE names and remove duplicate DISTRICT rows.

# Known misspellings, keyed exactly as they appear in the raw data (trailing space included).
state_fixes = {
    'ANDRHA PRADESH ': 'ANDHRA PRADESH ',
    'JHARKHNAD ': 'JHARKHAND',
    'MAHARASTRA ': 'MAHARASHTRA ',
}

# Build the corrected column and attach it with assign() instead of the chained
# `df.STATE[mask] = value` form, which writes through a temporary Series and is
# not guaranteed to reach the frame. Surrounding whitespace is stripped after the
# spelling fixes, as before.
df = df.assign(STATE=df['STATE'].replace(state_fixes).str.strip())

# Keep the first row for each DISTRICT value.
# NOTE(review): duplicates are detected on DISTRICT alone, so two districts that
# share a name in different states collapse to one row -- confirm this is intended.
df = df.drop_duplicates(subset=['DISTRICT'])


In [16]:
# Recode free-text answers in the data columns to integer codes.

# Work on an explicit copy so the assignments below modify this frame directly
# (df may be a slice of an earlier frame).
df = df.copy()

# Identifier columns: never scanned for patterns and never overwritten with codes.
id_cols = ['STATE', 'DISTRICT']
value_cols = [col for col in df.columns if col not in id_cols]

# Dictionary of column name -> distinct values whose string form is not all digits
str_values_by_columns = {}

for cols in df.columns:
    str_values_by_columns[cols] = [val for val in df[cols].unique() if not str(val).isdigit()]

# Sub-strings (matched case-insensitively) and the code given to any value containing one:
#   nvals -> 0 (not available / no), yvals -> 1 (yes), monthvals -> 12 (monthly).
# 'NAN' also catches missing values, because str() of a float NaN is 'nan'.
# NOTE(review): there is no separate handling for bi-monthly (6) or quarterly (4);
# any value containing "month" -- including "bi-monthly" -- is coded 12. Confirm.
nvals = ['NOT','N.A','NO','NIL','NAN','ATTACH','CHECK','PENDING','MAINTAIN']
yvals = ['YES']
monthvals = ['MONTH',]


def _values_containing(values, needles):
    """Return the items of `values` whose lower-cased str() contains any of `needles`."""
    lowered = [needle.lower() for needle in needles]
    return [value for value in values if any(needle in str(value).lower() for needle in lowered)]


# Lists of the actual cell values (pooled over all data columns) to be replaced
na_str_val = []
yes_str_val = []
mon_str_val = []

for k in value_cols:
    na_str_val.extend(_values_containing(str_values_by_columns[k], nvals))
    yes_str_val.extend(_values_containing(str_values_by_columns[k], yvals))
    mon_str_val.extend(_values_containing(str_values_by_columns[k], monthvals))

# Order matters: a value that matches several groups takes the first code applied
# (0 before 1 before 12), because once replaced it no longer matches later groups.
replacement_codes = [
    (list(set(na_str_val)), 0),
    (list(set(yes_str_val)), 1),
    (list(set(mon_str_val)), 12),
]

# Replace the matched strings with their codes, in the data columns only.
for cols in value_cols:
    for matched_values, code in replacement_codes:
        hits = df[cols].isin(matched_values)
        if not hits.any():
            continue
        if not pd.api.types.is_numeric_dtype(df[cols]):
            # hold text columns as generic objects so integer codes can be written into them
            df[cols] = df[cols].astype(object)
        # .loc writes into the frame itself; the chained df[cols][mask] = code form may not
        df.loc[hits, cols] = code

# Treating a few outlier cases separately
# The source text says 335, so store 335 (the earlier value of 355 did not match the text).
df.loc[df.NUM_REMAND_BAIL_LAWYERS.isin(['335 throughout the state']), 'NUM_REMAND_BAIL_LAWYERS'] = 335
df.loc[df.TENURE_LWYRS.isin(['Till the tenure as panel lawyer']), 'TENURE_LWYRS'] = 12
df.loc[df.NUM_LWYRS_REM.isin(['0\n8 Resigned']), 'NUM_LWYRS_REM'] = 0

In [17]:
# Sanity check: for every data column, print the values that are still not purely numeric.
# An empty list means the column has been fully recoded.
str_values_by_columns_check = {
    name: [value for value in df[name].unique() if not str(value).isdigit()]
    for name in df.columns
}
for name, leftovers in str_values_by_columns_check.items():
    if name in ['STATE', 'DISTRICT']:
        # identifier columns are expected to hold text
        continue
    print(name, leftovers)

LEGAL_AID_MANDATORY []
NUM_MAGISTRATE_COURTS_TOTAL []
NUM_MAGISTRATE_COURTS_REMAND []
NUM_REMAND_BAIL_LAWYERS []
TENURE_LWYRS []
LWYRS_APNTD_FOR_REMAND_COURTS []
LWYRS_TRAINED []
NUM_ACCSD_RPRSNTD_POLSTATN []
NUM_ACCSD_RPRSNTD_COURT []
NUM_GRANTED_BAIL_POLSTATN []
NUM_GRANTED_BAIL_COURT []
NUM_RELEASED_POLSTATN []
NUM_RELEASED_COURT []
NUM_DISCHARGED []
DLSA_TO_SLSA []
NUM_ATTNDNC_CERTIFICATE []
NUM_REPORTS_LWYRS_SUBMITTED []
NUM_CASES_FEE_COMPLAINT []
NUM_LWYRS_REM []


In [18]:
# Write the cleaned frame to CSV (row index included), then preview the first rows
cleaned_csv_name = "NALSA_Remand-Bail_cleaned.csv"
df.to_csv(cleaned_csv_name)
df.head()

Unnamed: 0,STATE,DISTRICT,LEGAL_AID_MANDATORY,NUM_MAGISTRATE_COURTS_TOTAL,NUM_MAGISTRATE_COURTS_REMAND,NUM_REMAND_BAIL_LAWYERS,TENURE_LWYRS,LWYRS_APNTD_FOR_REMAND_COURTS,LWYRS_TRAINED,NUM_ACCSD_RPRSNTD_POLSTATN,...,NUM_GRANTED_BAIL_POLSTATN,NUM_GRANTED_BAIL_COURT,NUM_RELEASED_POLSTATN,NUM_RELEASED_COURT,NUM_DISCHARGED,DLSA_TO_SLSA,NUM_ATTNDNC_CERTIFICATE,NUM_REPORTS_LWYRS_SUBMITTED,NUM_CASES_FEE_COMPLAINT,NUM_LWYRS_REM
0,DELHI,NORTH- EAST,0,5,5,5,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
1,DELHI,NORTH,0,11,11,13,3,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,DELHI,EAST,0,12,12,8,2,1,1,0,...,0,0,0,0,0,0,2,0,45,0
3,DELHI,WEST,0,17,16,13,0,1,1,0,...,0,0,0,0,0,0,0,0,40,0
4,DELHI,SOUTH,0,12,12,9,6,1,1,0,...,0,66,0,0,0,1,22,20,0,0
