In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from time import gmtime, strftime

# Read the NALSA 2011 workbook, skipping its first row, and put the
# placeholder "NR" into every cell that came back empty.
df = (
    pd.read_excel("Copy of DK- NALSA 2011 Document 03_06_17 version.xlsx", skiprows=1)
      .fillna("NR")
)

In [2]:
# Replace the headers read from the NALSA 2011 sheet with short upper-case
# identifiers; the list is positional, one entry per column of df.
colnames = [
    'STATE',
    'DISTRICTS',
    'LEGALAID_JAILS',
    'JAIL_NAME',
    'LEGALAID_CONSTI',
    'DATE_LEGALAID',
    'NUM_LWYR_VISITING',
    'NUM_PARALEGAL_PRISONER',
    'NUM_PARALEGAL_COMMUNITY',
]
df.columns = colnames


In [3]:
# Size of the loaded dataset as a (rows, columns) tuple.
df.shape

(379, 9)

In [4]:
# Preview of the first 5 rows (the default for head()) after the column rename.
df.head()

Unnamed: 0,STATE,DISTRICTS,LEGALAID_JAILS,JAIL_NAME,LEGALAID_CONSTI,DATE_LEGALAID,NUM_LWYR_VISITING,NUM_PARALEGAL_PRISONER,NUM_PARALEGAL_COMMUNITY
0,TELANGANA,NIZAMABAD,DISTRICT JAIL,NIZAMABAD,YES,2011-03-25 00:00:00,40,Not responded,Not responded
1,TELANGANA,NIZAMABAD,SUB JAIL,ARMOOR,YES,2015-09-05 00:00:00,29,Not responded,Not responded
2,TELANGANA,NIZAMABAD,SUB JAIL,KAMAREDDY,YES,2015-09-05 00:00:00,25,Not responded,Not responded
3,TELANGANA,NIZAMABAD,SUB JAIL,BODHAN,YES,2015-09-05 00:00:00,28,Not responded,Not responded
4,KARNATAKA,KOLAR,DISTRICT JAIL,"DISTRICT SUB JAIL, KOLAR",YES,2016-02-01 00:00:00,1,0,0


In [5]:
# Column labels currently set on the DataFrame.
df.columns

Index(['STATE', 'DISTRICTS', 'LEGALAID_JAILS', 'JAIL_NAME', 'LEGALAID_CONSTI',
       'DATE_LEGALAID', 'NUM_LWYR_VISITING', 'NUM_PARALEGAL_PRISONER',
       'NUM_PARALEGAL_COMMUNITY'],
      dtype='object')

In [6]:
# Raw answer spellings grouped under the code that replaces them:
#   "NA" - "not applicable" variants
#   "NP" - answers that carry no usable figure (presumably "not provided")
#   "NR" - "no response" variants
# The values are matched exactly with isin(), which is why case and
# trailing-space variants of the same phrase are listed separately.
rule_dict = {
    "NA": [
        'N.A', 'NOT APPLICABLE', 'NOT APPLICABLE ', 'NA ',
        'Not Applicable', 'Not Applicable ', 'N.A ', 'Not applicable',
    ],
    "NP": [
        ' nan,', 'nan', ' nan', 'Nil', 'ATTACHED', 'Cumm', 'Cumm Response',
        'Data not collated', 'DATA NOT MAINTAINED',
        'DOES NOT ARISE', 'INFO. PERTAINS TO COURTS', 'INFOMATION PERTAINS TO COURTS',
        'INFORMATION NOT AVAILABLE', 'NIL', 'No', 'NO MAINTAINED', 'NO OF STAFF PROVIDED',
        'NO RECORD AVAILBALE', 'NO SUCH DATA MAINTAINED', 'None', 'Not available',
        'NOT COMPILED',
        'Not constituted', 'NOT COUNTED', 'NOT KNOWN', 'NOT MAINTAINED', 'Not Provided',
        'NOT REPORTED', 'NOT SUBMITTED BY PANEL LAWYER', 'PENDING', 'RECVIED THROUGH SDLSCs',
        'REPORT NOT RECEIVED', 'Response not clear', 'ATTACHED ',
        'THE SERVICE OF JUNIOR ADMINISTRATIVE ASSISTANT WHO ATTACHED TO DLSAs ABOVE IS UTILIZED',
        'To check', 'To check attachment', 'Not provided', 'NOT AVAILABLE', 'NOT COMPILED ',
        'NOT AVAILABLE ', 'NOT COUNTED ', 'NOT KNOWN ', 'INFOMATION PERTAINS TO COURTS ',
        'PENDING ', 'NOT REPORTED ',
        'REPORT NOT RECEIVED ', 'INFORMATION NOT AVAILABLE ', 'DATA NOT COLLATED ',
        'NOT SUBMITTED BY PANEL LAWYER ',
        'PA on rotation', 'DATA IS NOT AVAILABLE ',
    ],
    "NR": [
        'No Response', 'Not responded', 'Not Responded', 'No response', '-',
    ],
}

In [7]:
# Map every column name to the distinct values in it that are not plain
# digit strings, i.e. the entries that still need a textual treatment.
str_values_by_columns = {
    col_name: [entry for entry in df[col_name].unique() if not str(entry).isdigit()]
    for col_name in df.columns
}

# Sub-strings (compared case-insensitively) that mark an affirmative answer.
yvals = ['YES']

# Raw values containing any of the affirmative sub-strings; these are the
# strings to be replaced in the data. STATE and DISTRICTS are not scanned.
yes_str_val = []
for col_name, raw_values in str_values_by_columns.items():
    if col_name in ['STATE', 'DISTRICTS']:
        continue
    yes_str_val.extend(
        entry for entry in raw_values
        if any(marker.lower() in str(entry).lower() for marker in yvals)
    )

# Replacing the listed strings with their respective codes.
# Assignment goes through df.loc[mask, column] rather than the chained form
# df[column][mask] = value: the chained form writes to an intermediate object,
# is not guaranteed to reach df, and never does under pandas copy-on-write.
yes_values = list(set(yes_str_val))  # de-duplicated once, outside the loop
for cols in df.columns:
    # every spelling listed in rule_dict collapses to its key ("NA", "NP", "NR")
    for k in rule_dict.keys():
        hits = df[cols].isin(rule_dict[k])
        # skip empty masks so a purely numeric column is never handed a string
        if hits.any():
            df.loc[hits, cols] = k
    # strings recognised as an affirmative answer become 1
    hits = df[cols].isin(yes_values)
    if hits.any():
        df.loc[hits, cols] = 1


In [8]:
# Replacing every remaining value other than NP / NR / NA with zero in the
# answer columns (the descriptive columns and the raw date are left alone).
# NOTE: anything whose str() is not all digits counts as "remaining", so a
# float such as 2.0 would be zeroed as well.
# df.loc is used instead of chained df[col][mask] = 0, which is not guaranteed
# to modify df and is a no-op under pandas copy-on-write.
for cols in df.columns:
    if cols in ['STATE','DISTRICTS','LEGALAID_JAILS','JAIL_NAME','DATE_LEGALAID']:
        continue
    leftovers = [val for val in df[cols].unique()
                 if not str(val).isdigit() and val not in ['NP','NR','NA']]
    hits = df[cols].isin(leftovers)
    if hits.any():
        df.loc[hits, cols] = 0

In [9]:
# extracting the year from the raw DATE_LEGALAID values
def yearLEGALAID(x):
    """Reduce one raw date cell to a year where that is possible.

    Parameters
    ----------
    x : object
        A cell value: a date/datetime object, a string, a number or a code
        such as "NR".

    Returns
    -------
    object
        * ``x.year`` (int) for date/datetime objects, including subclasses
          such as ``pandas.Timestamp``;
        * ``"20"`` + the last two characters (a str) for strings containing
          ``-`` or ``/``, after spaces and commas are removed, so the year is
          assumed to be 20xx;
        * ``x`` unchanged for everything else (numbers, ``None``, strings
          without a date separator).
    """
    # isinstance rather than an exact type comparison, so datetime subclasses
    # are handled here instead of falling through to the regex below
    if isinstance(x, datetime.date):
        return x.year
    # only strings may reach re.search; floats, None, numpy ints, ... would
    # raise TypeError there
    if isinstance(x, str) and re.search(r'-|/', x):
        return "20" + re.sub(r' |,', '', x)[-2:]
    return x
        
# Derive the year column from the raw date column.
df['YR_LEGALAID'] = df['DATE_LEGALAID'].apply(yearLEGALAID)

# Manual corrections for values the generic extraction cannot resolve,
# keyed by the year they stand for.
year_fixes = {
    2015: ['2051', '20D)'],
    2013: ['IN THE YEAR 2013', 'DURING JULY 2013'],
    2012: ['JUNE , 2012'],  # was mapped to 2013 although the text says 2012
}
for year, raw_values in year_fixes.items():
    # .loc instead of chained df[col][mask] = year, which is not guaranteed to
    # modify df and is a no-op under pandas copy-on-write
    df.loc[df['YR_LEGALAID'].isin(raw_values), 'YR_LEGALAID'] = year

In [10]:
# Sanity check: list, per column, the distinct values that are still not
# plain digit strings after the clean-up above.
str_values_by_columns_check = {
    col_name: [entry for entry in df[col_name].unique() if not str(entry).isdigit()]
    for col_name in df.columns
}
# The descriptive columns and the raw date are expected to hold text, so
# only the remaining columns are reported.
for col_name, leftovers in str_values_by_columns_check.items():
    if col_name in ['STATE','DISTRICTS','LEGALAID_JAILS','JAIL_NAME','DATE_LEGALAID']:
        continue
    print (col_name, leftovers)

LEGALAID_CONSTI ['NR', 'NP']
NUM_LWYR_VISITING ['NR', 'NP']
NUM_PARALEGAL_PRISONER ['NR', 'NP']
NUM_PARALEGAL_COMMUNITY ['NR', 'NP']
YR_LEGALAID ['NR', 'NA', 'NP']


In [11]:
# Drop the raw date column. A non-inplace drop with errors='ignore' replaces
# drop(..., inplace=True) so that re-running this cell does not raise a
# KeyError once the column is already gone.
df = df.drop(columns=['DATE_LEGALAID'], errors='ignore')
# exporting to csv; the file name carries the current UTC time
# (YYYY-MM-DD-HH-MM-SS)
df.to_csv("NALSA_2011_cleaned_" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".csv")
df.head()

Unnamed: 0,STATE,DISTRICTS,LEGALAID_JAILS,JAIL_NAME,LEGALAID_CONSTI,NUM_LWYR_VISITING,NUM_PARALEGAL_PRISONER,NUM_PARALEGAL_COMMUNITY,YR_LEGALAID
0,TELANGANA,NIZAMABAD,DISTRICT JAIL,NIZAMABAD,1,40,NR,NR,2011
1,TELANGANA,NIZAMABAD,SUB JAIL,ARMOOR,1,29,NR,NR,2015
2,TELANGANA,NIZAMABAD,SUB JAIL,KAMAREDDY,1,25,NR,NR,2015
3,TELANGANA,NIZAMABAD,SUB JAIL,BODHAN,1,28,NR,NR,2015
4,KARNATAKA,KOLAR,DISTRICT JAIL,"DISTRICT SUB JAIL, KOLAR",1,1,0,0,2016
