In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from time import gmtime, strftime

# Load the "District Responses" sheet of the workbook, skipping the first
# spreadsheet row (skiprows=1). Every empty cell is filled with the string "NR",
# so blanks end up carrying the same code that rule_dict later assigns to
# "No Response"-style answers.
df = pd.read_excel("Copy of DK- NALSA 2011 Document 03_06_17 version.xlsx", "District Responses", skiprows=1).fillna("NR")

In [2]:
# Swap the sheet's header labels for short upper-case identifiers.
# The assignment is positional: one name per column, in sheet order.
colnames = [
    'STATE',
    'DISTRICTS',
    'NUM_VISITS_LWYR',
    'NUM_TRAINING_DLSA_LWYR',
    'NUM_DAYS_LEGALAID_PARA',
    'NUM_TRAINING_DLSA_PRISONER_PARA',
    'NUM_TRAINING_DLSA_COMMUNITY_PARA',
    'SIGNBOARD',
]
df.columns = colnames


In [3]:
# Dimensions of the loaded sheet as (rows, columns)
df.shape

(208, 8)

In [4]:
# Preview the first five rows to see the raw (not yet normalised) response values
df.head()

Unnamed: 0,STATE,DISTRICTS,NUM_VISITS_LWYR,NUM_TRAINING_DLSA_LWYR,NUM_DAYS_LEGALAID_PARA,NUM_TRAINING_DLSA_PRISONER_PARA,NUM_TRAINING_DLSA_COMMUNITY_PARA,SIGNBOARD
0,KARNATAKA,KOLAR,48,0,0,0,0,YES
1,KARNATAKA,MANGALURU,46,Not Responded,0,0,0,YES
2,KARNATAKA,VIJAYAPURA,36,0,0,0,0,YES
3,KARNATAKA,MANDYA,96,0,Not Responded,NR,Not responded,YES
4,KARNATAKA,UDUPI,2,0,Not provided,0,Not provided,YES


In [5]:
# Column labels after the rename above
df.columns

Index(['STATE', 'DISTRICTS', 'NUM_VISITS_LWYR', 'NUM_TRAINING_DLSA_LWYR',
       'NUM_DAYS_LEGALAID_PARA', 'NUM_TRAINING_DLSA_PRISONER_PARA',
       'NUM_TRAINING_DLSA_COMMUNITY_PARA', 'SIGNBOARD'],
      dtype='object')

In [6]:
# Lookup table: response code -> every raw spelling that should collapse to it.
# Matching downstream is exact (Series.isin), which is why case variants and
# trailing-space variants are listed explicitly.
rule_dict = {
    # "NA": the question was answered as not applicable
    "NA": [
        "N.A",
        "NOT APPLICABLE",
        "NOT APPLICABLE ",
        "NA ",
        "Not Applicable",
        "Not Applicable ",
        "N.A ",
        "Not applicable",
    ],
    # "NP": an answer exists but no usable figure was provided
    "NP": [
        " nan,",
        "nan",
        " nan",
        "Nil",
        "ATTACHED",
        "Cumm",
        "Cumm Response",
        "Data not collated",
        "DATA NOT MAINTAINED",
        "DOES NOT ARISE",
        "INFO. PERTAINS TO COURTS",
        "INFOMATION PERTAINS TO COURTS",
        "INFORMATION NOT AVAILABLE",
        "NIL",
        "No",
        "NO MAINTAINED",
        "NO OF STAFF PROVIDED",
        "NO RECORD AVAILBALE",
        "NO SUCH DATA MAINTAINED",
        "None",
        "Not available",
        "NOT COMPILED",
        "Not constituted",
        "NOT COUNTED",
        "NOT KNOWN",
        "NOT MAINTAINED",
        "Not Provided",
        "NOT REPORTED",
        "NOT SUBMITTED BY PANEL LAWYER",
        "PENDING",
        "RECVIED THROUGH SDLSCs",
        "REPORT NOT RECEIVED",
        "Response not clear",
        "ATTACHED ",
        "THE SERVICE OF JUNIOR ADMINISTRATIVE ASSISTANT WHO ATTACHED TO DLSAs ABOVE IS UTILIZED",
        "To check",
        "To check attachment",
        "Not provided",
        "NOT AVAILABLE",
        "NOT COMPILED ",
        "NOT AVAILABLE ",
        "NOT COUNTED ",
        "NOT KNOWN ",
        "INFOMATION PERTAINS TO COURTS ",
        "PENDING ",
        "NOT REPORTED ",
        "REPORT NOT RECEIVED ",
        "INFORMATION NOT AVAILABLE ",
        "DATA NOT COLLATED ",
        "NOT SUBMITTED BY PANEL LAWYER ",
        "PA on rotation",
        "DATA IS NOT AVAILABLE ",
    ],
    # "NR": no response at all
    "NR": [
        "No Response",
        "Not responded",
        "Not Responded",
        "No response",
        "-",
    ],
}

In [7]:
# Normalise free-text answers in place:
#   * any spelling listed in rule_dict becomes its code ("NA" / "NP" / "NR");
#   * any value containing "yes" (case-insensitive) becomes the integer 1.

# For each column, collect the distinct values whose string form is not made
# up purely of digits, i.e. everything that still needs textual treatment.
str_values_by_columns = {}

for cols in df.columns:
    str_values_by_columns[cols] = [val for val in df[cols].unique() if not str(val).isdigit()]

# Sub-strings that mark an affirmative answer.
yvals = ['YES']
# Actual cell values (from the response columns only) that contain one of them.
yes_str_val = []

for k in str_values_by_columns.keys():
    if k not in ['STATE','DISTRICTS',]:
        yes_str_val = yes_str_val + [x for x in str_values_by_columns[k] if any(yv.lower() in str(x).lower() for yv in yvals)]

# De-duplicate once instead of on every loop iteration.
yes_values = list(set(yes_str_val))

# Apply the replacements with a single .loc[row_mask, column] assignment.
# The previous df[cols][mask] = value form is chained indexing: it writes to an
# intermediate Series, triggers SettingWithCopyWarning, and does not modify df
# at all when pandas Copy-on-Write is active.
# NOTE(review): as before, the replacements are applied to every column,
# STATE and DISTRICTS included - confirm that is intended.
for cols in df.columns:
    for k in rule_dict.keys():
        code_mask = df[cols].isin(rule_dict[k])
        # Skip empty masks so purely numeric columns are never touched.
        if code_mask.any():
            df.loc[code_mask, cols] = k
    # The YES mask is computed after the code replacement, matching the original order.
    yes_mask = df[cols].isin(yes_values)
    if yes_mask.any():
        df.loc[yes_mask, cols] = 1


In [8]:
# Reduce one cell value to a number: sum of all whole numbers found in a string,
# with numeric inputs and the NP/NR/NA codes passed through untouched.
def calcNum(panelVal):
    """Return the numeric content of a single cell value.

    Parameters
    ----------
    panelVal : int, float, numpy scalar, or str
        The raw cell value.

    Returns
    -------
    The input unchanged when it is already a number or one of the codes
    'NP', 'NR', 'NA'; otherwise the integer sum of every standalone run of
    digits in its string form (0 when there are none).
    """
    # Numbers pass straight through. The previous `type(x) == int` test missed
    # floats and numpy scalars, which then reached re.findall and raised
    # TypeError because they are not strings.
    if isinstance(panelVal, (int, float, np.integer, np.floating)):
        return panelVal
    # Placeholder codes are kept as they are.
    if panelVal in ['NP', 'NR', 'NA']:
        return panelVal
    # str() guards against any other non-string cell value reaching the regex.
    return sum(int(s) for s in re.findall(r'\b\d+\b', str(panelVal)))

In [9]:
# Run calcNum over the count columns so that free-text answers containing
# numbers are reduced to the sum of those numbers.
# NOTE(review): NUM_TRAINING_DLSA_COMMUNITY_PARA is not in this list, so it is
# not passed through calcNum - confirm whether that omission is deliberate.
for count_col in ['NUM_VISITS_LWYR',
                  'NUM_TRAINING_DLSA_LWYR',
                  'NUM_DAYS_LEGALAID_PARA',
                  'NUM_TRAINING_DLSA_PRISONER_PARA']:
    df[count_col] = df[count_col].apply(calcNum)

In [10]:
# In every response column, set to 0 any remaining value that is neither a
# digit-only value nor one of the codes NP / NR / NA.
# The write uses df.loc[row_mask, column]; the previous df[cols][mask] = 0 form
# is chained indexing, which triggers SettingWithCopyWarning and leaves df
# unmodified when pandas Copy-on-Write is active.
for cols in df.columns:
    if cols not in ['STATE','DISTRICTS',]:
        untreated = [val for val in df[cols].unique() if not str(val).isdigit() and val not in ['NP','NR','NA']]
        untreated_mask = df[cols].isin(untreated)
        # Nothing to do (and nothing to upcast) when the column is already clean.
        if untreated_mask.any():
            df.loc[untreated_mask, cols] = 0

In [11]:
# Sanity check: list what is left in each response column that is not a
# digit-only value. Only the NP / NR codes are expected to show up here.
str_values_by_columns_check = {}
for cols in df.columns:
    remaining = []
    for val in df[cols].unique():
        if str(val).isdigit():
            continue
        remaining.append(val)
    str_values_by_columns_check[cols] = remaining

for k, leftovers in str_values_by_columns_check.items():
    # Identifier columns are free text by nature, so they are not reported.
    if k in ('STATE', 'DISTRICTS'):
        continue
    print (k, leftovers)

NUM_VISITS_LWYR ['NP', 'NR']
NUM_TRAINING_DLSA_LWYR ['NR', 'NP']
NUM_DAYS_LEGALAID_PARA ['NR', 'NP']
NUM_TRAINING_DLSA_PRISONER_PARA ['NR', 'NP']
NUM_TRAINING_DLSA_COMMUNITY_PARA ['NR', 'NP']
SIGNBOARD ['NR', 'NP']


In [12]:
# Clean up STATE names: trim surrounding whitespace, then correct known misspellings.
# (No de-duplication of DISTRICTS is performed here.)
# The column is rewritten in one assignment; the previous df.STATE[mask] = value
# form is chained indexing, which triggers SettingWithCopyWarning and leaves df
# unmodified when pandas Copy-on-Write is active.
state_spelling_fixes = {
    'ANDRHA PRADESH': 'ANDHRA PRADESH',
    'JHARKHNAD': 'JHARKHAND',
    'MAHARASTRA': 'MAHARASHTRA',
    'PANJAB': 'PUNJAB',
}
# Stripping first lets each misspelling be listed once, whether or not the
# sheet carried a trailing space after it.
df['STATE'] = df['STATE'].str.strip().replace(state_spelling_fixes)

# Export to CSV with a UTC timestamp in the file name (YYYY-MM-DD-HH-MM-SS).
df.to_csv("NALSA_2011_DistResp_cleaned_" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".csv")
df.head()

Unnamed: 0,STATE,DISTRICTS,NUM_VISITS_LWYR,NUM_TRAINING_DLSA_LWYR,NUM_DAYS_LEGALAID_PARA,NUM_TRAINING_DLSA_PRISONER_PARA,NUM_TRAINING_DLSA_COMMUNITY_PARA,SIGNBOARD
0,KARNATAKA,KOLAR,48,0,0,0,0,1
1,KARNATAKA,MANGALURU,46,NR,0,0,0,1
2,KARNATAKA,VIJAYAPURA,36,0,0,0,0,1
3,KARNATAKA,MANDYA,96,0,NR,NR,NR,1
4,KARNATAKA,UDUPI,2,0,NP,0,NP,1
