In [1]:
#IMPORTING REQUIRED MODULES

import pandas as pd
import warnings
# NOTE(review): this suppresses every warning for the rest of the notebook, including
# pandas warnings raised when assigning to a slice of a DataFrame; consider narrowing
# the filter to the specific warning categories that are known to be harmless.
warnings.filterwarnings('ignore')
import numpy as np
import scipy.stats

In [2]:
#loading the two input files:
#  newdf  - statewise bilingualism and trilingualism data (excel workbook)
#  totpop - statewise population data (csv)

newdf = pd.read_excel(io=r'DDW-C18-0000.xlsx')
totpop = pd.read_csv(filepath_or_buffer=r'DDW_PCA0000_2011_Indiastatedist.csv')

In [3]:
#giving the bilingualism/trilingualism data short column names, in sheet column order:
#identifiers first, then the persons/males/females counts for the '2' and '3' language groups

newnames = [
    'statecode', 'districtcode', 'state', 'TRU', 'agegroup',
    '2.persons', '2.males', '2.females',
    '3.persons', '3.males', '3.females',
]

newdf.columns = newnames

In [4]:
#skipping the first five rows, which contain metadata rather than data

newdf = newdf.iloc[5:]

In [5]:
#keeping rows where 'TRU' is not 'Total' and 'agegroup' is 'Total', using one combined mask

newdf = newdf.loc[newdf['TRU'].ne('Total') & newdf['agegroup'].eq('Total')]

In [6]:
#keeping only the required columns, picked by name
#(these are the columns at positions 0, 2, 3, 5 and 8)

newdf = newdf[['statecode', 'state', 'TRU', '2.persons', '3.persons']]

In [7]:
#displaying the column names that remain after the column selection

newdf.columns

Index(['statecode', 'state', 'TRU', '2.persons', '3.persons'], dtype='object')

In [8]:
#from the population data, keeping rows whose 'Level' is not 'DISTRICT'
#and whose 'TRU' is not 'Total' (one combined mask)

totpop = totpop.loc[totpop['Level'].ne('DISTRICT') & totpop['TRU'].ne('Total')]

In [9]:
#keeping only the required columns, taken by position (7, 8 and 10)

totpop = totpop.take([7, 8, 10], axis=1)

In [10]:
#writing 'INDIA' into the first column of the first two rows
#NOTE(review): this assumes the first two rows are the country-level rows - confirm against the csv

totpop.iloc[[0, 1], 0] = 'INDIA'

In [11]:
#naming the three population columns; 'state' and 'TRU' are the keys used when merging

totpop.columns = [
    'state',
    'TRU',
    'TOT_P',   #population count
]

In [12]:
#inner join of the population data with the language data on state name and tru value

merged = pd.merge(totpop, newdf, how='inner', on=['state', 'TRU'])

In [13]:
#separating the merged data into rural rows and all remaining (non rural) rows

rmerge = merged.loc[merged['TRU'].eq('Rural')]
umerge = merged.loc[merged['TRU'].ne('Rural')]

In [14]:
#inner join of the rural rows with the urban rows on statecode;
#shared column names get the default '_x' (rural) and '_y' (urban) suffixes

merged1 = pd.merge(rmerge, umerge, how='inner', on='statecode')

In [15]:
#keeping the required columns by name (positions 2, 3, 4, 5, 8, 9 and 10 of the joined frame):
#'_x' columns come from the rural side of the join and '_y' columns from the urban side

merged1 = merged1[[
    'TOT_P_x', 'statecode', '2.persons_x', '3.persons_x',
    'TOT_P_y', '2.persons_y', '3.persons_y',
]]

In [16]:
#giving the seven kept columns descriptive names

merged1.columns = [
    'ruralpop',    #rural population
    'statecode',
    '2.rural',     #rural count for the '2' language group
    '3.rural',     #rural count for the '3' language group
    'urbanpop',    #urban population
    '2.urban',     #urban count for the '2' language group
    '3.urban',     #urban count for the '3' language group
]

In [17]:
#converting the four language count columns from string to numeric, one column at a time

for column in ('3.rural', '2.rural', '3.urban', '2.urban'):
    merged1[column] = pd.to_numeric(merged1[column])

In [18]:
#calculating the number of rural and urban people speaking exactly 2 languages and exactly 1 language

#exactly 2 languages = people speaking 2 or more languages - people speaking 3 or more languages
#(computed separately for rural and urban)

merged1['exact2rural'] = merged1['2.rural'].sub(merged1['3.rural'])
merged1['exact2urban'] = merged1['2.urban'].sub(merged1['3.urban'])

#exactly 1 language = total population in that state - people speaking 2 or more languages
#(computed separately for rural and urban)

merged1['exact1rural'] = merged1['ruralpop'].sub(merged1['2.rural'])
merged1['exact1urban'] = merged1['urbanpop'].sub(merged1['2.urban'])

In [19]:
#expressing each count as a percentage of the rural / urban population of that state:
#exactly one, exactly two and three or more languages

merged1['exact1perrural'] = merged1['exact1rural'].div(merged1['ruralpop']).mul(100)
merged1['exact2perrural'] = merged1['exact2rural'].div(merged1['ruralpop']).mul(100)
merged1['3ormoreperrural'] = merged1['3.rural'].div(merged1['ruralpop']).mul(100)

merged1['exact1perurban'] = merged1['exact1urban'].div(merged1['urbanpop']).mul(100)
merged1['exact2perurban'] = merged1['exact2urban'].div(merged1['urbanpop']).mul(100)
merged1['3ormoreperurban'] = merged1['3.urban'].div(merged1['urbanpop']).mul(100)

In [20]:
#displaying the column names of merged1 and the order they appear in
merged1.columns

Index(['ruralpop', 'statecode', '2.rural', '3.rural', 'urbanpop', '2.urban',
       '3.urban', 'exact2rural', 'exact2urban', 'exact1rural', 'exact1urban',
       'exact1perrural', 'exact2perrural', '3ormoreperrural', 'exact1perurban',
       'exact2perurban', '3ormoreperurban'],
      dtype='object')

In [21]:
#dividing the merged1 data frame into three separate dataframes, each holding
#statecode, urban percentage and rural percentage (in that order)
#part a: exactly 1 lang
#part b: exactly 2 lang
#part c: 3 or more langs
#
#columns are selected by name instead of by position, so the split does not depend on
#the order in which columns were added to merged1; .copy() makes each part an independent
#frame, so adding a p value column to it later does not write to a slice of merged1

part_a = merged1[['statecode', 'exact1perurban', 'exact1perrural']].copy()
part_b = merged1[['statecode', 'exact2perurban', 'exact2perrural']].copy()
part_c = merged1[['statecode', '3ormoreperurban', '3ormoreperrural']].copy()

In [22]:
#function to calculate the p value
#takes a dataframe as input ,which has columns as statecode,urbanpercentage,ruralpercentage

def p_cal(df):
    """Compute a two-tailed p-value for each row's urban/rural percentage gap.

    For every row the absolute difference between the second and third columns
    is taken. Each difference is standardised against the mean and population
    standard deviation (ddof=0) of all the differences, and the z-score is
    converted to a two-tailed p-value under the standard normal distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns are read by position: column 1 and column 2 hold the two
        percentages being compared (urban and rural); column 0 is not used.
        The index labels are irrelevant; rows are processed in order.

    Returns
    -------
    list of float
        One p-value per row of ``df``, in row order. The values are NaN when
        all differences are equal (zero standard deviation), because the
        z-score is then undefined. An empty frame yields an empty list.
    """
    #absolute gap between the two percentages of each row
    diff = (df.iloc[:, 1] - df.iloc[:, 2]).abs()

    #mean and population standard deviation of the gaps
    mean = diff.mean()
    sd = diff.std(ddof=0)

    #z score of each gap relative to all gaps
    zscore = (diff - mean) / sd

    #two tailed test, so the survival function of |z| is multiplied by 2;
    #evaluated on the whole array at once, so the result does not depend on
    #the index labels of df (label lookup with zscore[i] fails on a non-default index)
    p = scipy.stats.norm.sf(np.abs(zscore.to_numpy(dtype=float))) * 2

    #returning the p values as a plain list
    return p.tolist()

In [23]:
#computing the list of p values for each of the three parts

p_a, p_b, p_c = (p_cal(part) for part in (part_a, part_b, part_c))

In [24]:
#attaching each list of p values to its dataframe as column 'p'

for frame, pvalues in ((part_a, p_a), (part_b, p_b), (part_c, p_c)):
    frame['p'] = pvalues

In [25]:
#giving all three dataframes the same output column names

for frame in (part_a, part_b, part_c):
    frame.columns = ['statecode', 'urban-percentage', 'rural-percentage', 'p-value']

In [26]:
#saving each dataframe to its own csv file, without the index column

for frame, filename in (
    (part_a, 'geography-india-a.csv'),
    (part_b, 'geography-india-b.csv'),
    (part_c, 'geography-india-c.csv'),
):
    frame.to_csv(filename, index=False)