In [1]:
#IMPORTING REQUIRED MODULES

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import scipy.stats

In [2]:
#loading the two input files:
# totpop -> statewise population data
# newdf  -> statewise bilingualism and trilingualism data

totpop = pd.read_csv(filepath_or_buffer=r'DDW_PCA0000_2011_Indiastatedist.csv')
newdf = pd.read_excel(io=r'DDW-C18-0000.xlsx')

In [3]:
#giving the bilingualism/trilingualism frame short column labels:
#five identifying columns followed by persons/males/females counts,
#first for the '2.' group and then for the '3.' group

newnames = ['statecode', 'districtcode', 'state', 'type', 'agegroup'] + [
    prefix + suffix
    for prefix in ('2.', '3.')
    for suffix in ('persons', 'males', 'females')
]

newdf.columns = newnames

In [4]:
#discarding the first five rows, which contain metadata instead of data

newdf = newdf.iloc[5:]

In [5]:
#keeping only the rows where both 'type' and 'agegroup' are 'Total'

is_total_type = newdf['type'] == 'Total'
is_total_agegroup = newdf['agegroup'] == 'Total'
newdf = newdf[is_total_type & is_total_agegroup]

In [6]:
#selecting only the required columns
#(chosen by label rather than by position, so the selection does not depend on
# the column order and the cell can be re-run without an out-of-bounds error)

newdf = newdf[['statecode', 'state', '2.males', '2.females', '3.males', '3.females']]

In [7]:
#displaying the column labels that remain in newdf after the column selection
#(the bare expression on the last line is rendered as the cell output)

newdf.columns

Index(['statecode', 'state', '2.males', '2.females', '3.males', '3.females'], dtype='object')

In [8]:
#keeping only the non-district rows of the population data whose 'TRU' value is 'Total'

not_district = totpop['Level'] != 'DISTRICT'
is_total_tru = totpop['TRU'] == 'Total'
totpop = totpop[not_district & is_total_tru]

In [9]:
#selecting only the required columns
#(chosen by label rather than by position, so the selection does not depend on
# the column order of the csv and the cell can be re-run safely)

totpop = totpop[['Name', 'TOT_M', 'TOT_F']]

In [10]:
#overwriting the name in the first row with 'INDIA'
#NOTE(review): assumes the first remaining row is the all-India row - confirm against the data

first_row, name_col = 0, 0
totpop.iloc[first_row, name_col] = 'INDIA'

In [11]:
#displaying the column labels that remain in totpop after the column selection
#(the bare expression on the last line is rendered as the cell output)

totpop.columns

Index(['Name', 'TOT_M', 'TOT_F'], dtype='object')

In [12]:
#joining the language data with the population data on the state name
#(default inner join: rows whose 'state' has no equal 'Name' in totpop are dropped)

merged = newdf.merge(totpop, left_on='state', right_on='Name')

In [13]:
#converting every language-count column to a numeric dtype
#('2.males' and '2.females' are used in the same subtractions and percentages as
# the '3.*' columns, so all four are converted rather than only the '3.*' pair)

for count_col in ['2.males', '2.females', '3.males', '3.females']:
    merged[count_col] = pd.to_numeric(merged[count_col])

In [14]:
#deriving, separately for males and females, how many people speak exactly two
#languages and how many speak exactly one language:
#
#  exactly 2 langs = (speaking 2 or more langs) - (speaking 3 or more langs)
#  exactly 1 lang  = (total population of the state) - (speaking 2 or more langs)
#
#the new columns are appended in the order listed here

merged = merged.assign(
    exact2males=merged['2.males'] - merged['3.males'],
    exact2females=merged['2.females'] - merged['3.females'],
    exact1males=merged['TOT_M'] - merged['2.males'],
    exact1females=merged['TOT_F'] - merged['2.females'],
)

In [15]:
#expressing the exactly-one, exactly-two and three-or-more counts as a percentage
#of the state's male / female population
#(all male columns are created first, then all female columns)

for sex, total_col in (('males', 'TOT_M'), ('females', 'TOT_F')):
    for count_col, percent_col in (
        ('exact1' + sex, 'exact1per' + sex),
        ('exact2' + sex, 'exact2per' + sex),
        ('3.' + sex, '3ormoreper' + sex),
    ):
        merged[percent_col] = merged[count_col] / merged[total_col] * 100

In [16]:
#dividing the merged data frame into three different dataframes
#part a: exactly 1 lang
#part b: exactly 2 langs
#part c: 3 or more langs
#
#columns are picked by label so the split does not depend on the order in which
#the derived columns were added; .copy() makes each part an independent frame
#that can safely receive new columns

part_a = merged[['statecode', 'exact1permales', 'exact1perfemales']].copy()
part_b = merged[['statecode', 'exact2permales', 'exact2perfemales']].copy()
part_c = merged[['statecode', '3ormorepermales', '3ormoreperfemales']].copy()

In [17]:
#function to calculate the p value
#takes a dataframe as input ,which has columns as statecode,malepercentage,femalepercentage

def p_cal(df):
    """Return a two-tailed p-value for each row's male/female percentage gap.

    The absolute difference between the second and third columns of ``df`` is
    standardised against the mean and standard deviation of those differences
    over all rows, and each resulting z-score is converted to a two-tailed
    p-value under the standard normal distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are, by position: an identifier (not used here),
        the male percentage and the female percentage. Any index is accepted.

    Returns
    -------
    list of float
        One p-value per row of ``df``, in row order. The values are NaN when
        the standard deviation of the differences is zero (for example a
        single row), and the list is empty when ``df`` has no rows.
    """
    # absolute male/female gap per row, forced to float so object-dtype input also works
    diff = (df.iloc[:, 1] - df.iloc[:, 2]).abs().astype(float)

    mean = diff.mean()

    # ddof=0 (population standard deviation) is what np.std uses by default
    sd = diff.std(ddof=0)

    # drop the index here: everything below is purely positional, so the result
    # does not depend on the row labels of df
    zscore = ((diff - mean) / sd).to_numpy(dtype=float)

    # two-tailed test: survival function of |z|, doubled
    return (scipy.stats.norm.sf(np.abs(zscore)) * 2).tolist()

In [18]:
#computing the list of p values for each of the three parts

p_a, p_b, p_c = [p_cal(part) for part in (part_a, part_b, part_c)]

In [19]:
#attaching each list of p values to its dataframe as a new column 'p'

for part, p_values in ((part_a, p_a), (part_b, p_b), (part_c, p_c)):
    part['p'] = p_values

In [20]:
#giving all three parts the same output column labels

for part in (part_a, part_b, part_c):
    part.columns = ['statecode', 'male-percentage', 'female-percentage', 'p-value']

In [21]:
#saving each part to its own csv file, without the index column

for part, csv_name in (
    (part_a, 'gender-india-a.csv'),
    (part_b, 'gender-india-b.csv'),
    (part_c, 'gender-india-c.csv'),
):
    part.to_csv(csv_name, index=False)