In [1]:
#IMPORTING REQUIRED MODULES

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the two input workbooks.

# DDW-C19-0000: population by bilingualism, trilingualism, educational level and sex
newdf = pd.read_excel(r'DDW-C19-0000.xlsx')

# DDW-0000C-08: educational level by age and sex for population age 7 and above
data = pd.read_excel('DDW-0000C-08.xlsx')

In [3]:
# Keep only the columns (by position) that are used later; they are named in the next step.
newdf_keep_positions = [0, 3, 4, 6, 7, 9, 10]
newdf = newdf.iloc[:, newdf_keep_positions]

In [4]:
# Give the seven retained columns readable names.
# '2.more' / '3.more' hold the counts of people speaking two-or-more / three-or-more languages.
newdf.columns = [
    'statecode',
    'tru',
    'level',
    '2.more males',
    '2.more females',
    '3.more males',
    '3.more females',
]

In [5]:
# Drop the metadata rows: the first five rows and the last three rows of the sheet.
newdf = newdf.iloc[5:-3]

In [6]:
# Keep rows whose level is an individual literacy group (i.e. neither of the
# aggregate levels 'Total' / 'Literate') and whose tru value is 'Total'.
newdf_level = newdf['level']
is_literacy_group = (newdf_level != 'Total') & (newdf_level != 'Literate')
is_tru_total = newdf['tru'] == 'Total'
newdf = newdf[is_literacy_group & is_tru_total]

In [7]:
# 'tru' is constant after the filter above, so it is no longer needed.
newdf = newdf.drop(columns=['tru'])

In [8]:
# From 'data', skip the first six rows and keep only the required columns (by position):
# three identifying columns followed by ten (male, female) column pairs.
data_keep_positions = [
    1, 4, 5,
    10, 11,
    16, 17,
    19, 20,
    22, 23,
    25, 26,
    28, 29,
    31, 32,
    34, 35,
    37, 38,
    40, 41,
]
data = data.iloc[6:, data_keep_positions]

In [9]:
# Name the columns: three identifying columns, then '<group>.male' / '<group>.female'
# for every education group, in the order the column pairs were selected.
education_groups = ['il', 'we', 'bepri', 'pri', 'mid', 'sec', 'highsec', 'ntd', 'td', 'gna']
data.columns = ['statecode', 'tru', 'age'] + [
    group + '.' + sex
    for group in education_groups
    for sex in ('male', 'female')
]

In [10]:
# Keep only the rows where tru is 'Total' and age is 'All ages'.
is_total_all_ages = (data['tru'] == 'Total') & (data['age'] == 'All ages')
data = data[is_total_all_ages]

In [11]:
# Drop the 'tru' and 'age' columns (positions 1 and 2); keep statecode and the 20 count columns.
data = data.iloc[:, [0] + list(range(3, 23))]

In [12]:
# Collapse the education classes of 'data' so they line up with the literacy groups of the other file.
for sex in ('male', 'female'):
    # "literate without educational level" is folded into "below primary"
    data['bepri.' + sex] = data['we.' + sex] + data['bepri.' + sex]

    # higher secondary, non-technical diploma and technical diploma are folded into "secondary"
    data['sec.' + sex] = (
        data['sec.' + sex]
        + data['highsec.' + sex]
        + data['ntd.' + sex]
        + data['td.' + sex]
    )

# Keep statecode plus the (male, female) pairs for: il, bepri, pri, mid, sec, gna.
data = data.iloc[:, [0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 19, 20]]

In [13]:
# Start from an empty frame; it is filled with one row per state and literacy group below.
newdata = pd.DataFrame()

In [14]:
# Convert the literacy groups, which are stored as (male, female) column pairs
# in 'data', into one row per (state, literacy group) in 'newdata'.

# Literacy group names, in the same order as the column pairs of 'data'
# (columns 1-2, 3-4, ..., 11-12).
literacy_levels = [
    'Illiterate',
    'Literate but below primary',
    'Primary but below middle',
    'Middle but below matric/secondary',
    'Matric/Secondary but below graduate',
    'Graduate and above',
]

# Collect every record first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and calling it in a loop re-copies the frame per row.
records = []
for row in data.itertuples(index=False, name=None):
    for position, level_name in enumerate(literacy_levels):
        male_col = 1 + 2 * position
        records.append((row[0], row[male_col], row[male_col + 1], level_name))

# 'level' tells which literacy group each row belongs to.
newdata = pd.DataFrame(records, columns=['statecode', 'males', 'females', 'level'])

In [15]:
# Join the language counts (newdf) with the population counts (newdata) on the
# two columns the frames share: the state code and the literacy group.
merged = newdf.merge(newdata, how='inner', on=['statecode', 'level'])

In [16]:
# The language-count columns may hold strings; convert each of them to a numeric dtype.
for count_col in ('2.more males', '2.more females', '3.more males', '3.more females'):
    merged[count_col] = pd.to_numeric(merged[count_col])

In [17]:
# Derive, per state and literacy group, how many males/females speak exactly
# two languages and exactly one language.
# (Columns are created in a fixed order because later cells select by position.)

# exactly 2 languages = (2 or more languages) - (3 or more languages)
for sex in ('males', 'females'):
    merged['exact2' + sex] = merged['2.more ' + sex] - merged['3.more ' + sex]

# exactly 1 language = (total in that literacy group) - (2 or more languages)
for sex in ('males', 'females'):
    merged['exact1' + sex] = merged[sex] - merged['2.more ' + sex]

In [18]:
# Ratio of males/females speaking exactly one, exactly two, and three or more
# languages, relative to the total males/females of that state and literacy group.
# (Columns are created in a fixed order because the next cell selects by position.)
ratio_sources = (
    ('1', 'exact1'),     # exactly one language
    ('2', 'exact2'),     # exactly two languages
    ('3', '3.more '),    # three or more languages
)
for ratio_prefix, count_prefix in ratio_sources:
    for sex in ('male', 'female'):
        total_col = sex + 's'
        merged[ratio_prefix + '.' + sex + 'ratio'] = merged[count_prefix + total_col] / merged[total_col]

In [19]:
# Keep statecode, level and the six ratio columns (positions 12 to 17).
merged = merged.iloc[:, [0, 1] + list(range(12, 18))]

In [20]:
# Split 'merged' into three frames, each holding statecode, level and one
# (male ratio, female ratio) pair:
#   merged1 -> exactly one language
#   merged2 -> exactly two languages
#   merged3 -> three or more languages
merged1, merged2, merged3 = (
    merged.iloc[:, [0, 1, male_pos, female_pos]]
    for male_pos, female_pos in ((2, 3), (-4, -3), (-2, -1))
)

In [21]:
def _top_level_per_state(frame, value_col, group_label):
    """Return, per statecode, the row(s) of `frame` where `value_col` is that state's maximum.

    The result has the columns ['statecode', group_label, value_col], ordered by
    statecode. If several levels tie for a state's maximum, each tied level
    yields its own row.
    """
    subset = frame[['statecode', 'level', value_col]]

    # Group on the scalar key rather than a one-element list, so the group
    # labels are plain values (not 1-tuples) on every pandas version.
    peak = subset.groupby('statecode', as_index=False)[value_col].max()

    # Join the maxima back onto the rows to recover which level reached them.
    winners = pd.merge(peak, subset, on=['statecode', value_col])
    winners = winners[['statecode', 'level', value_col]]
    winners.columns = ['statecode', group_label, value_col]
    return winners


def high(df):
    """Find, for each state, the literacy group with the highest male and female percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Four columns, read by position: state code, literacy group, male
        percentage, female percentage. The percentages may describe people
        speaking exactly one, exactly two, or three or more languages,
        depending on the frame passed in. `df` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ['statecode', 'male literacy group', 'malepercent',
        'female literacy group', 'femalepercent'].
    """
    # Work on a renamed copy so the caller's frame keeps its own column names.
    renamed = df.copy()
    renamed.columns = ['statecode', 'level', 'malepercent', 'femalepercent']

    # Every lookup uses the frame that was passed in; no global frame is consulted.
    finalmalemerged = _top_level_per_state(renamed, 'malepercent', 'male literacy group')
    finalfemalemerged = _top_level_per_state(renamed, 'femalepercent', 'female literacy group')

    # 'statecode' is the only column the two results have in common.
    finalmerged = pd.merge(finalmalemerged, finalfemalemerged, on='statecode')

    return finalmerged

In [22]:
# Run 'high' on merged1, merged2 and merged3 (in that order) and keep the
# results as output1, output2 and output3 respectively.
output1, output2, output3 = [high(part) for part in (merged1, merged2, merged3)]

In [23]:
# All three outputs share the same final column names.
for result_frame in (output1, output2, output3):
    result_frame.columns = [
        'state/ut',
        'literacy-group-males',
        'ratio-males',
        'literacy-group-females',
        'ratio-females',
    ]

In [24]:
# Write each output frame to its own csv file, without the index column.
csv_targets = (
    (output3, 'literacy-gender-a.csv'),
    (output2, 'literacy-gender-b.csv'),
    (output1, 'literacy-gender-c.csv'),
)
for result_frame, csv_path in csv_targets:
    result_frame.to_csv(csv_path, index=False)