### India Literacy Data - District Wise EDA
The datasets used are:
* [India Literacy Data - District Wise](https://www.kaggle.com/datasets/satyampd/india-literacy-data-district-wise), uploaded by Satyam Prasad Tiwari on Kaggle.
* [India States](https://www.kaggle.com/datasets/somacodes/india-states), uploaded by Soma Ghosh on Kaggle.
* [India 2020 District Level Shape files](https://www.kaggle.com/datasets/imdevskp/india-district-wise-shape-files), uploaded by Devakumar K. P. on Kaggle.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from statsmodels.graphics.gofplots import qqplot
%matplotlib inline
import seaborn as sns
import plotly.express as px
import geopandas as gpd
import re

In [5]:
# Load the district-wise 2011 literacy table.
# The CSV carries an 'Unnamed: 0' column (presumably a saved row index --
# TODO confirm) that is dropped, and the rows are ordered by state name so
# that later per-state views come out grouped together.
literacy_csv = '../input/india-literacy-data-district-wise/Literacy Data 2011.csv'

df = (
    pd.read_csv(literacy_csv)
    .drop(columns='Unnamed: 0')
    .sort_values('State')
    .copy()
)

In [6]:
# Compare literacy across states: one bar per state showing the mean literacy
# of its districts, plus a dashed reference line at the mean of those
# state-level averages.
states_group = df.groupby('State')['Literacy'].mean().to_frame()

states = sns.catplot(kind='bar', x='State', y='Literacy',
                     data=states_group.reset_index(), color="#ef476f")
states.fig.set_size_inches([28, 10])

## Customisation ##
# No row/col facets are used, so the grid holds exactly one Axes. Styling it
# directly avoids depending on pyplot's implicit "current axes".
ax = states.axes.flat[0]
states.set_xticklabels(rotation=90, color='#073B4C')
states.fig.suptitle('State wise Average Literacy', color='#602437',
                    y=1.0055, x=0.56, fontsize=20)
ax.set_xlabel('States', color='#602437', fontsize=20)
ax.set_ylabel('Literacy in %', color='#602437', fontsize=20)
ax.axhline(y=states_group['Literacy'].mean(), linestyle='--',
           color='#880d1e', label='Average Literacy')
ax.legend()
plt.show()
# Close this figure explicitly. plt.clf() acts on the "current" figure and
# will create a new blank one if none is open, which shows up as an empty
# figure in the cell output.
plt.close(states.fig)

In [7]:
## Literacy rate of the districts within each individual state ##
# NOTE: catplot sizes its own figure from its `height`/`aspect` arguments, so
# the figsize default below does not change these charts. The rcParams and
# the seaborn style are still set because they are global and apply to any
# figure created afterwards.
plt.rcParams['figure.figsize'] = [28, 12]
plt.rcParams['figure.dpi'] = 100
sns.set_style('dark')

# States with more districts than this get a smaller y-tick font
# (presumably so that all district names still fit on the axis).
MANY_DISTRICTS = 34

# Iterate over the loaded frame `df` directly; grouping once replaces
# re-filtering the whole frame several times per state. sort=False keeps the
# states in the order in which they appear in `df`.
for state, state_districts in df.groupby('State', sort=False):

    grid = sns.catplot(kind='bar', x='Literacy', y='District',
                       data=state_districts, color="#ef476f")
    ax = grid.axes.flat[0]

    mean_literacy = state_districts['Literacy'].mean()

    # Customizing
    if len(state_districts) > MANY_DISTRICTS:
        ax.tick_params(axis='y', labelsize=5, labelcolor='#073B4C')
    else:
        ax.tick_params(axis='y', labelcolor='#073B4C')
    ax.tick_params(axis='x', labelrotation=90, labelcolor='#073B4C')
    ax.set_title(state, color='#602437')
    ax.set_xlabel('Literacy in %', color='#602437')
    ax.set_ylabel('Districts', color='#602437')
    ax.axvline(x=mean_literacy, linestyle='--', label='Average Literacy', color="#880d1e")
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.show()
    # One figure is created per state; close each one so they do not
    # accumulate in memory (and so no blank figure is left behind).
    plt.close(grid.fig)

In [8]:
# Remove leading whitespace from the state and district names.
# The vectorised .str accessor is used instead of apply(lambda ...): it lets
# missing values pass through as NaN, whereas calling .lstrip() on a
# non-string value would raise.
df['State'] = df['State'].str.lstrip()
df['District'] = df['District'].str.lstrip()
df.head()

In [18]:
# Rank the states by the mean literacy of their districts, highest first.
# A stable sort keeps states with equal means in their groupby order.
state_literacy_ranked = (
    df.groupby(by="State")["Literacy"]
    .mean()
    .sort_values(ascending=False, kind='mergesort')
)

# Horizontal bars: literacy on x, one state per row on y.
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts
# them positionally (the first positional argument is `data`).
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=state_literacy_ranked.tolist(),
            y=state_literacy_ranked.index.tolist(),
            palette='bright', ax=ax)
ax.set_title("Literacy Rate of States in India")
ax.set_xlabel('Literacy in %')
ax.set_ylabel('State')
fig.tight_layout()

In [11]:
# Mean literacy per state as a flat State/Literacy table, ordered from the
# lowest to the highest average, drawn as an interactive plotly scatter.
avg_state_literacy = (
    df.groupby('State')['Literacy']
    .mean()
    .reset_index()
    .sort_values('Literacy')
)
px.scatter(
    avg_state_literacy,
    x='Literacy',
    y='State',
    template='ggplot2',
    title='Average Literacy Rate',
    height=800,
)

In [12]:
# India state boundaries (shape file)
map_df = gpd.read_file('../input/india-states/Igismap/Indian_States.shp')

# Corrections to the state names in the shape file
# (presumably so they match the spellings used in the literacy data).
map_df['st_nm'] = map_df['st_nm'].str.replace('&', 'and', regex=False)
# NOTE(review): these overrides are positional -- they assume 'st_nm' is the
# first column and that the shape file's row order never changes. Verify
# against the shape file before reusing with another version of it.
map_df.iloc[0, 0] = 'Andaman And Nicobar Islands'
map_df.iloc[1, 0] = 'Arunachal Pradesh'
map_df.iloc[6, 0] = 'Dadra and Nagar Haveli'
map_df.iloc[34, 0] = 'Orissa'
map_df.iloc[23, 0] = 'Delhi'

# Add a placeholder row for Telangana with no literacy value.
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# - np.nan replaces the np.NAN alias, which was removed in NumPy 2.0.
# - The membership check keeps the cell idempotent: re-running it does not
#   pile up duplicate Telangana rows (which would duplicate rows in the join).
if not avg_state_literacy['State'].eq('Telangana').any():
    telangana_row = pd.DataFrame([{'State': 'Telangana', 'Literacy': np.nan}])
    avg_state_literacy = pd.concat([avg_state_literacy, telangana_row],
                                   ignore_index=True)

# Merge the data frames: left join of the shapes with the per-state averages,
# matched on state name.
merged = (
    map_df.set_index('st_nm').sort_index()
    .join(avg_state_literacy.set_index('State').sort_index())
)

# Plot
# NOTE(review): 'tab20' is a qualitative colormap while Literacy is a
# continuous value -- confirm this colour scheme is intended.
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(1, figsize=(15, 10))
    ax.axis('off')
    ax.set_title('Average Literacy Rates for each state', fontdict={'fontsize': '14', 'fontweight' : '4'})
    merged.plot(column='Literacy', cmap='tab20', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)
    plt.show()

In [19]:
# District boundaries: only the geometry and the district name are kept.
district_map = gpd.read_file(
    '../input/india-district-wise-shape-files/output.shp'
)[['geometry', 'distname']]

# Attach the literacy figures to the shapes by district name, then keep only
# the districts for which both a geometry and a literacy value are present.
# NOTE(review): the match is on district name alone; if the same district
# name occurs in more than one state the rows would be cross-matched --
# verify against the data.
shapes_by_name = district_map.set_index('distname').sort_index()
literacy_by_name = df.set_index('District').sort_index()
merged2 = shapes_by_name.join(literacy_by_name)[['geometry', 'Literacy']].dropna()

# Choropleth with literacy binned into 5 equal-width classes.
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(1, figsize=(12, 12))
    ax.axis('off')
    ax.set_title('District wise Literacy', fontdict={'fontsize': '20', 'fontweight' : '3'})
    merged2.plot(
        column='Literacy',
        cmap='RdYlBu',
        linewidth=0.8,
        ax=ax,
        edgecolor='0.8',
        legend=True,
        scheme='EqualInterval',
        k=5,
    )
    plt.show()

In [14]:
# The 10 districts with the highest literacy (rows listed in ascending
# order of literacy), displayed with a colour gradient on the table.
(
    df.sort_values(by='Literacy')
    .set_index('District')
    .tail(10)
    .style.background_gradient(cmap='YlGnBu')
)

In [15]:
# Sunburst of the 10 districts with the highest literacy:
# rings go state -> district -> literacy value.
top10_districts = df.sort_values(by='Literacy', ascending=False).head(10)

# Plotly renders its own figure, so no matplotlib figure or tight_layout call
# is involved; creating one would only add an empty figure to the output.
fig = px.sunburst(top10_districts, path=['State', 'District', 'Literacy'])
fig

In [16]:
# The 10 districts with the lowest literacy (rows listed in ascending
# order of literacy), displayed with a colour gradient on the table.
(
    df.sort_values(by='Literacy')
    .set_index('District')
    .head(10)
    .style.background_gradient(cmap='RdYlBu')
)

In [17]:
# Sunburst of the 10 districts with the lowest literacy:
# rings go state -> district -> literacy value.
bottom10_districts = df.sort_values(by='Literacy', ascending=False).tail(10)

# Plotly renders its own figure, so no matplotlib figure or tight_layout call
# is involved; creating one would only add an empty figure to the output.
fig = px.sunburst(bottom10_districts, path=['State', 'District', 'Literacy'])
fig.show()