In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# plotly
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import warnings
# ignore warnings
warnings.filterwarnings("ignore")
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [2]:
# Load the state-wise elementary and secondary tables and the per-country
# statistics table. Paths are relative to the notebook's working directory.
df_ele = pd.read_csv('2015_16_Statewise_Elementary.csv')
df_sec = pd.read_csv('2015_16_Statewise_Secondary.csv')
world = pd.read_csv('Countries.csv')

# Replace the CSV header with short snake_case names. The assignment is
# positional, so the file must contain exactly these 20 columns in this order
# (pandas raises a length-mismatch error otherwise).
world.columns = [
    "country", "region", "population", "area", "density", "coastline",
    "migration", "infant_mortality", "gdp", "literacy", "phones", "arable",
    "crops", "other", "climate", "birthrate", "deathrate", "agriculture",
    "industry", "service",
]

# Kept as the last expression so the dtypes are actually displayed; in the
# middle of the cell the result was silently discarded.
world.dtypes

In [3]:
# Country and region are label columns -> store them as categoricals.
world.country = world.country.astype('category')
world.region = world.region.astype('category')

# These columns are text in which "," is replaced by "." before casting to
# float (i.e. they appear to use a decimal comma in the CSV).
decimal_comma_cols = [
    "density", "coastline", "migration", "infant_mortality", "literacy",
    "phones", "arable", "crops", "other", "climate", "birthrate",
    "deathrate", "agriculture", "industry", "service",
]
for col in decimal_comma_cols:
    # Already-numeric columns have no .str accessor; skipping them makes the
    # cell safe to re-run without reloading `world`.
    if pd.api.types.is_numeric_dtype(world[col]):
        continue
    # regex=False: "," is a literal, independent of the pandas-version default.
    world[col] = world[col].str.replace(",", ".", regex=False).astype(float)

In [4]:
# World choropleth: one region per row of `world`, coloured by literacy.
data = {
    'type': 'choropleth',
    'locations': world.country,
    'locationmode': 'country names',
    'z': world.literacy,
    'text': world.country,
    'colorbar': dict(title='Literacy'),
    'colorscale': 'YlOrRd',
    'reversescale': True,
}
layout = {
    'title': 'Literacy Rate per country',
    'geo': {
        'showframe': False,
        'projection': dict(type='natural earth'),
    },
}
choromap = go.Figure(data=[data], layout=layout)
# validate=False is passed through unchanged from the original call.
iplot(choromap, validate=False)

In [5]:
# Distinct state names in the elementary table, in order of first appearance.
ele_state_names = df_ele['STATNAME'].unique()
display(ele_state_names)

array(['JAMMU & KASHMIR', 'HIMACHAL PRADESH', 'PUNJAB', 'CHANDIGARH',
       'UTTARAKHAND', 'HARYANA', 'DELHI', 'RAJASTHAN', 'UTTAR PRADESH',
       'BIHAR', 'SIKKIM', 'ARUNACHAL PRADESH', 'NAGALAND', 'MANIPUR',
       'MIZORAM', 'TRIPURA', 'MEGHALAYA', 'ASSAM', 'WEST BENGAL',
       'JHARKHAND', 'ODISHA', 'CHHATTISGARH', 'MADHYA PRADESH', 'GUJARAT',
       'DAMAN & DIU', 'DADRA & NAGAR HAVELI', 'MAHARASHTRA',
       'ANDHRA PRADESH', 'KARNATAKA', 'GOA', 'LAKSHADWEEP', 'KERALA',
       'TAMIL NADU', 'PUDUCHERRY', 'A & N ISLANDS', 'TELANGANA'],
      dtype=object)

In [6]:
# Distinct state names in the secondary table, in order of first appearance.
sec_state_names = df_sec['statname'].unique()
display(sec_state_names)

array(['Jammu And Kashmir', 'Himachal Pradesh', 'Punjab', 'Chandigarh',
       'Uttarakhand', 'Haryana', 'Delhi                         ',
       'Rajasthan                     ', 'Uttar Pradesh', 'Bihar',
       'Sikkim', 'Arunachal Pradesh', 'Nagaland', 'Manipur',
       'Mizoram                       ', 'Tripura',
       'Meghalaya                     ', 'Assam',
       'West Bengal                   ', 'Jharkhand', 'Odisha',
       'Chhattisgarh', 'MADHYA PRADESH', 'Gujarat',
       'Daman & Diu                   ', 'Dadra & Nagar Haveli          ',
       'Maharashtra', 'Andhra Pradesh                ', 'Karnataka',
       'Goa                           ', 'Lakshadweep                   ',
       'Kerala', 'Tamil Nadu', 'Puducherry',
       'Andaman & Nicobar Islands     ', 'Telangana'], dtype=object)

In [7]:
# Elementary vs. secondary literacy rate per Indian state, drawn as two
# side-by-side line plots (left: elementary table, right: secondary table).

# The secondary table's state names are space-padded and inconsistently cased
# (e.g. 'Delhi      ', 'MADHYA PRADESH'). Normalise a local copy for the axis
# labels; df_sec itself is left untouched.
sec_state_labels = df_sec.statname.str.strip().str.title()

trace1 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.OVERALL_LI,
    name="Primary education",
)

# Second trace is bound to the x2/y2 axis pair so it renders in its own panel.
trace2 = go.Scatter(
    x=sec_state_labels,
    y=df_sec.literacy_rate,
    xaxis='x2',
    yaxis='y2',
    name="Secondary education",
)

data = [trace1, trace2]
layout = go.Layout(
    title='Literacy rate by state: elementary vs. secondary',
    # Domains are fractions of the figure: the left panel spans 0-45% of the
    # width, the right panel 55-100%; both use the lower 45% of the height.
    xaxis=dict(domain=[0, 0.45]),
    yaxis=dict(domain=[0, 0.45]),
    xaxis2=dict(domain=[0.55, 1]),
    yaxis2=dict(domain=[0, 0.45], anchor='x2'),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [8]:
# Lookup table: state name (upper-case, spelled as in df_ele.STATNAME)
# -> total number of districts in that state.
district_dic = {
    "WEST BENGAL": 23,
    "UTTARAKHAND": 13,
    "UTTAR PRADESH": 75,
    "TRIPURA": 8,
    "TELANGANA": 31,
    "TAMIL NADU": 32,
    "SIKKIM": 4,
    "RAJASTHAN": 33,
    "PUNJAB": 22,
    "PUDUCHERRY": 4,
    "ODISHA": 30,
    "DELHI": 11,
    "NAGALAND": 11,
    "MIZORAM": 8,
    "MEGHALAYA": 11,
    "MANIPUR": 16,
    "MAHARASHTRA": 36,
    "MADHYA PRADESH": 51,
    "LAKSHADWEEP": 1,
    "KERALA": 14,
    "KARNATAKA": 30,
    "JHARKHAND": 24,
    "JAMMU & KASHMIR": 22,
    "HIMACHAL PRADESH": 12,
    "HARYANA": 22,
    "GUJARAT": 33,
    "GOA": 2,
    "DAMAN & DIU": 2,
    "DADRA & NAGAR HAVELI": 1,
    "CHHATTISGARH": 27,
    "CHANDIGARH": 1,
    "BIHAR": 38,
    "ASSAM": 33,
    "ARUNACHAL PRADESH": 21,
    "ANDHRA PRADESH": 13,
    "A & N ISLANDS": 3,
}

In [9]:
# Attach each state's total district count from the lookup table, then express
# the DISTRICTS column as a percentage of that total.
df_ele['TOT_DISTRICT'] = df_ele.STATNAME.map(district_dic)
df_ele['PER_DIST'] = df_ele['DISTRICTS'].mul(100).div(df_ele['TOT_DISTRICT'])
df_ele.head(2)

Unnamed: 0,AC_YEAR,STATCD,STATNAME,DISTRICTS,BLOCKS,VILLAGES,CLUSTERS,TOTPOPULAT,P_URB_POP,POPULATION_0_6,...,TCHINV,TOTCLS1G,TOTCLS2G,TOTCLS3G,TOTCLS4G,TOTCLS5G,TOTCLS6G,TOTCLS7G,TOT_DISTRICT,PER_DIST
0,2015-16,1,JAMMU & KASHMIR,22,201,7263,1628,12549,20.05,16.01,...,1946,41171,62474,7878,564,495,24326,2001,22,100.0
1,2015-16,2,HIMACHAL PRADESH,12,124,10120,2243,6857,8.69,11.14,...,3053,38307,6930,6909,6498,5445,6151,2717,12,100.0


In [10]:
x = df_ele['STATNAME']

# One line+marker series: PER_DIST for every state, state name as hover text.
trace1 = go.Scatter(
    x=x,
    y=df_ele['PER_DIST'],
    mode="lines+markers",
    name="District count",
    text=df_ele['STATNAME'],
)

data = [trace1]
layout = {
    'title': 'Data reported from District (in %) from each state',
    # Vertical tick labels so the long state names do not overlap.
    'xaxis': {'ticklen': 5, 'tickangle': 90, 'zeroline': False},
    'yaxis': {'title': 'Data reported from District (in %)'},
}
fig = {'data': data, 'layout': layout}
iplot(fig)

In [11]:
# Overall / male / female literacy per state for the elementary and the
# secondary tables, all on one shared axis.
#
# NOTE: the secondary series are plotted against df_ele.STATNAME, i.e. the two
# tables are paired by row position. This relies on df_ele and df_sec listing
# the states in the same order (the unique() listings above agree); if either
# table is re-sorted or filtered, join them on a cleaned state name instead.
trace0 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.OVERALL_LI,
    mode="lines+markers",
    name='Elementary literacy',
    line=dict(
        color='rgba(255,10,10, 0.8)',
        width=1),
)
trace1 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.MALE_LIT,
    mode="lines+markers",
    name='Elementary male literacy',  # was misspelled 'Elelmentary'
    line=dict(
        width=1),
)
trace2 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.FEMALE_LIT,
    mode="lines+markers",
    name='Elementary female literacy',  # was misspelled 'Elelmentary'
    line=dict(
        width=1,
        dash='dash'),  # dash options include 'dash', 'dot', and 'dashdot'
)

trace3 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_sec.literacy_rate,
    mode="lines+markers",
    name='secondary literacy rate',
    line=dict(
        width=1,
        dash='dash'),
)
trace4 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_sec.male_literacy_rate,
    mode="lines+markers",
    name='secondary male literacy rate',
    line=dict(
        width=1,
        dash='dot'),
)
trace5 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_sec.female_literacy_rate,
    mode="lines+markers",
    name='secondary female literacy rate',
    line=dict(
        width=1,
        dash='dot'),
)
data = [trace0, trace1, trace2, trace3, trace4, trace5]

# Vertical tick labels keep the long state names readable.
layout = dict(
    title='Statewise literacy rate in India',
    xaxis=dict(
        ticklen=5,
        tickangle=90,
        zeroline=False),
    yaxis=dict(
        title='Statewise literacy rate'),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [12]:
# Population of the two age groups (left axis) against overall literacy
# (right axis), per state.
trace1 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.TOT_6_10_15,
    mode="lines+markers",
    name="Age Group 6 to 10",
    text=df_ele.STATNAME)
trace2 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.TOT_11_13_15,
    mode="lines+markers",
    name="Age Group 11 to 13",
    text=df_ele.STATNAME)
# Literacy is on a different scale, so it is bound to the secondary y axis.
trace3 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.OVERALL_LI,
    mode="lines+markers",
    name="Overall Literacy",
    text=df_ele.STATNAME,
    yaxis='y2')

data = [trace1, trace2, trace3]
layout = dict(
    title='Projected Population vs Literacy',
    xaxis=dict(
        ticklen=5,
        tickangle=90,
        zeroline=False),
    yaxis=dict(
        title='Projected Population'),
    yaxis2=dict(
        # `titlefont` is a deprecated layout attribute (dropped by newer
        # plotly releases); the title text and font are nested under `title`.
        title=dict(
            text='Overall Literacy',
            font=dict(color='rgb(148, 103, 189)')),
        tickfont=dict(color='rgb(148, 103, 189)'),
        # Draw on top of the primary y axis, with ticks on the right edge.
        overlaying='y',
        side='right'),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [13]:
# School counts per state (total, government, private) on the left axis
# against overall literacy on the right axis.
trace1 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.SCHTOT,
    mode="lines+markers",
    name="total schools",
    text=df_ele.STATNAME)
trace2 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.SCHTOTG,
    mode="lines+markers",
    name="Schools by Category: Government: Total",
    text=df_ele.STATNAME)
trace3 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.SCHTOTP,
    mode="lines+markers",
    name="Schools by Category: Private : Total",
    text=df_ele.STATNAME)
# Literacy is on a different scale, so it is bound to the secondary y axis.
trace4 = go.Scatter(
    x=df_ele.STATNAME,
    y=df_ele.OVERALL_LI,
    mode="lines+markers",
    name="Overall Literacy",
    text=df_ele.STATNAME,
    yaxis='y2')

data = [trace1, trace2, trace3, trace4]
layout = dict(
    title='School type',
    xaxis=dict(
        ticklen=5,
        tickangle=90,
        zeroline=False),
    yaxis=dict(
        title='School type'),
    yaxis2=dict(
        # `titlefont` is a deprecated layout attribute (dropped by newer
        # plotly releases); the title text and font are nested under `title`.
        title=dict(
            text='Overall Literacy',
            font=dict(color='rgb(148, 103, 189)')),
        tickfont=dict(color='rgb(148, 103, 189)'),
        # Draw on top of the primary y axis, with ticks on the right edge.
        overlaying='y',
        side='right'),
)

fig = dict(data=data, layout=layout)
iplot(fig)

In [14]:
# Three states with the highest overall literacy.
top_3_elem = df_ele.sort_values(by='OVERALL_LI', ascending=False).head(3)

# Take the four lowest rows and discard index label 35, leaving three.
# Label 35 corresponds to TELANGANA in the STATNAME listing; presumably its
# literacy value is a placeholder rather than a real figure -- TODO confirm.
bottom_3_elem = df_ele.sort_values(by='OVERALL_LI', ascending=True).head(4)
bottom_3_elem = bottom_3_elem.drop(35)

# The sex-ratio bar chart that was fused onto the line above has been removed
# from this cell: it read `top_bottom` before that frame is created, and the
# same chart is drawn in a later cell once `top_bottom` exists.

In [15]:
# Stack the highest- and lowest-literacy rows into one frame (row-wise concat,
# original index labels and column order preserved).
top_bottom = pd.concat((top_3_elem, bottom_3_elem), sort=False)
top_bottom

Unnamed: 0,AC_YEAR,STATCD,STATNAME,DISTRICTS,BLOCKS,VILLAGES,CLUSTERS,TOTPOPULAT,P_URB_POP,POPULATION_0_6,...,TCHINV,TOTCLS1G,TOTCLS2G,TOTCLS3G,TOTCLS4G,TOTCLS5G,TOTCLS6G,TOTCLS7G,TOT_DISTRICT,PER_DIST
31,2015-16,32,KERALA,14,166,1907,1375,33388,24.76,9.95,...,675,51419,43025,31335,5327,15449,17752,3936,14,100.0
30,2015-16,31,LAKSHADWEEP,1,3,10,9,64,41.86,11.0,...,209,139,150,49,15,28,11,0,1,100.0
14,2015-16,15,MIZORAM,8,36,851,169,1091,40.42,15.17,...,1386,6593,3267,0,4575,0,0,0,8,100.0
9,2015-16,10,BIHAR,38,537,40779,5633,103805,8.36,17.9,...,3614,147323,272919,11332,2481,475,23095,927,38,100.0
11,2015-16,12,ARUNACHAL PRADESH,20,99,2982,234,1383,16.48,14.66,...,504,8564,9020,820,202,304,2022,206,21,95.238095
7,2015-16,8,RAJASTHAN,33,302,41441,10594,68621,19.26,15.31,...,2701,125798,253514,120330,979,5189,86126,1720,33,100.0


In [16]:
# Rural share is the complement of the urban percentage (100 - P_URB_POP).
top_bottom['P_RUR_POP'] = top_bottom['P_URB_POP'].rsub(100)

In [18]:
x_data = top_bottom['STATNAME']

# Urban vs. rural percentage for each selected state, as grouped bars.
fig = go.Figure()
fig.add_trace(go.Bar(name='URBAN POPULATION', x=x_data, y=top_bottom['P_URB_POP']))
fig.add_trace(go.Bar(name='RURAL POPULATION', x=x_data, y=top_bottom['P_RUR_POP']))
# barmode='group' places the two bars of a state side by side.
fig.update_layout(barmode='group', title="POPULATION DISTRIBUTION STATEWISE")
fig.show()

In [19]:
x_data = top_bottom['STATNAME']

# Sex ratio of each selected state as a single bar series.
fig = go.Figure(data=[
    # Trace label corrected from the misspelled 'SEX RATION'.
    go.Bar(name='SEX RATIO', x=x_data, y=top_bottom['SEXRATIO'])
])
# barmode has no visible effect with a single series; kept for consistency
# with the other bar charts in this notebook.
fig.update_layout(barmode='group', title="SEX RATIO STATEWISE")
fig.show()

In [20]:
# Combined figure: element-wise sum of the P_SC_POP and P_ST_POP columns.
top_bottom['SC_ST_POP'] = top_bottom['P_SC_POP'].add(top_bottom['P_ST_POP'])

In [21]:
x_data = top_bottom['STATNAME']

# One bar series per column: combined SC+ST first, then SC and ST separately.
bar_specs = (
    ('SC AND ST POPULATION', 'SC_ST_POP'),
    ('SC POPULATION', 'P_SC_POP'),
    ('ST POPULATION', 'P_ST_POP'),
)
fig = go.Figure()
for bar_name, bar_col in bar_specs:
    fig.add_trace(go.Bar(name=bar_name, x=x_data, y=top_bottom[bar_col]))
# barmode='group' places the three bars of a state side by side.
fig.update_layout(barmode='group', title ="SC AND ST POPULATION STATEWISE")
fig.show()

In [22]:
# Number of children in the two age-group columns combined (6-10 and 11-13,
# i.e. ages 6 to 13).
top_bottom['SCHKIDS'] = top_bottom.TOT_6_10_15 + top_bottom.TOT_11_13_15
# Children per school: combined count divided by the total number of schools.
top_bottom['KIDSPERSCH'] = top_bottom.SCHKIDS / top_bottom.SCHTOT
x_data = top_bottom['STATNAME']

fig = go.Figure(data=[
    # Trace label corrected: it was a copy-paste leftover ('URBAN POPULATION')
    # although the series plotted is KIDSPERSCH.
    go.Bar(name='KIDS PER SCHOOL', x=x_data, y=top_bottom['KIDSPERSCH'])
])
# barmode has no visible effect with a single series; kept for consistency
# with the other bar charts in this notebook.
fig.update_layout(barmode='group', title="NUMBER OF KIDS PER SCHOOL STATEWISE")
fig.show()

In [23]:
# Mean across all states of each school category's share of total schools.
total_schools = df_ele['SCHTOT']
schtotg_avg = (df_ele['SCHTOTG'] / total_schools).mean()
schtotp_avg = (df_ele['SCHTOTP'] / total_schools).mean()
schtotm_avg = (df_ele['SCHTOTM'] / total_schools).mean()

# Add the same three share columns (<category>_P) to both the top-3 and the
# bottom-3 frames.
for subset in (top_3_elem, bottom_3_elem):
    for count_col in ('SCHTOTG', 'SCHTOTP', 'SCHTOTM'):
        subset[count_col + '_P'] = subset[count_col] / subset['SCHTOT']