# AICTE Approved Institutions Data Processing & Analysis

### Data Source: AICTE Website
### Relevance: 2012-2019

In [1]:
# importing necessary libraries
import pandas as pd
from impyute.imputation.cs import mice

In [2]:
# Read the AICTE sheet from the working directory into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer='AICTE_data_Sheet1.csv')

In [3]:
# Preview the first five rows (5 is also the pandas default for head()).
df.head(n=5)

Unnamed: 0,Year,Total Institutions,New Institutions,Closed Institutions,Total Intake,Girl's Enrolment,Boy's Enrolment,Faculty,#Students Passed,Placement
0,2012-13,10272,321,73,3463402,726105,1431295,392270,1380789.0,560640
1,2013-14,10300,157,108,3723719,571138,1618179,533352,1504802.0,613106
2,2014-15,10334,244,77,3961670,554665,1587283,682079,1585743.0,673837
3,2015-16,10329,174,123,3836181,553264,1532071,698554,1636939.0,701031
4,2016-17,10365,293,155,3702582,529928,1424844,692191,1709053.0,722494


In [4]:
# Show the dtype pandas inferred for each column when the CSV was read.
df.dtypes

Year                    object
Total Institutions       int64
New Institutions         int64
Closed Institutions      int64
Total Intake             int64
Girl's Enrolment         int64
Boy's Enrolment          int64
Faculty                  int64
#Students Passed       float64
Placement                int64
dtype: object

### Handling Missing Values

In [5]:
# Per-column count of missing entries (isna is the pandas alias of isnull).
df.isna().sum()

Year                   0
Total Institutions     0
New Institutions       0
Closed Institutions    0
Total Intake           0
Girl's Enrolment       0
Boy's Enrolment        0
Faculty                0
#Students Passed       1
Placement              0
dtype: int64

In [6]:
# Fill missing values with the MICE algorithm (impyute).
temp = df.drop(columns=["Year"])  # "Year" is a non-numeric label column, so exclude it
cols = temp.columns
# Only impute while gaps remain: this cell overwrites df in place, so on a re-run
# there is nothing left to fill (impyute is expected to reject NaN-free input -- verify).
if temp.isnull().values.any():
    # Re-attach temp's index and column labels to the imputed values; the assignment
    # below aligns on the index and would otherwise introduce NaNs whenever df's
    # index is not the default 0..n-1.
    imputed = pd.DataFrame(mice(temp.values), index=temp.index, columns=cols)
    df[cols] = imputed[cols]

In [7]:
# The imputed figures are head-counts, so snap every numeric column to whole numbers.
df = df.round(decimals=0)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Total Institutions,New Institutions,Closed Institutions,Total Intake,Girl's Enrolment,Boy's Enrolment,Faculty,#Students Passed,Placement
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,10346.571429,294.714286,99.142857,3661824.0,568144.428571,1468611.0,612032.857143,1639166.0,678216.714286
std,54.313464,130.342772,34.411377,203922.3,72234.813857,113047.0,116437.779537,240068.8,69375.440356
min,10272.0,157.0,53.0,3392834.0,517115.0,1316060.0,392270.0,1380789.0,560640.0
25%,10314.5,209.0,75.0,3507890.0,527362.0,1397694.0,560423.0,1515942.0,643471.5
50%,10334.0,293.0,105.0,3702582.0,553264.0,1431295.0,682079.0,1585743.0,701031.0
75%,10382.5,323.5,115.5,3779950.0,562901.5,1559677.0,695240.5,1672996.0,718534.0
max,10426.0,548.0,155.0,3961670.0,726105.0,1618179.0,698554.0,2129757.0,761835.0


In [9]:
# Display the whole frame after imputation and rounding.
df

Unnamed: 0,Year,Total Institutions,New Institutions,Closed Institutions,Total Intake,Girl's Enrolment,Boy's Enrolment,Faculty,#Students Passed,Placement
0,2012-13,10272.0,321.0,73.0,3463402.0,726105.0,1431295.0,392270.0,1380789.0,560640.0
1,2013-14,10300.0,157.0,108.0,3723719.0,571138.0,1618179.0,533352.0,1504802.0,613106.0
2,2014-15,10334.0,244.0,77.0,3961670.0,554665.0,1587283.0,682079.0,1585743.0,673837.0
3,2015-16,10329.0,174.0,123.0,3836181.0,553264.0,1532071.0,698554.0,1636939.0,701031.0
4,2016-17,10365.0,293.0,155.0,3702582.0,529928.0,1424844.0,692191.0,1709053.0,722494.0
5,2017-18,10400.0,326.0,105.0,3552377.0,524796.0,1370544.0,698290.0,1527081.0,714574.0
6,2018-19,10426.0,548.0,53.0,3392834.0,517115.0,1316060.0,587494.0,2129757.0,761835.0


### Finding Correlation

In [10]:
# Pairwise correlation of the numeric columns only. "Year" holds strings such as
# "2012-13"; pandas >= 2.0 no longer drops non-numeric columns inside corr() and
# raises on them instead, so select the numeric columns explicitly first.
df.select_dtypes(include="number").corr()

Unnamed: 0,Total Institutions,New Institutions,Closed Institutions,Total Intake,Girl's Enrolment,Boy's Enrolment,Faculty,#Students Passed,Placement
Total Institutions,1.0,0.682032,-0.082626,-0.321741,-0.788505,-0.677334,0.574206,0.77335,0.922434
New Institutions,0.682032,1.0,-0.567179,-0.762601,-0.141542,-0.8697,-0.138281,0.720775,0.471566
Closed Institutions,-0.082626,-0.567179,1.0,0.410971,-0.281173,0.223069,0.485692,-0.256133,0.137263
Total Intake,-0.321741,-0.762601,0.410971,1.0,-0.236256,0.819089,0.52821,-0.307436,-0.03184
Girl's Enrolment,-0.788505,-0.141542,-0.281173,-0.236256,1.0,0.116696,-0.846749,-0.60846,-0.884098
Boy's Enrolment,-0.677334,-0.8697,0.223069,0.819089,0.116696,1.0,0.025474,-0.514898,-0.482027
Faculty,0.574206,-0.138281,0.485692,0.52821,-0.846749,0.025474,1.0,0.282104,0.768813
#Students Passed,0.77335,0.720775,-0.256133,-0.307436,-0.60846,-0.514898,0.282104,1.0,0.794087
Placement,0.922434,0.471566,0.137263,-0.03184,-0.884098,-0.482027,0.768813,0.794087,1.0


> Placements are highly correlated with the total number of institutions (0.922434).

> Placements are highly correlated with the total number of faculty members (0.768813).



In [11]:
# Re-display the frame before deriving new features (same content as the previous display).
df

Unnamed: 0,Year,Total Institutions,New Institutions,Closed Institutions,Total Intake,Girl's Enrolment,Boy's Enrolment,Faculty,#Students Passed,Placement
0,2012-13,10272.0,321.0,73.0,3463402.0,726105.0,1431295.0,392270.0,1380789.0,560640.0
1,2013-14,10300.0,157.0,108.0,3723719.0,571138.0,1618179.0,533352.0,1504802.0,613106.0
2,2014-15,10334.0,244.0,77.0,3961670.0,554665.0,1587283.0,682079.0,1585743.0,673837.0
3,2015-16,10329.0,174.0,123.0,3836181.0,553264.0,1532071.0,698554.0,1636939.0,701031.0
4,2016-17,10365.0,293.0,155.0,3702582.0,529928.0,1424844.0,692191.0,1709053.0,722494.0
5,2017-18,10400.0,326.0,105.0,3552377.0,524796.0,1370544.0,698290.0,1527081.0,714574.0
6,2018-19,10426.0,548.0,53.0,3392834.0,517115.0,1316060.0,587494.0,2129757.0,761835.0


### Feature Extraction

In [12]:
# Students who passed but were not placed: passed minus placed, row by row.
df["Not_placed"] = df["#Students Passed"].sub(df["Placement"])

In [13]:
# Share of passing students who were placed: placed divided by passed, row by row.
df["placement_ratio"] = df["Placement"].div(df["#Students Passed"])

In [14]:
# Display the frame including the derived Not_placed and placement_ratio columns.
df

Unnamed: 0,Year,Total Institutions,New Institutions,Closed Institutions,Total Intake,Girl's Enrolment,Boy's Enrolment,Faculty,#Students Passed,Placement,Not_placed,placement_ratio
0,2012-13,10272.0,321.0,73.0,3463402.0,726105.0,1431295.0,392270.0,1380789.0,560640.0,820149.0,0.406029
1,2013-14,10300.0,157.0,108.0,3723719.0,571138.0,1618179.0,533352.0,1504802.0,613106.0,891696.0,0.407433
2,2014-15,10334.0,244.0,77.0,3961670.0,554665.0,1587283.0,682079.0,1585743.0,673837.0,911906.0,0.424935
3,2015-16,10329.0,174.0,123.0,3836181.0,553264.0,1532071.0,698554.0,1636939.0,701031.0,935908.0,0.428257
4,2016-17,10365.0,293.0,155.0,3702582.0,529928.0,1424844.0,692191.0,1709053.0,722494.0,986559.0,0.422745
5,2017-18,10400.0,326.0,105.0,3552377.0,524796.0,1370544.0,698290.0,1527081.0,714574.0,812507.0,0.467935
6,2018-19,10426.0,548.0,53.0,3392834.0,517115.0,1316060.0,587494.0,2129757.0,761835.0,1367922.0,0.35771


In [15]:
# Persist the processed frame next to the notebook; the row index is not written out.
df.to_csv(path_or_buf='preprocessed_aicte.csv', index=False)