In [319]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hypertools as hyp
import seaborn as sns
%matplotlib notebook

In [320]:
# Load both school years of attendance records from the Excel workbook.

fname = 'At Risk Students For Dartmouth.xlsx'
sheet1 = 'Year 16-17'
sheet2 = 'Year 17-18 9.25.17'

# Column labels applied to every sheet (passed to read_excel via `names`).
columns = ('id', 'grade', 'age', 'school', 'sex', 'homeless', 'disadvantaged', 'specialneeds',
           'excused1', 'unexcused1', 'tardy1',
           'excused2', 'unexcused2', 'tardy2',
           'excused3', 'unexcused3', 'tardy3',
           'excused4', 'unexcused4', 'tardy4')

# `sheet_name` replaces the deprecated `sheetname` keyword, which current
# pandas releases no longer accept. Row 0 of each sheet is skipped, and the
# student ID column becomes the index.
y1_data = pd.read_excel(
    fname, sheet_name=sheet1, skiprows=[0], names=list(columns)
).set_index('id')
y2_data = pd.read_excel(
    fname, sheet_name=sheet2, skiprows=[0], names=list(columns)
).set_index('id')

In [321]:
# Basic cleaning, applied the same way to both school years.

# "disadvantaged": recode "YES" -> 1 and missing (NaN) -> 0.
disadvantaged_codes = {np.nan: 0, 'YES': 1}
for frame in (y1_data, y2_data):
    frame['disadvantaged'] = frame['disadvantaged'].map(disadvantaged_codes)

# TODO: "specialneeds" is left exactly as loaded; an earlier draft recoding
# of this column was disabled and is not applied here.

# '---' placeholders (fourth marking period columns) become 0.
for frame in (y1_data, y2_data):
    frame.replace('---', 0, inplace=True)

In [314]:
# Preview the first five rows of the 'Year 16-17' frame.
y1_data.head(n=5)

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,excused2,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
300053,10,16,Hartford High School/HACTC,F,N,1,,22,0,0,0,0,0,0,0,1,2,2,0
300339,12,18,Hartford High School/HACTC,F,N,0,,4,2,4,13,9,6,27,15,5,36,15,7
300340,11,17,Hartford High School/HACTC,M,N,0,,1,0,0,5,1,1,3,0,5,24,0,1
300344,9,15,Hartford High School/HACTC,M,N,0,,0,0,0,15,0,0,0,1,0,3,0,0
300345,8,15,Hartford Memorial Middle School,M,N,0,504.0,0,0,0,3,0,1,4,0,1,5,0,3


In [315]:
# Distinct values present in the "specialneeds" column (NaN included).
y1_data['specialneeds'].unique()

array([nan, 504, 'IEP'], dtype=object)

In [316]:
# Total unexcused absences over the four marking periods. Plain `+` is kept
# so that a missing value in any period propagates to the total.
y1_data['total_debauchery'] = (
    y1_data['unexcused1']
    + y1_data['unexcused2']
    + y1_data['unexcused3']
    + y1_data['unexcused4']
)

# Show a preview rather than dumping the whole frame into the notebook.
y1_data.head(10)

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,excused2,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4,total_debauchery
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
300053,10,16,Hartford High School/HACTC,F,N,1,,22,0,0,0,0,0,0,0,1,2,2,0,2
300339,12,18,Hartford High School/HACTC,F,N,0,,4,2,4,13,9,6,27,15,5,36,15,7,41
300340,11,17,Hartford High School/HACTC,M,N,0,,1,0,0,5,1,1,3,0,5,24,0,1,1
300344,9,15,Hartford High School/HACTC,M,N,0,,0,0,0,15,0,0,0,1,0,3,0,0,1
300345,8,15,Hartford Memorial Middle School,M,N,0,504,0,0,0,3,0,1,4,0,1,5,0,3,0
300628,11,17,Hartford High School/HACTC,F,N,0,,2,1,1,0,0,3,6,0,5,2,4,11,5
300630,9,15,Hartford High School/HACTC,M,N,0,504,1,0,1,0,0,0,0,1,0,2,1,2,2
300631,8,14,Hartford Memorial Middle School,F,N,0,,0,0,1,1,0,0,2,1,0,0,2,0,3
300908,11,18,Hartford High School/HACTC,F,N,0,,1,0,0,4,0,1,3,0,3,0,9,6,9
300909,9,17,Hartford High School/HACTC,M,N,0,IEP,27,0,1,16,0,0,5,0,0,15,0,0,0


In [317]:
# Split the year-1 frame by the 0/1 "disadvantaged" flag.
# Boolean masks select rows positionally, so a student ID that appears more
# than once in the index cannot cause unrelated rows to be removed (which
# label-based drop() would do). Rows whose flag is neither 0 nor 1 end up in
# neither group instead of in both. `.copy()` makes the subsets independent
# of y1_data.
df_adv = y1_data[y1_data['disadvantaged'] == 0].copy()
df_disadv = y1_data[y1_data['disadvantaged'] == 1].copy()

In [318]:
# Preview the disadvantaged subset instead of dumping the whole frame.
df_disadv.head(10)

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,excused2,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4,total_debauchery
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
300053,10,16,Hartford High School/HACTC,F,N,1,,22,0,0,0,0,0,0,0,1,2,2,0,2
301467,10,16,Hartford High School/HACTC,F,N,1,,37,0,4,15,1,4,81,4,6,62,6,8,11
302028,11,17,Hartford High School/HACTC,M,N,1,,6,0,1,0,0,1,4,0,0,0,0,0,0
302316,8,15,Hartford Memorial Middle School,F,N,1,504,1,1,0,7,0,4,0,0,2,0,0,3,1
303158,9,15,Hartford High School/HACTC,M,N,1,IEP,1,4,5,0,2,3,1,14,4,2,24,11,44
303725,10,16,Hartford High School/HACTC,M,N,1,IEP,4,0,2,16,6,3,5,11,2,6,7,1,24
304003,10,16,Hartford High School/HACTC,M,N,1,IEP,3,0,0,15,1,0,10,0,0,10,0,1,1
304280,12,19,Hartford High School/HACTC,M,N,1,IEP,16,2,4,17,3,3,26,5,5,8,4,1,14
304559,12,18,Hartford High School/HACTC,M,N,1,IEP,25,0,0,2,3,0,10,0,0,11,4,2,7
304840,10,17,Hartford High School/HACTC,M,N,1,504,6,0,2,22,1,3,11,0,2,19,4,2,5


In [305]:
variables = ['age', 'excused1', 'unexcused1']

# Pairwise correlations of the selected columns within each group
corr1 = df_disadv[variables].corr()
corr2 = df_adv[variables].corr()

# Draw each heatmap on its own axes. Calling sns.heatmap twice without `ax`
# paints both onto the same axes, so the second hides the first. A shared
# colour scale (-1..1) keeps the two panels directly comparable.
fig, (ax_disadv, ax_adv) = plt.subplots(1, 2, figsize=(10, 4))
sns.heatmap(corr1, vmin=-1, vmax=1, ax=ax_disadv)
ax_disadv.set_title('Disadvantaged')
sns.heatmap(corr2, vmin=-1, vmax=1, ax=ax_adv)
ax_adv.set_title('Not disadvantaged')
fig.tight_layout()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fd2b97b0898>

In [306]:
# Use a dedicated figure: with the interactive notebook backend, plotting
# without an explicit figure draws into whichever figure is still active.
fig, ax = plt.subplots()
sns.boxplot(data=y1_data, x='disadvantaged', y='total_debauchery', order=[0, 1], ax=ax)
ax.set_ylim(0, 100)  # clip the y-axis to the 0-100 range
ax.set_xlabel('disadvantaged (0 = no, 1 = yes)')
ax.set_ylabel('total unexcused absences')
ax.set_title('Unexcused absences by disadvantaged status');

(0, 100)

In [322]:
# Build a separate frame with letter labels for the 0/1 flag
# (0 -> 'A', 1 -> 'D'). `assign` returns a new DataFrame, so y1_data keeps
# its numeric codes and re-running this cell gives the same result. Plain
# `data = y1_data` would alias the frame, and a second run would map the
# labels to NaN.
data = y1_data.assign(disadvantaged=y1_data['disadvantaged'].map({0: 'A', 1: 'D'}))
data.head(10)

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,excused2,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
300053,10,16,Hartford High School/HACTC,F,N,D,,22,0,0,0,0,0,0,0,1,2,2,0
300339,12,18,Hartford High School/HACTC,F,N,A,,4,2,4,13,9,6,27,15,5,36,15,7
300340,11,17,Hartford High School/HACTC,M,N,A,,1,0,0,5,1,1,3,0,5,24,0,1
300344,9,15,Hartford High School/HACTC,M,N,A,,0,0,0,15,0,0,0,1,0,3,0,0
300345,8,15,Hartford Memorial Middle School,M,N,A,504,0,0,0,3,0,1,4,0,1,5,0,3
300628,11,17,Hartford High School/HACTC,F,N,A,,2,1,1,0,0,3,6,0,5,2,4,11
300630,9,15,Hartford High School/HACTC,M,N,A,504,1,0,1,0,0,0,0,1,0,2,1,2
300631,8,14,Hartford Memorial Middle School,F,N,A,,0,0,1,1,0,0,2,1,0,0,2,0
300908,11,18,Hartford High School/HACTC,F,N,A,,1,0,0,4,0,1,3,0,3,0,9,6
300909,9,17,Hartford High School/HACTC,M,N,A,IEP,27,0,1,16,0,0,5,0,0,15,0,0


In [326]:
# Overlay the distributions of total unexcused absences for the two groups
# on one explicit set of axes, labelled so they can be told apart.
fig, ax = plt.subplots()
sns.distplot(df_adv['total_debauchery'], bins=20, label='A', ax=ax)
sns.distplot(df_disadv['total_debauchery'], bins=20, label='D', ax=ax)
ax.set_xlim(0, 100)  # restrict the x-axis to the 0-100 range
ax.set_title('Total unexcused absences by group')
ax.legend();

<IPython.core.display.Javascript object>

(0, 100)

In [309]:
# One distribution per group label, overlaid on a single set of axes.
fig, ax = plt.subplots()
for d in ['A', 'D']:
    group_totals = data.loc[data['disadvantaged'] == d, 'total_debauchery']
    sns.distplot(group_totals, bins=20, label=d, ax=ax)

# `sns.plt` is not available in current seaborn releases; use the
# matplotlib objects directly.
ax.legend()
plt.show()

In [294]:
# Seaborn appearance settings: 'muted' colour palette and 'ticks' axes style.
sns.set_palette(palette='muted')
sns.set_style(style='ticks')

# Restrict the current axes' x-range to 0-100.
plt.xlim(0, 100)

(0, 100)