In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
import hypertools as hyp
%matplotlib notebook

In [2]:
#load in the data

fname = 'At Risk Students For Dartmouth.xlsx'
sheet1 = 'Year 16-17'
sheet2 = 'Year 17-18 9.25.17'
columns = ('id', 'grade', 'age', 'school', 'sex', 'homeless', 'disadvantaged', 'specialneeds',
           'excused1', 'unexcused1', 'tardy1',
           'excused2', 'unexcused2', 'tardy2',
           'excused3', 'unexcused3', 'tardy3',
           'excused4', 'unexcused4', 'tardy4')
y1_data = pd.read_excel(fname, sheetname=sheet1, skiprows=[0], names=columns)
y2_data = pd.read_excel(fname, sheetname=sheet2, skiprows=[0], names=columns)

#use student IDs as the index
y1_data.set_index('id', inplace=True)
y2_data.set_index('id', inplace=True)

In [3]:
pd.unique(y1_data['specialneeds'])

array([nan, 504, 'IEP'], dtype=object)

In [4]:
#do some data cleaning

#in "disadvantaged" column, replace "YES" with 1 and NaN with 0
y1_data['disadvantaged'] = y1_data['disadvantaged'].map({np.nan: 0, 'YES': 1})
y2_data['disadvantaged'] = y2_data['disadvantaged'].map({np.nan: 0, 'YES': 1})

#in "specialneeds" column, 
y1_data['specialneeds'] = y1_data['specialneeds'].map({np.nan: 0, 504: '504', 'IEP': 'IEP'})
y2_data['specialneeds'] = y2_data['specialneeds'].map({np.nan: 0, 504: '504', 'IEP': 'IEP'})

#replace '---' with 0 (Fourth marking period columns)
y1_data.replace('---', 0, inplace=True)
y2_data.replace('---', 0, inplace=True)

In [5]:
y1_data.head()

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,excused2,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
300053,10,16,Hartford High School/HACTC,F,N,1,0,22,0,0,0,0,0,0,0,1,2,2,0
300339,12,18,Hartford High School/HACTC,F,N,0,0,4,2,4,13,9,6,27,15,5,36,15,7
300340,11,17,Hartford High School/HACTC,M,N,0,0,1,0,0,5,1,1,3,0,5,24,0,1
300344,9,15,Hartford High School/HACTC,M,N,0,0,0,0,0,15,0,0,0,1,0,3,0,0
300345,8,15,Hartford Memorial Middle School,M,N,0,504,0,0,0,3,0,1,4,0,1,5,0,3


In [7]:
#in “disadvantaged” column, replace “YES” with 1 and NaN with 0
y1_data['disadvantaged'] = y1_data['disadvantaged'].map({np.nan: 0, 'YES': 1})
y2_data['disadvantaged'] = y2_data['disadvantaged'].map({np.nan: 0, 'YES': 1})

In [9]:
#in “specialneeds” column,
y1_data['specialneeds'] = y1_data['specialneeds'].map({np.nan: 0, 504: '504', 'IEP': 'IEP'})
y2_data['specialneeds'] = y2_data['specialneeds'].map({np.nan: 0, 504: '504', 'IEP': 'IEP'})

In [11]:
#replace ‘---’ with 0 (Fourth marking period columns)
y1_data.replace('---', 0, inplace=True)
y2_data.replace('---', 0, inplace=True)

In [13]:
#Create dummy variables for male and female, create new dataframe
y1_data_sex = pd.get_dummies(y1_data['sex'])
y1_data_new = y1_data.join(y1_data_sex)
y1_data_new

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,...,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4,F,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300053,10,16,Hartford High School/HACTC,F,N,,,22,0,0,...,0,0,0,0,1,2,2,0,1,0
300339,12,18,Hartford High School/HACTC,F,N,,,4,2,4,...,9,6,27,15,5,36,15,7,1,0
300340,11,17,Hartford High School/HACTC,M,N,,,1,0,0,...,1,1,3,0,5,24,0,1,0,1
300344,9,15,Hartford High School/HACTC,M,N,,,0,0,0,...,0,0,0,1,0,3,0,0,0,1
300345,8,15,Hartford Memorial Middle School,M,N,,,0,0,0,...,0,1,4,0,1,5,0,3,0,1
300628,11,17,Hartford High School/HACTC,F,N,,,2,1,1,...,0,3,6,0,5,2,4,11,1,0
300630,9,15,Hartford High School/HACTC,M,N,,,1,0,1,...,0,0,0,1,0,2,1,2,0,1
300631,8,14,Hartford Memorial Middle School,F,N,,,0,0,1,...,0,0,2,1,0,0,2,0,1,0
300908,11,18,Hartford High School/HACTC,F,N,,,1,0,0,...,0,1,3,0,3,0,9,6,1,0
300909,9,17,Hartford High School/HACTC,M,N,,IEP,27,0,1,...,0,0,5,0,0,15,0,0,0,1


In [14]:
#Drop all strings in grade column
y1_data_new.loc[list(map(lambda x: x not in ['K', 'AW', 'PA', 'PD', 'PP'], y1_data_new['grade']))]

Unnamed: 0_level_0,grade,age,school,sex,homeless,disadvantaged,specialneeds,excused1,unexcused1,tardy1,...,unexcused2,tardy2,excused3,unexcused3,tardy3,excused4,unexcused4,tardy4,F,M
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300053,10,16,Hartford High School/HACTC,F,N,,,22,0,0,...,0,0,0,0,1,2,2,0,1,0
300339,12,18,Hartford High School/HACTC,F,N,,,4,2,4,...,9,6,27,15,5,36,15,7,1,0
300340,11,17,Hartford High School/HACTC,M,N,,,1,0,0,...,1,1,3,0,5,24,0,1,0,1
300344,9,15,Hartford High School/HACTC,M,N,,,0,0,0,...,0,0,0,1,0,3,0,0,0,1
300345,8,15,Hartford Memorial Middle School,M,N,,,0,0,0,...,0,1,4,0,1,5,0,3,0,1
300628,11,17,Hartford High School/HACTC,F,N,,,2,1,1,...,0,3,6,0,5,2,4,11,1,0
300630,9,15,Hartford High School/HACTC,M,N,,,1,0,1,...,0,0,0,1,0,2,1,2,0,1
300631,8,14,Hartford Memorial Middle School,F,N,,,0,0,1,...,0,0,2,1,0,0,2,0,1,0
300908,11,18,Hartford High School/HACTC,F,N,,,1,0,0,...,0,1,3,0,3,0,9,6,1,0
300909,9,17,Hartford High School/HACTC,M,N,,IEP,27,0,1,...,0,0,5,0,0,15,0,0,0,1


In [20]:
#Create new year-long columns
y1_data_new['excused']=y1_data_new['excused1'] + y1_data_new['excused2'] + y1_data_new['excused3']
y1_data_new['unexcused']=y1_data_new['unexcused1'] + y1_data_new['unexcused2'] + y1_data_new['unexcused3']
y1_data_new['tardy']=y1_data_new['tardy1'] + y1_data_new['tardy2'] + y1_data_new['tardy3']