In [64]:
# This program will combine and clean the high school performance data, as well as add location information

# dependencies and setup
import requests
import pandas as pd
import numpy as np

# input files (NYC Open Data source pages are listed next to each path)
# https://data.cityofnewyork.us/Education/2013-2014-DOE-High-School-Performance-Directory/42et-jh9v
DOE_2013_2014_report = "Resources/2013_-_2014_DOE_High_School_Performance-Directory.csv"
# https://data.cityofnewyork.us/Education/2014-2015-DOE-High-School-Performance-Directory/xahu-rkwn
DOE_2014_2015_report = "Resources/2014-2015_DOE_High_School_Performance-Directory.csv"
# https://data.cityofnewyork.us/Education/2016-DOE-High-School-Performance-Directory/qvir-knu3
DOE_2016_report = "Resources/2016_DOE_High_School_Performance__Directory.csv"
# https://data.cityofnewyork.us/Education/2010-2016-School-Safety-Report/qybk-bjjc
school_safety_report = "Resources/2010_-_2016_School_Safety_Report.csv"
precinct_key = "Resources/Precinct_Key.xlsx"

# load every CSV report into its own data frame, in the order listed above
DOE_2013_2014, DOE_2014_2015, DOE_2016, school_safety = (
    pd.read_csv(csv_path)
    for csv_path in (
        DOE_2013_2014_report,
        DOE_2014_2015_report,
        DOE_2016_report,
        school_safety_report,
    )
)

# the precinct key is an Excel workbook, so it needs the Excel reader
precincts = pd.read_excel(precinct_key)

In [65]:
# data cleaning

# Rename the headers first, then select the needed columns by their final names.
# `rename` ignores mapping keys that are not present, so this cell can be re-run
# on the already-cleaned frames without a KeyError. The row index is left alone:
# the frames are only ever joined on the "DBN" column.
DOE_2013_2014 = DOE_2013_2014.rename(
    columns={
        "graduation 2010-11": "graduation_rate_2011",
        "college enroll 2010-11": "college_career_rate_2011",
        "graduation 2011-12": "graduation_rate_2012",
        "college enroll 2011-12": "college_career_rate_2012",
    }
)[
    [
        "DBN",
        "graduation_rate_2011",
        "college_career_rate_2011",
        "graduation_rate_2012",
        "college_career_rate_2012",
    ]
]

# the later reports use a lowercase "dbn" header; align it with the 2013-2014 report
DOE_2014_2015 = DOE_2014_2015.rename(columns={"dbn": "DBN"})[
    ["DBN", "graduation_rate_2013", "college_career_rate_2013"]
]
DOE_2016 = DOE_2016.rename(columns={"dbn": "DBN"})[
    ["DBN", "graduation_rate_2014", "college_career_rate_2014"]
]

# the precinct key keeps all of its columns; only the join key is renamed
precincts = precincts.rename(columns={"dbn": "DBN"})

In [66]:
# combine the three yearly reports, keeping only schools (DBN) present in all of them
merge0 = DOE_2013_2014.merge(DOE_2014_2015, how="inner", on="DBN")
merge1 = merge0.merge(DOE_2016, how="inner", on="DBN")

In [67]:
# convert percent data to float
# Each rate column holds strings ending in "%"; strip the sign and divide by 100
# so the values are stored as fractions.
percent_columns = [
    f"{metric}_{year}"
    for year in (2011, 2012, 2013, 2014)
    for metric in ("graduation_rate", "college_career_rate")
]

for column in percent_columns:
    # A column that is already numeric has either been converted by an earlier
    # run of this cell or was read as all-missing; `.str` would raise on it and
    # dividing again would rescale it, so leave it untouched.
    # NOTE(review): assumes a numeric column is already a fraction -- confirm
    # if a source file ever stores these rates as plain numbers.
    if pd.api.types.is_numeric_dtype(merge1[column]):
        continue
    merge1[column] = merge1[column].str.rstrip("%").astype("float") / 100

In [68]:
# attach geographic identifiers taken from the 2013-14 rows of the school safety report
geo_columns = ["School Year", "DBN", "Community Board", "Council District ", "Census Tract", "NTA"]
is_2013_14 = school_safety["School Year"] == "2013-14"
geo_safety_data = school_safety.loc[is_2013_14, geo_columns].drop(columns="School Year")

# join on DBN, then strip surrounding white space from the NTA values
merge2 = merge1.merge(geo_safety_data, on="DBN").assign(
    NTA=lambda frame: frame["NTA"].str.strip()
)

# add the columns from the precinct key
merge3 = merge2.merge(precincts, on="DBN")

In [69]:
# write the combined data set to disk, leaving out the row index
merge3.to_csv(path_or_buf="Resources/Combined_HS_Performance_Data_v2.csv", index=False)