In [1]:
import pandas as pd
import numpy as np

files = ["ap_2010.csv", "class_size.csv", 
         "demographics.csv", "graduation.csv", 
         "hs_directory.csv", "math_test_results.csv", 
         "sat_results.csv"]

data = {}

for f in files:
    d = pd.read_csv("schools/{0}".format(f))
    data[f.replace(".csv", "")] = d

In [2]:
data.keys()

dict_keys(['hs_directory', 'math_test_results', 'graduation', 'sat_results', 'ap_2010', 'class_size', 'demographics'])

In [3]:
for key, val in data.items():
    print('--'* 30 + '\n' + key + '\n:')
    print(val.head(2))

------------------------------------------------------------
hs_directory
:
      dbn                          school_name      boro building_code  \
0  17K548  Brooklyn School for Music & Theatre  Brooklyn          K440   
1  09X543     High School for Violin and Dance     Bronx          X400   

   phone_number    fax_number grade_span_min  grade_span_max  \
0  718-230-6250  718-230-6262              9              12   
1  718-842-0687  718-589-9849              9              12   

  expgrade_span_min  expgrade_span_max  \
0               NaN                NaN   
1               NaN                NaN   

                         ...                          \
0                        ...                           
1                        ...                           

                                          priority02  \
0                    Then to New York City residents   
1  Then to New York City residents who attend an ...   

                            priority03     

In [191]:
data["class_size"]["DBN"] = data["class_size"].apply(lambda x: "{0:02d}{1}".format(x["CSD"], x["SCHOOL CODE"]), axis=1)
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]

In [192]:
survey1 = pd.read_excel(open('schools/survey_all.xlsx', 'rb'), 
                        sheetname = 'Sheet1', 
                        skiprows = [0, 1])
survey2 = pd.read_excel(open('schools/survey_d75.xlsx', 'rb'), 
                        sheetname = 'Sheet1', 
                        skiprows = [0, 1])

survey1['d75'] = False
survey2['d75'] = True
survey = pandas.concat([survey1, survey2], axis=0)

In [193]:
print(len(survey1.columns), len(survey2.columns))

1938 1769


In [194]:
survey.head()

Unnamed: 0,N_p,N_s,N_t,aca_p_11,aca_s_11,aca_t_11,aca_tot_11,com_p_11,com_s_11,com_t_11,...,t_q8c_1,t_q8c_2,t_q8c_3,t_q8c_4,t_q9,t_q9_1,t_q9_2,t_q9_3,t_q9_4,t_q9_5
0,90.0,,22.0,7.8,,7.9,7.9,7.6,,7.8,...,29.0,67.0,5.0,0.0,,5.0,14.0,52.0,24.0,5.0
1,161.0,,34.0,7.8,,9.1,8.4,7.6,,8.5,...,74.0,21.0,6.0,0.0,,3.0,6.0,3.0,78.0,9.0
2,367.0,,42.0,8.6,,7.5,8.0,8.3,,6.3,...,33.0,35.0,20.0,13.0,,3.0,5.0,16.0,70.0,5.0
3,151.0,145.0,29.0,8.5,7.4,7.8,7.9,8.2,5.9,6.2,...,21.0,45.0,28.0,7.0,,0.0,18.0,32.0,39.0,11.0
4,90.0,,23.0,7.9,,8.1,8.0,7.9,,7.3,...,59.0,36.0,5.0,0.0,,10.0,5.0,10.0,60.0,15.0


In [195]:
survey["DBN"] = survey["dbn"]

survey_fields = ["DBN", 
                 "rr_s", "rr_t", "rr_p", 
                 "N_s", "N_t", "N_p", 
                 "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11", 
                 "saf_t_11", "com_t_11", "eng_t_10", "aca_t_11", 
                 "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11", 
                 "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"
                ]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
survey.shape

(1702, 23)

## condensing dataset

In [196]:
# condense class_size
data['class_size'].head()

Unnamed: 0,CSD,BOROUGH,SCHOOL CODE,SCHOOL NAME,GRADE,PROGRAM TYPE,CORE SUBJECT (MS CORE and 9-12 ONLY),CORE COURSE (MS CORE and 9-12 ONLY),SERVICE CATEGORY(K-9* ONLY),NUMBER OF STUDENTS / SEATS FILLED,NUMBER OF SECTIONS,AVERAGE CLASS SIZE,SIZE OF SMALLEST CLASS,SIZE OF LARGEST CLASS,DATA SOURCE,SCHOOLWIDE PUPIL-TEACHER RATIO,DBN
0,1,M,M015,P.S. 015 Roberto Clemente,0K,GEN ED,-,-,-,19.0,1.0,19.0,19.0,19.0,ATS,,01M015
1,1,M,M015,P.S. 015 Roberto Clemente,0K,CTT,-,-,-,21.0,1.0,21.0,21.0,21.0,ATS,,01M015
2,1,M,M015,P.S. 015 Roberto Clemente,01,GEN ED,-,-,-,17.0,1.0,17.0,17.0,17.0,ATS,,01M015
3,1,M,M015,P.S. 015 Roberto Clemente,01,CTT,-,-,-,17.0,1.0,17.0,17.0,17.0,ATS,,01M015
4,1,M,M015,P.S. 015 Roberto Clemente,02,GEN ED,-,-,-,15.0,1.0,15.0,15.0,15.0,ATS,,01M015


In [197]:
for key in data.keys():
    print(key + ': \n'  + 
          str(len(data[key].index)) + ' rows; ' + 
          str(data[key].DBN.nunique())  + ' unique DBN'
         )

demographics: 
10075 rows; 1594 unique DBN
sat_results: 
478 rows; 478 unique DBN
survey: 
1702 rows; 1702 unique DBN
graduation: 
25096 rows; 423 unique DBN
ap_2010: 
258 rows; 257 unique DBN
class_size: 
27611 rows; 1487 unique DBN
hs_directory: 
435 rows; 435 unique DBN
math_test_results: 
28478 rows; 1132 unique DBN


need to condense these datasets: demographics, graduation, ap_2010, class_size, math_test_results. The goal is to aggregate data to one row per school.

In [198]:
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(np.mean)

In [199]:
class_size.head()

Unnamed: 0_level_0,CSD,NUMBER OF STUDENTS / SEATS FILLED,NUMBER OF SECTIONS,AVERAGE CLASS SIZE,SIZE OF SMALLEST CLASS,SIZE OF LARGEST CLASS,SCHOOLWIDE PUPIL-TEACHER RATIO
DBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01M292,1,88.0,4.0,22.564286,18.5,26.571429,
01M332,1,46.0,2.0,22.0,21.0,23.5,
01M378,1,33.0,1.0,33.0,33.0,33.0,
01M448,1,105.6875,4.75,22.23125,18.25,27.0625,
01M450,1,57.6,2.733333,21.2,19.4,22.866667,


In [200]:
class_size.reset_index(inplace=True)
data['class_size'] = class_size

In [201]:
# condensing demographics
data['demographics'].head()

Unnamed: 0,DBN,Name,schoolyear,fl_percent,frl_percent,total_enrollment,prek,k,grade1,grade2,...,black_num,black_per,hispanic_num,hispanic_per,white_num,white_per,male_num,male_per,female_num,female_per
0,01M015,P.S. 015 ROBERTO CLEMENTE,20052006,89.4,,281,15,36,40,33,...,74,26.3,189,67.3,5,1.8,158.0,56.2,123.0,43.8
1,01M015,P.S. 015 ROBERTO CLEMENTE,20062007,89.4,,243,15,29,39,38,...,68,28.0,153,63.0,4,1.6,140.0,57.6,103.0,42.4
2,01M015,P.S. 015 ROBERTO CLEMENTE,20072008,89.4,,261,18,43,39,36,...,77,29.5,157,60.2,7,2.7,143.0,54.8,118.0,45.2
3,01M015,P.S. 015 ROBERTO CLEMENTE,20082009,89.4,,252,17,37,44,32,...,75,29.8,149,59.1,7,2.8,149.0,59.1,103.0,40.9
4,01M015,P.S. 015 ROBERTO CLEMENTE,20092010,,96.5,208,16,40,28,32,...,67,32.2,118,56.7,6,2.9,124.0,59.6,84.0,40.4


In [202]:
demographics = data["demographics"]
demographics = demographics[demographics["schoolyear"] == 20112012]

In [203]:
type(demographics)

pandas.core.frame.DataFrame

In [204]:
demographics = demographics.groupby(['DBN', 'Name']).mean().reset_index()

In [205]:
data['demographics'] = demographics

In [206]:
# condense gradutaion dataset 
data['graduation'].head()

Unnamed: 0,Demographic,DBN,School Name,Cohort,Total Cohort,Total Grads - n,Total Grads - % of cohort,Total Regents - n,Total Regents - % of cohort,Total Regents - % of grads,...,Regents w/o Advanced - n,Regents w/o Advanced - % of cohort,Regents w/o Advanced - % of grads,Local - n,Local - % of cohort,Local - % of grads,Still Enrolled - n,Still Enrolled - % of cohort,Dropped Out - n,Dropped Out - % of cohort
0,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2003,5,s,s,s,s,s,...,s,s,s,s,s,s,s,s,s,s
1,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2004,55,37,67.3%,17,30.9%,45.9%,...,17,30.9%,45.9%,20,36.4%,54.1%,15,27.3%,3,5.5%
2,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2005,64,43,67.2%,27,42.2%,62.8%,...,27,42.2%,62.8%,16,25%,37.200000000000003%,9,14.1%,9,14.1%
3,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2006,78,43,55.1%,36,46.2%,83.7%,...,36,46.2%,83.7%,7,9%,16.3%,16,20.5%,11,14.1%
4,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2006 Aug,78,44,56.4%,37,47.4%,84.1%,...,37,47.4%,84.1%,7,9%,15.9%,15,19.2%,11,14.1%


In [207]:
data['graduation'].Demographic.value_counts()

Total Cohort                   2493
English Proficient Students    2471
General Education Students     2471
Special Education Students     2471
Male                           2412
Black                          2403
Female                         2397
Hispanic                       2385
English Language Learners      2036
Asian                          1780
White                          1777
Name: Demographic, dtype: int64

The post I am following filtered out levels other than 'Total Cohort', but they may offer some new info so I decide to keep them.

In [231]:
grad = data['graduation']

In [221]:
grad.columns

Index(['Demographic', 'DBN', 'School Name', 'Cohort', 'Total Cohort',
       'Total Grads - n', 'Total Grads - % of cohort', 'Total Regents - n',
       'Total Regents - % of cohort', 'Total Regents - % of grads',
       'Advanced Regents - n', 'Advanced Regents - % of cohort',
       'Advanced Regents - % of grads', 'Regents w/o Advanced - n',
       'Regents w/o Advanced - % of cohort',
       'Regents w/o Advanced - % of grads', 'Local - n', 'Local - % of cohort',
       'Local - % of grads', 'Still Enrolled - n',
       'Still Enrolled - % of cohort', 'Dropped Out - n',
       'Dropped Out - % of cohort'],
      dtype='object')

In [233]:
grad.head()

Unnamed: 0,Demographic,DBN,School Name,Cohort,Total Cohort,Total Grads - n,Total Grads - % of cohort,Total Regents - n,Total Regents - % of cohort,Total Regents - % of grads,...,Regents w/o Advanced - n,Regents w/o Advanced - % of cohort,Regents w/o Advanced - % of grads,Local - n,Local - % of cohort,Local - % of grads,Still Enrolled - n,Still Enrolled - % of cohort,Dropped Out - n,Dropped Out - % of cohort
0,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2003,5,s,,,,,...,,,,,,,,,,
1,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2004,55,37,67.3,17.0,30.9,45.9,...,17.0,30.9,45.9,20.0,36.4,54.1,15.0,27.3,3.0,5.5
2,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2005,64,43,67.2,27.0,42.2,62.8,...,27.0,42.2,62.8,16.0,25.0,37.2,9.0,14.1,9.0,14.1
3,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2006,78,43,55.1,36.0,46.2,83.7,...,36.0,46.2,83.7,7.0,9.0,16.3,16.0,20.5,11.0,14.1
4,Total Cohort,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,2006 Aug,78,44,56.4,37.0,47.4,84.1,...,37.0,47.4,84.1,7.0,9.0,15.9,15.0,19.2,11.0,14.1


In [232]:
grad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25096 entries, 0 to 25095
Data columns (total 23 columns):
Demographic                           25096 non-null object
DBN                                   25096 non-null object
School Name                           25096 non-null object
Cohort                                25096 non-null object
Total Cohort                          25096 non-null int64
Total Grads - n                       25096 non-null object
Total Grads - % of cohort             16704 non-null float64
Total Regents - n                     16704 non-null float64
Total Regents - % of cohort           16704 non-null float64
Total Regents - % of grads            16652 non-null float64
Advanced Regents - n                  16704 non-null float64
Advanced Regents - % of cohort        16704 non-null float64
Advanced Regents - % of grads         16652 non-null float64
Regents w/o Advanced - n              16704 non-null float64
Regents w/o Advanced - % of cohort    16704 

In [234]:
to_convert = ['Total Grads - n', 'Total Grads - % of cohort', 'Total Regents - n',
       'Total Regents - % of cohort', 'Total Regents - % of grads',
       'Advanced Regents - n', 'Advanced Regents - % of cohort',
       'Advanced Regents - % of grads', 'Regents w/o Advanced - n',
       'Regents w/o Advanced - % of cohort',
       'Regents w/o Advanced - % of grads', 'Local - n', 'Local - % of cohort',
       'Local - % of grads', 'Still Enrolled - n',
       'Still Enrolled - % of cohort', 'Dropped Out - n',
       'Dropped Out - % of cohort']

for col in to_convert:
    grad[col] = pd.to_numeric(grad[col],errors='coerce')

In [235]:
grad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25096 entries, 0 to 25095
Data columns (total 23 columns):
Demographic                           25096 non-null object
DBN                                   25096 non-null object
School Name                           25096 non-null object
Cohort                                25096 non-null object
Total Cohort                          25096 non-null int64
Total Grads - n                       16704 non-null float64
Total Grads - % of cohort             16704 non-null float64
Total Regents - n                     16704 non-null float64
Total Regents - % of cohort           16704 non-null float64
Total Regents - % of grads            16652 non-null float64
Advanced Regents - n                  16704 non-null float64
Advanced Regents - % of cohort        16704 non-null float64
Advanced Regents - % of grads         16652 non-null float64
Regents w/o Advanced - n              16704 non-null float64
Regents w/o Advanced - % of cohort    16704

In [236]:
grad = grad.groupby(['DBN', 'School Name']).mean().reset_index()
grad.head()

Unnamed: 0,DBN,School Name,Total Cohort,Total Grads - n,Total Grads - % of cohort,Total Regents - n,Total Regents - % of cohort,Total Regents - % of grads,Advanced Regents - n,Advanced Regents - % of cohort,...,Regents w/o Advanced - n,Regents w/o Advanced - % of cohort,Regents w/o Advanced - % of grads,Local - n,Local - % of cohort,Local - % of grads,Still Enrolled - n,Still Enrolled - % of cohort,Dropped Out - n,Dropped Out - % of cohort
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL,25.87037,27.62963,61.755556,19.814815,42.737037,70.448148,0.0,0.0,...,19.814815,42.737037,70.448148,7.814815,19.0,29.551852,9.259259,20.337037,5.703704,12.1
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,44.051948,35.320755,58.637736,21.886792,36.35283,61.415094,4.622642,8.462264,...,17.264151,27.884906,47.928302,13.433962,22.281132,38.588679,17.226415,28.264151,6.509434,9.958491
2,01M450,EAST SIDE COMMUNITY SCHOOL,36.565789,37.729167,70.4625,35.5625,66.075,93.714583,0.0,0.0,...,35.5625,66.075,93.714583,2.166667,4.383333,6.285417,9.229167,18.210417,5.229167,9.985417
3,01M509,MARTA VALLE HIGH SCHOOL,34.797297,27.380952,49.902381,18.285714,31.861905,61.852381,6.309524,10.540476,...,11.97619,21.321429,42.711905,9.095238,18.028571,38.15,15.952381,30.502381,7.333333,14.266667
4,01M515,LOWER EAST SIDE PREPARATORY HIGH SCHO,87.876712,66.106383,49.151064,49.638298,36.508511,74.361702,30.829787,21.876596,...,18.808511,14.631915,30.225532,16.574468,12.725532,25.787234,44.659574,34.219149,22.297872,16.370213


In [238]:
data['graduation'] = grad

In [240]:
# clean ap_2010
data['ap_2010'].DBN.value_counts().head()

04M610    2
02M620    1
02M489    1
03M470    1
07X547    1
Name: DBN, dtype: int64

In [241]:
data['ap_2010'].loc[data['ap_2010'].DBN=='04M610', :]

Unnamed: 0,DBN,SchoolName,AP Test Takers,Total Exams Taken,Number of Exams with scores 3 4 or 5
51,04M610,THE YOUNG WOMEN'S LEADERSHIP SCHOOL OF EAST HA...,41,55,29
52,04M610,YOUNG WOMEN'S LEADERSHIP SCH,s,s,s


In [242]:
data['ap_2010'].head()

Unnamed: 0,DBN,SchoolName,AP Test Takers,Total Exams Taken,Number of Exams with scores 3 4 or 5
0,01M448,UNIVERSITY NEIGHBORHOOD H.S.,39,49,10
1,01M450,EAST SIDE COMMUNITY HS,19,21,s
2,01M515,LOWER EASTSIDE PREP,24,26,24
3,01M539,"NEW EXPLORATIONS SCI,TECH,MATH",255,377,191
4,02M296,High School of Hospitality Management,s,s,s


The cell with 's' in it must indicating missing value.

In [243]:
ap = data['ap_2010']
ap.columns = ap.columns.str.strip()

In [244]:
ap.columns

Index(['DBN', 'SchoolName', 'AP Test Takers', 'Total Exams Taken',
       'Number of Exams with scores 3 4 or 5'],
      dtype='object')

In [245]:
mask = (ap['AP Test Takers'] == 's') & (ap['Total Exams Taken']=='s') & (ap['Number of Exams with scores 3 4 or 5']=='s')

In [246]:
ap = ap.loc[~mask, :]

In [247]:
print(ap['DBN'].nunique(), len(ap.index))

233 233


In [248]:
data['ap_2010'] = ap

In [271]:
# clean math_test_results
math = data['math_test_results']
math.head()

Unnamed: 0,DBN,Grade,Year,Category,Number Tested,Mean Scale Score,Level 1 #,Level 1 %,Level 2 #,Level 2 %,Level 3 #,Level 3 %,Level 4 #,Level 4 %,Level 3+4 #,Level 3+4 %
0,01M015,3,2006,All Students,39,667,2,5.1%,11,28.2%,20,51.3%,6,15.4%,26,66.7%
1,01M015,3,2007,All Students,31,672,2,6.5%,3,9.7%,22,71%,4,12.9%,26,83.9%
2,01M015,3,2008,All Students,37,668,0,0%,6,16.2%,29,78.4%,2,5.4%,31,83.8%
3,01M015,3,2009,All Students,33,668,0,0%,4,12.1%,28,84.8%,1,3%,29,87.9%
4,01M015,3,2010,All Students,26,677,6,23.1%,12,46.2%,6,23.1%,2,7.7%,8,30.8%


In [250]:
math.Grade.value_counts()

All Grades    6479
3             4466
5             4452
4             4444
6             3186
7             2806
8             2645
Name: Grade, dtype: int64

In [272]:
math = math.loc[math.Year==2011, :]

In [273]:
len(math.DBN) - math.DBN.nunique()

3818

In [274]:
math.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4950 entries, 5 to 28477
Data columns (total 16 columns):
DBN                 4950 non-null object
Grade               4950 non-null object
Year                4950 non-null int64
Category            4950 non-null object
Number Tested       4950 non-null int64
Mean Scale Score    4950 non-null object
Level 1 #           4950 non-null object
Level 1 %           4950 non-null object
Level 2 #           4950 non-null object
Level 2 %           4950 non-null object
Level 3 #           4950 non-null object
Level 3 %           4950 non-null object
Level 4 #           4950 non-null object
Level 4 %           4950 non-null object
Level 3+4 #         4950 non-null object
Level 3+4 %         4950 non-null object
dtypes: int64(2), object(14)
memory usage: 657.4+ KB


In [275]:
math.columns

Index(['DBN', 'Grade', 'Year', 'Category', 'Number Tested', 'Mean Scale Score',
       'Level 1 #', 'Level 1 %', 'Level 2 #', 'Level 2 %', 'Level 3 #',
       'Level 3 %', 'Level 4 #', 'Level 4 %', 'Level 3+4 #', 'Level 3+4 %'],
      dtype='object')

In [276]:
['Level 1 #', 'Level 1 %', 'Level 2 #', 'Level 2 %', 
 'Level 3 #', 'Level 3 %', 'Level 4 #', 'Level 4 %', 
 'Level 3+4 #', 'Level 3+4 %'][1::2]


['Level 1 %', 'Level 2 %', 'Level 3 %', 'Level 4 %', 'Level 3+4 %']

In [277]:
# trim off redundant cols
math = math[['DBN', 'Grade', 'Category', 'Number Tested', 'Mean Scale Score',
            'Level 1 %', 'Level 2 %', 'Level 3 %', 'Level 4 %', 'Level 3+4 %']]  


In [278]:
math.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4950 entries, 5 to 28477
Data columns (total 10 columns):
DBN                 4950 non-null object
Grade               4950 non-null object
Category            4950 non-null object
Number Tested       4950 non-null int64
Mean Scale Score    4950 non-null object
Level 1 %           4950 non-null object
Level 2 %           4950 non-null object
Level 3 %           4950 non-null object
Level 4 %           4950 non-null object
Level 3+4 %         4950 non-null object
dtypes: int64(1), object(9)
memory usage: 425.4+ KB


In [280]:
for col in ['Mean Scale Score', 'Level 1 %', 'Level 2 %', 'Level 3 %', 
            'Level 4 %', 'Level 3+4 %']:
    math[col] = math[col].str.replace('%', '')
    math[col] = pd.to_numeric(math[col], errors='coerce')

In [281]:
math.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4950 entries, 5 to 28477
Data columns (total 10 columns):
DBN                 4950 non-null object
Grade               4950 non-null object
Category            4950 non-null object
Number Tested       4950 non-null int64
Mean Scale Score    4882 non-null float64
Level 1 %           4882 non-null float64
Level 2 %           4882 non-null float64
Level 3 %           4882 non-null float64
Level 4 %           4882 non-null float64
Level 3+4 %         4882 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 425.4+ KB


In [282]:
math.head()

Unnamed: 0,DBN,Grade,Category,Number Tested,Mean Scale Score,Level 1 %,Level 2 %,Level 3 %,Level 4 %,Level 3+4 %
5,01M015,3,All Students,28,671.0,35.7,46.4,17.9,0.0,17.9
11,01M015,4,All Students,28,668.0,10.7,50.0,32.1,7.1,39.3
17,01M015,5,All Students,25,667.0,20.0,32.0,48.0,0.0,48.0
24,01M015,All Grades,All Students,81,669.0,22.2,43.2,32.1,2.5,34.6
30,01M019,3,All Students,34,679.0,5.9,67.6,20.6,5.9,26.5


In [286]:
math = math.groupby('DBN').mean().reset_index()

In [288]:
len(math.DBN) - math.DBN.nunique()

0

In [289]:
data['math_test_results'] = math

## computing variables

In [339]:
data['sat_results'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
DBN                                478 non-null object
SCHOOL NAME                        478 non-null object
Num of SAT Test Takers             478 non-null object
SAT Critical Reading Avg. Score    478 non-null object
SAT Math Avg. Score                478 non-null object
SAT Writing Avg. Score             478 non-null object
dtypes: object(6)
memory usage: 22.5+ KB


In [338]:
# compute a total sat score
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c])

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]

ValueError: Unable to parse string

In [94]:
# parse location info of school
data["hs_directory"]['Location 1'].head()

0    883 Classon Avenue\nBrooklyn, NY 11225\n(40.67...
1    1110 Boston Road\nBronx, NY 10456\n(40.8276026...
2    1501 Jerome Avenue\nBronx, NY 10452\n(40.84241...
3    411 Pearl Street\nNew York, NY 10038\n(40.7106...
4    160-20 Goethals Avenue\nJamaica, NY 11432\n(40...
Name: Location 1, dtype: object

In [99]:
data["hs_directory"]['Location 1'][0]

'883 Classon Avenue\nBrooklyn, NY 11225\n(40.67029890700047, -73.96164787599963)'

In [101]:
for i in range(0, 2):
    print(data["hs_directory"]['Location 1'][i])

883 Classon Avenue
Brooklyn, NY 11225
(40.67029890700047, -73.96164787599963)
1110 Boston Road
Bronx, NY 10456
(40.8276026690005, -73.90447525699966)


In [103]:

data["hs_directory"]['lat'] = data["hs_directory"]['Location 1'].apply(lambda x: x.split("\n")[-1].replace("(", "").replace(")", "").split(", ")[0])
data["hs_directory"]['lon'] = data["hs_directory"]['Location 1'].apply(lambda x: x.split("\n")[-1].replace("(", "").replace(")", "").split(", ")[1])

for c in ['lat', 'lon']:
    data["hs_directory"][c] = pd.to_numeric(data["hs_directory"][c])

In [108]:
for key, val in data.items():
    print('--'* 30 + '\n' + key + ':\n')
    print(val.head(2))

------------------------------------------------------------
demographics:

       DBN                                              Name  schoolyear  \
6   01M015  P.S. 015 ROBERTO CLEMENTE                           20112012   
13  01M019  P.S. 019 ASHER LEVY                                 20112012   

   fl_percent  frl_percent  total_enrollment prek   k grade1 grade2  \
6         NaN         89.4               189   13  31     35     28   
13        NaN         61.5               328   32  46     52     54   

      ...     black_num black_per hispanic_num hispanic_per white_num  \
6     ...            63      33.3          109         57.7         4   
13    ...            81      24.7          158         48.2        28   

   white_per male_num male_per female_num female_per  
6        2.1     97.0     51.3       92.0       48.7  
13       8.5    147.0     44.8      181.0       55.2  

[2 rows x 38 columns]
------------------------------------------------------------
sat_results:

## combining the dataset

In [290]:
for key in data.keys():
    print(key)
    print(len(data[key]['DBN']) - len(data[key]['DBN'].unique()))

demographics
0
sat_results
0
survey
0
graduation
0
ap_2010
0
class_size
0
hs_directory
0
math_test_results
0


Keep all the records from sat_results, graduation, ap_2010, math_test_results. As they are considered the variables that to be explained.

In [293]:
data.keys()

dict_keys(['demographics', 'sat_results', 'survey', 'graduation', 'ap_2010', 'class_size', 'hs_directory', 'math_test_results'])

In [311]:
outer = ['graduation', 'ap_2010','math_test_results']
full = data['sat_results']

In [312]:
for key in data.keys():
    if df in outer:
        full = pd.merge(full, data[key], on='DBN', how='outer')
    else:
        full = pd.merge(full, data[key], on='DBN', how='left')


In [313]:
full.shape

(478, 149)

In [315]:
full.DBN.nunique()

478

In [318]:
full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 478 entries, 0 to 477
Columns: 149 entries, DBN to Level 3+4 %
dtypes: float64(79), object(70)
memory usage: 560.2+ KB


## adding in values


In [321]:
cols = ['AP Test Takers', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    print(full[col].dtype)

object
object
object


后记，数据格式转换这一步其实应该在condensing datasets时顺手做掉，  
因为到现在已经合并了df了，full有太多栏，到时候报错比较难发现到底哪里出错。

In [322]:
for col in cols:
    full[col] = pd.to_numeric(full[col], errors='coerce')

full[cols] = full[cols].fillna(value=0)

In [324]:
full["school_dist"] = full["DBN"].apply(lambda x: x[:2])

In [326]:
full.isnull().any()

DBN                                  False
SCHOOL NAME_x                        False
Num of SAT Test Takers_x             False
SAT Critical Reading Avg. Score_x    False
SAT Math Avg. Score_x                False
SAT Writing Avg. Score_x             False
Name                                  True
schoolyear                            True
frl_percent                           True
total_enrollment                      True
ell_num                               True
ell_percent                           True
sped_num                              True
sped_percent                          True
asian_num                             True
asian_per                             True
black_num                             True
black_per                             True
hispanic_num                          True
hispanic_per                          True
white_num                             True
white_per                             True
male_num                              True
male_per   

In [327]:
full = full.fillna(full.mean())

## computing correlations

In [333]:
full.corr().shape

(82, 82)

In [337]:
full.columns

Index(['DBN', 'SCHOOL NAME_x', 'Num of SAT Test Takers_x',
       'SAT Critical Reading Avg. Score_x', 'SAT Math Avg. Score_x',
       'SAT Writing Avg. Score_x', 'Name', 'schoolyear', 'frl_percent',
       'total_enrollment',
       ...
       'priority10', 'Location 1', 'Number Tested', 'Mean Scale Score',
       'Level 1 %', 'Level 2 %', 'Level 3 %', 'Level 4 %', 'Level 3+4 %',
       'school_dist'],
      dtype='object', length=150)

这时才发现full里并没有'sat_score'一栏，哎，作者在condensing df时真的对每一个表做了探索和清洗了，  
只是在博客文章里没有表现出来。哎。。重新来过。