In [9]:
import pandas as pd

### Extract methodology fields for each year

In [10]:
mf = pd.read_excel('/Users/cave/Desktop/discriminology/methodology_fields.xlsx')

In [291]:
fields_1112 = mf['Field_Name 2011/12'].str.replace('\n', ',').str.split(',').apply(pd.Series).stack().reset_index(drop=True).values
fields_1314 = mf['Field_Name 2013/14'].str.replace('\n', ',').str.split(',').apply(pd.Series).stack().reset_index(drop=True).values
fields_1516 = mf['Field_Name 2015/16'].str.replace('\n', ',').str.split(',').apply(pd.Series).stack().reset_index(drop=True).values


### Create helper functions to aggregate dataframes and label coded columns

In [357]:
def aggregate_dataframes(frames, desired_fields):
    '''
    INPUTS: array of dataframes for a given year, list of desired fields from
    methodology doc
    
    OUTPUT: aggregated dataframe indexed by COMBOKEY
    
    Pulls desired columns out of each spreadsheet, joins by COMBOKEY,
    replaces negative values with 0.
    
    '''
    output = None

    for df in frames:
        # subset dataframe to desired columns
        temp = df[list(set(df.columns[df.columns.isin(desired_fields)]))]
        num = temp._get_numeric_data()
        num[num < 0] = 0 # replace negative numbers with 0
        temp.set_index('COMBOKEY', inplace=True)

        if output is None:
            output = temp 
        else:
            output = pd.merge(output, temp, left_index=True, right_index=True, how='outer')

    output = output.T.drop_duplicates().T
    return output

In [None]:
for df in frames:
    # subset dataframe to desired columns
    temp = df[list(set(df.columns[df.columns.isin(desired_fields)]))]
    num = temp._get_numeric_data()
    num[num < 0] = 0 # replace negative numbers with 0
    temp.set_index('COMBOKEY', inplace=True)

    if output is None:
        output = temp 
    else:
        output = pd.concat([output, temp], axis=1)
        print(output.columns.value_counts())


In [352]:
len(fields_1112)

222

In [351]:
len(df_1112.columns)

226

In [350]:
df_1112.index.nunique()

101133

In [353]:
df_1112.loc['999909600001'][['SCH_NAME_x', 'SCH_NAME_y']]


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


SCH_NAME_x    New Millennium Institute of Education Charter
SCH_NAME_x                                              NaN
SCH_NAME_y                                              NaN
Name: 999909600001, dtype: object

In [356]:
df_1112 = aggregate_dataframes(frames_1112, fields_1112)

1
1
2
2
3
3
4
4
5
5
6
6
7
7
8


In [343]:
df_1112 = aggregate_dataframes(frames_1112, fields_1112)
df_1112.loc['999909600001'][['SCH_NAME_x', 'SCH_NAME_y']]
df_1112.columns.value_counts()

LEA_STATE_x                        2
LEAID_x                            2
LEA_NAME_x                         2
SCHID_x                            2
JJ_x                               2
SCH_NAME_x                         2
M_WHI_7_GT                         1
M_2_OR_MORE_7_LAW_DIS              1
M_AME_7_ENROL                      1
M_HIS_7_ARREST_NO_DIS              1
F_WHI_7_ONE_AP                     1
M_BLA_7_ARREST_NO_DIS              1
F_WHI_7_SINGLE_SUS_NO_DIS          1
F_ASI_7_LAW_NO_DIS                 1
F_AME_7_ARREST_DIS                 1
M_AME_7_LAW_NO_DIS                 1
F_ASI_7_LAW_DIS                    1
M_ASI_7_SINGLE_SUS_DIS             1
M_HIS_7_IDEA                       1
F_HIS_7_SINGLE_SUS_NO_DIS          1
M_ASI_7_MULT_SUS_DIS               1
F_HI_PAC_7_SINGLE_SUS_DIS          1
F_AME_7_IDEA                       1
M_BLA_7_LAW_NO_DIS                 1
F_BLA_7_LAW_DIS                    1
M_BLA_7_SINGLE_SUS_NO_DIS          1
M_BLA_7_LAW_DIS                    1
F

In [None]:
df_1112.columns.value_counts()

In [330]:
df15_1112[df15_1112['COMBOKEY']=='999909600001']['SCH_NAME']

Series([], Name: SCH_NAME, dtype: object)

In [97]:
def join_col_descriptions(frames, agg_data, fields, year):
    '''
    INPUTS
    
    frames (list of DataFrames): Array of dataframes with field codes and descriptions
    agg_data (DataFrame): Aggregated data with coded fields as columns
    year (STR): year range of data e.g. '2015-16'
    
    '''
    descriptions = pd.concat(frames, axis=0)
    descriptions.drop_duplicates(inplace=True)
    descriptions.set_index('Field Name', inplace=True)
    descriptions = descriptions.loc[fields]
    descriptions.columns = ['description']
    final = pd.merge(agg_data.T, descriptions, left_index=True, right_index=True, how='left').set_index('description', append=True).T
    final['YEAR','School Year'] = year
    final.to_csv(f'~/Desktop/discriminology/output/final_data_{year}.csv')
    return final
    

### Load all data + col description files for 2011-12

In [4]:
df1_1112, df1_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/05 - Overall Enrollment.xlsx', sheet_name=None).values()
df2_1112, df2_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/08 - Students enrolled in Gifted-Talented Programs.xlsx', sheet_name=None).values()
df3_1112, df3_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/10-1 - Students with Disabilities Served under IDEA Enrollment.xlsx', sheet_name=None).values()
df4_1112, df4_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/10-2 - Students with Disabilities Served under 504 Enrollment.xlsx', sheet_name=None).values()
df5_1112, df5_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/W:O Disabilities/35-3 - Students WO Disab Receiving only one out-of-school suspension.xlsx', sheet_name=None).values()
df6_1112, df6_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/W:O Disabilities/35-4 - Students WO Disab Rec more than one out-of-school suspension.xlsx', sheet_name=None).values()
df7_1112, df7_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/With Disabilities/36-3 - Students With Disabilities Receiving only one out-of-school suspension.xlsx', sheet_name=None).values()
df8_1112, df8_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/With Disabilities/36-4 - Students With Disab Receiving more than one out-of-school suspension.xlsx', sheet_name=None).values()
df9_1112, df9_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Academic/Advanced Placement/17 - Students who are taking at least one AP course.xlsx', sheet_name=None).values()
df10_1112, df10_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Referral to law enforcement/W:O Disabilities/35-8 - Students Without Disabilities Referral to law enforcement.xlsx', sheet_name=None).values()
df11_1112, df11_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Referral to law enforcement/With Disabilities/36-8 - Students With Disabilities Referral to law enforcement.xlsx', sheet_name=None).values()
df12_1112, df12_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/School Related Arrest/W:O Disabilities/35-9 - Students Without Disabilities School-related arrest.xlsx', sheet_name=None).values()
df13_1112, df13_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/School Related Arrest/With Disabilities/36-9 - Students With Disabilities School-related arrest.xlsx', sheet_name=None).values()
df14_1112, df14_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/School Characteristics/02 - School Characteristics.xlsx', sheet_name=None).values()
df15_1112, df15_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Staff/08-1 School Support and Security Staff (required elements).xlsx', sheet_name=None).values()
df16_1112, df16_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/06 - Enrolled in Early Childhood and Prekindergarten.xlsx', sheet_name=None).values()

Collect data and description frames into arrays:

In [67]:

frames_1112 = [df1_1112, df2_1112, df3_1112, df4_1112, df5_1112, df6_1112,
               df7_1112, df8_1112, df9_1112, df10_1112, df11_1112, df12_1112,
               df13_1112, df14_1112, df15_1112, df16_1112]



desc_frames_1112 = [df1_desc_1112, df2_desc_1112, df3_desc_1112, df4_desc_1112,
                    df5_desc_1112, df6_desc_1112, df7_desc_1112, df8_desc_1112,
                    df9_desc_1112, df10_desc_1112, df11_desc_1112, df12_desc_1112,
                    df13_desc_1112, df14_desc_1112, df15_desc_1112, df16_desc_1112]


In [71]:
# descriptions = pd.concat(desc_frames_1112, axis=0)
# descriptions.drop_duplicates(inplace=True)
# descriptions.set_index('Field Name', inplace=True)
# descriptions = descriptions.loc[fields_1112]
# descriptions.columns = ['2011_12_description']
# descriptions.reset_index().to_csv('/Users/cave/Desktop/2011_12_field_descriptions.csv')

# descriptions = pd.concat(desc_frames_1314, axis=0)
# descriptions.drop_duplicates(inplace=True)
# descriptions.set_index('Field Name', inplace=True)
# descriptions = descriptions.loc[fields_1314]
# descriptions.columns = ['2013_14_description']
# descriptions.reset_index().to_csv('/Users/cave/Desktop/2013_14_field_descriptions.csv')

Use helper function to aggregate frames and clean up negative values.

In [306]:
df_1112 = aggregate_dataframes(frames_1112, fields_1112)
df_1112.loc['999909600001']
df_1112_final = join_col_descriptions(desc_frames_1112, df_1112, fields_1112, '2011-12')

In [315]:
df_1112.loc['999909600001']['SCH_NAME']

'New Millennium Institute of Education Charter'

In [286]:
df_1112['SCH_NAME'].isna()

Unnamed: 0,SCH_NAME,SCH_NAME.1
010000201705,False,False
010000201706,False,False
010000299995,False,False
010000299996,False,True
010000500870,False,False
010000500871,False,False
010000500879,False,False
010000500889,False,False
010000501616,False,False
010000502150,False,False


In [233]:
df_1112_final.SCH_NAME

description,School Name,School Name.1
010000201705,WALLACE SCH - MT MEIGS CAMPUS,WALLACE SCH - MT MEIGS CAMPUS
010000201706,MCNEEL SCH - VACCA CAMPUS,MCNEEL SCH - VACCA CAMPUS
010000299995,AUTAUGA CAMPUS,AUTAUGA CAMPUS
010000299996,WALLACE ANNEX III,
010000500870,ALBERTVILLE MIDDLE SCHOOL,ALA AVENUE MIDDLE SCH
010000500871,ALBERTVILLE HIGH SCH,ALBERTVILLE HIGH SCH
010000500879,EVANS ELEM SCH,EVANS ELEM SCH
010000500889,ALBERTVILLE ELEM SCH,ALBERTVILLE ELEM SCH
010000501616,BIG SPRING LAKE KINDERG SCH,BIG SPRING LAKE KINDERG SCH
010000502150,ALBERTVILLE PRIMARY SCH,ALBERTVILLE PRIMARY SCH


In [128]:
df_1112.head()

Unnamed: 0,LEA_STATE,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,LEA_STATE.1,LEAID.1,LEA_NAME.1,SCHID.1,...,M_2_OR_MORE_7_ENROL,M_TOT_7_ENROL,F_AME_7_ENROL,F_ASI_7_ENROL,F_HIS_7_ENROL,F_BLA_7_ENROL,F_WHI_7_ENROL,F_HI_PAC_7_ENROL,F_2_OR_MORE_7_ENROL,F_TOT_7_ENROL
10000201705,AL,100002,ALABAMA YOUTH SERVICES,1705,WALLACE SCH - MT MEIGS CAMPUS,Z,AL,100002.0,ALABAMA YOUTH SERVICES,1705.0,...,5,382,0,0,0,0,0,0,0,0
10000201706,AL,100002,ALABAMA YOUTH SERVICES,1706,MCNEEL SCH - VACCA CAMPUS,Z,AL,100002.0,ALABAMA YOUTH SERVICES,1706.0,...,2,185,0,0,0,0,0,0,0,0
10000299995,AL,100002,ALABAMA YOUTH SERVICES,99995,AUTAUGA CAMPUS,X,AL,100002.0,ALA YOUTH SER,99995.0,...,14,470,0,0,0,0,0,0,0,0
10000299996,AL,100002,ALABAMA YOUTH SERVICES,99996,WALLACE ANNEX III,Z,,,,,...,0,61,0,0,0,0,0,0,0,0
10000500870,AL,100005,ALBERTVILLE CITY,870,ALBERTVILLE MIDDLE SCHOOL,,AL,100005.0,ALBERTVILLE CITY,870.0,...,2,301,2,0,80,5,221,0,2,310


In [151]:
df_1112_final.shape

(101133, 230)

In [136]:
df_1112.combine_first?

In [229]:
df1_1112.head()

Unnamed: 0,LEA_STATE,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,M_AME_7_ENROL,M_ASI_7_ENROL,M_HIS_7_ENROL,...,F_HIS_7_ENROL,F_BLA_7_ENROL,F_WHI_7_ENROL,F_HI_PAC_7_ENROL,F_2_OR_MORE_7_ENROL,F_TOT_7_ENROL,F_DIS_IDEA_7_ENROL,F_DIS_504_7_ENROL,F_LEP_7_ENROL,Incomplete
0,AL,100002,ALABAMA YOUTH SERVICES,1705,WALLACE SCH - MT MEIGS CAMPUS,10000201705,Z,2,0,8,...,0,0,0,0,0,0,<=2,0,0,
1,AL,100002,ALABAMA YOUTH SERVICES,1706,MCNEEL SCH - VACCA CAMPUS,10000201706,Z,0,0,2,...,0,0,0,0,0,0,<=2,0,0,
2,AL,100002,ALABAMA YOUTH SERVICES,99995,AUTAUGA CAMPUS,10000299995,X,0,0,2,...,0,0,0,0,0,0,<=2,0,0,
3,AL,100002,ALABAMA YOUTH SERVICES,99996,WALLACE ANNEX III,10000299996,Z,0,0,0,...,0,0,0,0,0,0,<=2,0,0,
4,AL,100005,ALBERTVILLE CITY,870,ALBERTVILLE MIDDLE SCHOOL,10000500870,,0,2,98,...,80,5,221,0,2,310,16,2,23,


In [231]:
df14_1112.JJ

0          Z
1          Z
2          X
3          Z
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
        ... 
95605    NaN
95606    NaN
95607    NaN
95608    NaN
95609    NaN
95610    NaN
95611    NaN
95612    NaN
95613    NaN
95614    NaN
95615    NaN
95616    NaN
95617    NaN
95618    NaN
95619      X
95620    NaN
95621    NaN
95622    NaN
95623    NaN
95624    NaN
95625    NaN
95626    NaN
95627    NaN
95628    NaN
95629    NaN
95630    NaN
95631    NaN
95632    NaN
95633    NaN
95634    NaN
Name: JJ, Length: 95635, dtype: object

In [226]:
df_1112[['LEA_STATE', 'LEA_NAME', 'JJ', 'SCHID', 'LEAID', 'SCH_NAME']].columns = ['LEA_STATE_x', 'LEA_STATE_y', 'LEA_NAME_x', 'LEA_NAME_y', 'JJ_x', 'JJ_y', 'SCHID_x',
       'SCHID_y', 'LEAID_x', 'LEAID_y', 'SCH_NAME_x', 'SCH_NAME_y']

In [228]:
df_1112['LEA_STATE'] = df_1112['LEA_STATE_x'].combine_first(df_1112['LEA_STATE_y'])

KeyError: 'LEA_STATE_x'

In [133]:
df_1112.combine_first?

In [None]:
df_1112['LEA_STATE'] = df_1112.combine_first

In [125]:
df_1112.T.index.value_counts()

LEA_STATE                       2
LEA_NAME                        2
JJ                              2
SCHID                           2
LEAID                           2
SCH_NAME                        2
F_2_OR_MORE_7_SINGLE_SUS_DIS    1
F_AME_7_MULT_SUS_DIS            1
F_WHI_7_ONE_AP                  1
M_HIS_7_SINGLE_SUS_NO_DIS       1
F_WHI_7_SINGLE_SUS_NO_DIS       1
M_HIS_7_IDEA                    1
M_AME_7_LAW_NO_DIS              1
M_HIS_7_ONE_AP                  1
F_ASI_7_LAW_DIS                 1
M_HI_PAC_7_SINGLE_SUS_DIS       1
F_AME_7_ENROL                   1
F_ASI_7_LAW_NO_DIS              1
M_TOT_7_MULT_SUS_NO_DIS         1
M_ASI_7_MULT_SUS_DIS            1
M_HI_PAC_7_LAW_NO_DIS           1
F_BLA_7_LAW_DIS                 1
M_BLA_7_SINGLE_SUS_NO_DIS       1
F_ASI_7_ARREST_DIS              1
M_HIS_7_ENROL                   1
M_TOT_7_ONE_AP                  1
F_HIS_7_SINGLE_SUS_NO_DIS       1
M_AME_7_ONE_AP                  1
M_BLA_7_LAW_DIS                 1
M_HIS_7_ARREST

In [121]:
df_1112_final['JJ'].head()

description,"Juvenile Justice Facility - Z: Long term secure facility, X: Other JJ facility","Juvenile Justice Facility: ""Yes"" indicates a long-term secure facility; ""No"" indicates not a JJ facility","Juvenile Justice Facility - Z: Long term secure facility, X: Other JJ facility.1","Juvenile Justice Facility: ""Yes"" indicates a long-term secure facility; ""No"" indicates not a JJ facility.1"
10000201705,Z,Z,Yes,Yes
10000201706,Z,Z,Yes,Yes
10000299995,X,X,Yes,Yes
10000299996,Z,Z,,
10000500870,,,No,No


In [None]:
df_1112.haed

In [50]:
missing_fields = pd.Series(fields_1112)[~pd.Series(fields_1112).isin(df_1112.columns)].values
missing_fields

### Load all data + col description files for 2013-14

Collect column name descriptions from each spreadsheet

In [52]:
df1_1314, df1_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/01 School Characteristics.xlsx', sheet_name=None).values()
df2_1314, df2_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/03 Enrollment.xlsx', sheet_name=None).values()
df3_1314, df3_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/04-1 Gifted and Talented Enrollment.xlsx', sheet_name=None).values()
df4_1314, df4_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/06 Advanced Placement and International Baccalaureate Diploma Programme Enrollment.xlsx', sheet_name=None).values()
df5_1314, df5_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/07-2 Advanced Placement Exams.xlsx', sheet_name=None).values()
df6_1314, df6_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/08-1 School Support and Security Staff (required elements).xlsx', sheet_name=None).values()
df7_1314, df7_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/09-1 Chronic Absenteeism.xlsx', sheet_name=None).values()
df8_1314, df8_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/11-2 Suspensions (required elements).xlsx', sheet_name=None).values()
df9_1314, df9_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/11-3 Expulsions.xlsx', sheet_name=None).values()
df10_1314, df10_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/12 Student Referrals and Arrests.xlsx', sheet_name=None).values()
df11_1314, df11_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/16 School Expenditures.xlsx', sheet_name=None).values()
df12_1314, df12_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/17 Justice Facilities.xlsx', sheet_name=None).values()

In [75]:
frames_1314 = [df1_1314, df2_1314, df3_1314, df4_1314, df5_1314,
               df6_1314, df7_1314, df8_1314, df9_1314, df10_1314,
               df11_1314, df12_1314]

desc_frames_1314 = [df1_desc_1314, df2_desc_1314, df3_desc_1314,
                    df4_desc_1314, df5_desc_1314, df6_desc_1314,
                    df7_desc_1314, df8_desc_1314, df9_desc_1314,
                    df10_desc_1314, df11_desc_1314, df12_desc_1314]


In [7]:
df_1314 = aggregate_dataframes(frames_1314, fields_1314)
df_1314_final = join_col_descriptions(desc_frames_1314, df_1314, fields_1314, '2013-14')

In [18]:
df_1314_final.head()

222

In [None]:
missing_fields = pd.Series(fields_1314)[~pd.Series(fields_1314).isin(df_1314.columns)].values
missing_fields

In [None]:
# output = pd.read_csv('/Users/cave/Desktop/discriminology/output/2013_14_master_file.csv')

### Join 2014-15 cleaned file to column descriptions.

In [None]:
df_1314_final.to_csv('/Users/cave/Desktop/discriminology/output/2013_14_master_file.csv')

### Get column decriptions for the 2015-16 data

In [81]:
col_descr_1516 = pd.read_excel('/Users/cave/Desktop/discriminology/OCR School data sample/2015:16/CRDC 2015-16 School Data Record Layout copy.xlsx', index_col='Field_Name')
decoded_names = pd.DataFrame(col_descr_1516.loc[fields3]['Field_Description'])
decoded_names.columns = ['2015_16_description']

In [83]:
decoded_names.reset_index().to_csv('/Users/cave/Desktop/2015_16_field_descriptions.csv')

### Isolate numeric columns in 15/16 and replace negative numbers with zeroes.

In [213]:
df3_raw = pd.read_csv('/Users/cave/Desktop/discriminology/OCR School data sample/2015:16/CRDC 2015-16 School Data copy.csv'
                      , encoding='iso-8859-1'
                     )
df3_raw['COMBOKEY'] = df3_raw['LEAID'].astype(str) + df3_raw['SCHID'].astype(str).str.zfill(5)
df3_raw = df3_raw[list(set(fields3))]

  interactivity=interactivity, compiler=compiler, result=result)


In [216]:
df3_raw.COMBOKEY.nunique()

96360

In [218]:
df3_raw[['LEAID', 'SCHID', 'COMBOKEY']]

Unnamed: 0,LEAID,SCHID,COMBOKEY
0,100002,1705,10000201705
1,100002,1706,10000201706
2,100002,1876,10000201876
3,100002,99995,10000299995
4,100005,870,10000500870
5,100005,871,10000500871
6,100005,879,10000500879
7,100005,889,10000500889
8,100005,1616,10000501616
9,100005,2150,10000502150


In [198]:
df3_raw.COMBOKEY

0        10000201705
1        10000201706
2        10000201876
3        10000299995
4        10000500870
5        10000500871
6        10000500879
7        10000500889
8        10000501616
9        10000502150
10       10000600193
11       10000600872
12       10000600876
13       10000600877
14       10000600878
15       10000600880
16       10000600883
17       10000600887
18       10000601413
19       10000601434
20       10000601585
21       10000601685
22       10000601812
23       10000602209
24       10000700091
25       10000700248
26       10000700251
27       10000700337
28       10000700342
29       10000701422
            ...     
96330       5.61E+11
96331       5.61E+11
96332       5.61E+11
96333       5.61E+11
96334       5.61E+11
96335       5.61E+11
96336       5.61E+11
96337       5.61E+11
96338       5.61E+11
96339       5.61E+11
96340       5.61E+11
96341       5.61E+11
96342       5.61E+11
96343       5.61E+11
96344       5.61E+11
96345       5.61E+11
96346       5

### Replace negative values with zeroes.

In [157]:
num = df3_raw._get_numeric_data()
num[num < 0] = 0
df3 = df3_raw.set_index(['COMBOKEY'])

In [159]:
df3

Unnamed: 0_level_0,SCH_GTENR_LEP_M,TOT_ENR_M,SCH_DISCWDIS_SINGOOS_IDEA_HI_F,SCH_DISCWDIS_REF_IDEA_HP_F,TOT_APENR_M,SCH_DISCWDIS_MULTOOS_IDEA_BL_F,SCH_GT_IND,SCH_DISCWDIS_SINGOOS_IDEA_HI_M,SCH_DISCWDIS_MULTOOS_IDEA_TR_F,SCH_GTENR_IDEA_F,...,TOT_DISCWODIS_MULTOOS_M,TOT_DISCWODIS_SINGOOS_M,SCH_GTENR_AS_M,SCH_DISCWODIS_ARR_AM_M,SCH_DISCWDIS_REF_IDEA_TR_F,SCH_GTENR_TR_F,TOT_IDEAENR_F,SCH_DISCWDIS_REF_IDEA_AM_M,SCH_ENR_AM_F,TOT_DISCWODIS_ARR_M
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.000020e+10,0,128,0,0,0,0,No,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.000020e+10,0,52,0,0,0,0,No,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.000020e+10,0,908,0,0,0,0,No,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.000030e+10,0,38,0,0,0,0,No,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1.000050e+10,0,358,0,0,0,0,No,0,0,0,...,7,19,0,0,0,0,14,0,0,0
1.000050e+10,0,645,0,0,70,0,No,0,0,0,...,4,30,0,0,0,0,29,0,2,2
1.000050e+10,0,381,0,0,0,0,Yes,0,0,0,...,6,5,2,0,0,0,23,0,2,0
1.000050e+10,0,430,0,0,0,0,Yes,0,0,2,...,0,0,0,0,0,2,14,0,2,0
1.000050e+10,0,264,0,0,0,0,No,0,0,0,...,0,0,0,0,0,0,14,0,0,0
1.000050e+10,0,555,0,0,0,0,No,0,0,0,...,0,0,0,0,0,0,35,0,2,0


In [None]:
df3_final = pd.merge(df3.T, decoded_names, left_index=True, right_index=True).set_index('description', append=True).T
df3_final['Year'] = '2015-16'

In [192]:
df_1516 = pd.read_csv('/Users/cave/Desktop/discriminology/output/2015_16_clean_masterfile.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [196]:
len(df_1516)

96361

In [195]:
df_1516.COMBOKEY.nunique()

18390

In [None]:
df3_final.to_csv('/Users/cave/Desktop/discriminology/output/2015_16_clean_masterfile.csv')