In [227]:
import pandas as pd

### Extract methodology fields for each year

In [228]:
mf = pd.read_excel('/Users/cave/Desktop/discriminology/methodology_fields.xlsx')

In [229]:
def _unique_field_names(column):
    """Return the distinct field names listed in one methodology column.

    Each cell may list several names separated by newlines and/or commas.
    The cells are split, flattened into one sequence, and de-duplicated
    (first occurrence kept). Names are not stripped of whitespace.
    """
    tokens = column.str.replace('\n', ',').str.split(',')
    flattened = tokens.apply(pd.Series).stack()
    return flattened.drop_duplicates().values


fields_1112 = _unique_field_names(mf['Field_Name 2011/12'])
fields_1314 = _unique_field_names(mf['Field_Name 2013/14'])
fields_1516 = _unique_field_names(mf['Field_Name 2015/16'])


Check that there are no duplicates in the desired fields

In [230]:
# Every year's field list must be duplicate-free before it is used to select columns.
for year_fields in (fields_1112, fields_1314, fields_1516):
    assert len(year_fields) == pd.Series(year_fields).nunique()

In [231]:
# One line per survey year: how many fields were requested.
print(
    f"{len(fields_1112)} desired columns from 11/12 data",
    f"{len(fields_1314)} desired columns from 13/14 data",
    f"{len(fields_1516)} desired columns from 15/16 data",
    sep="\n",
)

221 desired columns from 11/12 data
219 desired columns from 13/14 data
231 desired columns from 15/16 data


Read field mappings and data types

In [232]:
field_mappings = pd.read_excel('/Users/cave/Desktop/discriminology/field_mapping.xlsx')

In [233]:
# Rows that name a 2011-12 field, with the matching 2013-14 field and declared type.
map_ = field_mappings.dropna(subset=['11_12_field'])[['11_12_field', '13_14_field', 'type']]
# Rows where both the 2011-12 and the 2013-14 field names are present.
map_2 = field_mappings.dropna(subset=['11_12_field', '13_14_field'])[['11_12_field', '13_14_field']]

# 2011-12 field name -> 2013-14 field name.
map_11_12 = map_2.set_index('11_12_field')['13_14_field'].to_dict()
# 2011-12 field name -> declared type label.
type_dict_11_12 = map_.set_index('11_12_field')['type'].to_dict()
# Same mapping, without the fields declared as 'str'.
final_type_dict_11_12 = {
    field: declared_type
    for field, declared_type in type_dict_11_12.items()
    if declared_type != 'str'
}

In [234]:
# Use the 2015-16 field name where there is one, otherwise the 2011-12 name.
newest_names = field_mappings['15_16_field']
oldest_names = field_mappings['11_12_field']
field_mappings['col_superset'] = newest_names.combine_first(oldest_names)

# Superset field name -> declared type label, then the subset not declared 'str'.
type_map = field_mappings.set_index('col_superset')['type'].to_dict()
numeric_type_map = {
    column: declared_type
    for column, declared_type in type_map.items()
    if declared_type != 'str'
}

In [235]:
# Lookup table: superset field name (index) -> 'description' column.
# (The bare `field_mappings[[...]]` expression that used to precede this line was
# evaluated and discarded, so it has been removed.)
descriptions = field_mappings[['col_superset', 'description']].set_index('col_superset')

In [236]:
# Columns that read_csv should load as strings rather than infer a dtype for.
# 'SCH_ZIP ' keeps its trailing space: that is the key used elsewhere in this notebook.
universal_types = dict.fromkeys(['SCH_ZIP ', 'SCHID', 'COMBOKEY', 'LEAID'], str)

### Create helper functions to aggregate dataframes and label coded columns

In [247]:
def aggregate_data(frame_array, desired_fields):
    """Combine several school-level tables into one table keyed by COMBOKEY.

    INPUTS

    frame_array (list of DataFrames): source tables. Each must contain a
        'COMBOKEY' column, and 'COMBOKEY' must be listed in desired_fields.
    desired_fields (list-like of str): column names to keep from every table.

    RETURNS

    DataFrame indexed by COMBOKEY with one column per distinct field name,
    columns sorted by name. When several tables supply the same field, the
    last non-null value per school is kept. The transpose round-trip can
    leave columns with object dtype, so callers should cast afterwards.

    Cleaning applied to every table: negative numbers become 0, the string
    '<=2' becomes '0', and the '‡' marker becomes NaN.
    """
    wanted = set(desired_fields)
    clean_frames = []

    for df in frame_array:
        # Keep the source column order (a set would make it arbitrary) and work
        # on a copy so the caller's frame is never modified.
        keep = [c for c in dict.fromkeys(df.columns) if c in wanted]
        temp = df[keep].copy()

        # Clip negatives by explicit assignment. The previous version wrote into
        # the result of the private `_get_numeric_data()` and depended on it
        # being a view of `temp`, which pandas does not guarantee.
        numeric_cols = temp.select_dtypes(include='number').columns
        if len(numeric_cols):
            temp[numeric_cols] = temp[numeric_cols].clip(lower=0)

        # Dict form on purpose: `replace('‡', None)` forward-fills from the
        # previous row on older pandas instead of blanking the value.
        temp = temp.replace({'<=2': '0', '‡': float('nan')})

        temp['COMBOKEY'] = temp['COMBOKEY'].astype(str)
        clean_frames.append(temp.set_index('COMBOKEY'))

    concat = pd.concat(clean_frames, axis=1, sort=True)

    # Collapse repeated fields by NAME only. De-duplicating by value (the old
    # `.T.drop_duplicates()`) also discarded distinct fields that happened to
    # hold identical values, e.g. two all-zero count columns.
    return concat.T.groupby(level=0).last().T

In [248]:
def join_col_descriptions(agg_data, year, desc_frame=None,
                          output_dir='~/Desktop/discriminology/output'):
    '''
    Label coded columns with their descriptions, tag the school year, and
    optionally write the result to CSV.

    INPUTS

    agg_data (DataFrame): Aggregated data with coded fields as columns
    year (STR): year range of data e.g. '2015-16'
    desc_frame (DataFrame, optional): frame indexed by field code with a
        'description' column. Defaults to the notebook-level `descriptions`.
    output_dir (STR or None): directory that receives final_data_{year}.csv.
        Pass None to skip writing the file.

    RETURNS

    DataFrame with the same rows as agg_data and two-level columns
    (field code, description), plus a ('YEAR', 'School Year') column set to
    `year`. The join is an inner join, so columns of agg_data with no entry
    in the description table are dropped.
    '''
    if desc_frame is None:
        desc_frame = descriptions  # notebook-level lookup table

    labelled = pd.merge(agg_data.T, desc_frame, left_index=True, right_index=True, how='inner')
    final = labelled.set_index('description', append=True).T
    final['YEAR', 'School Year'] = year

    if output_dir is not None:
        final.to_csv(f'{output_dir}/final_data_{year}.csv')
    return final
    

### Load all data + col description files for 2011-12

In [249]:
## Reads originals from excel

# df1_1112, df1_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/05 - Overall Enrollment.xlsx', sheet_name=None).values()
# df2_1112, df2_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/08 - Students enrolled in Gifted-Talented Programs.xlsx', sheet_name=None).values()
# df3_1112, df3_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/10-1 - Students with Disabilities Served under IDEA Enrollment.xlsx', sheet_name=None).values()
# df4_1112, df4_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/10-2 - Students with Disabilities Served under 504 Enrollment.xlsx', sheet_name=None).values()
# df5_1112, df5_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/W:O Disabilities/35-3 - Students WO Disab Receiving only one out-of-school suspension.xlsx', sheet_name=None).values()
# df6_1112, df6_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/W:O Disabilities/35-4 - Students WO Disab Rec more than one out-of-school suspension.xlsx', sheet_name=None).values()
# df7_1112, df7_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/With Disabilities/36-3 - Students With Disabilities Receiving only one out-of-school suspension.xlsx', sheet_name=None).values()
# df8_1112, df8_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/With Disabilities/36-4 - Students With Disab Receiving more than one out-of-school suspension.xlsx', sheet_name=None).values()
# df9_1112, df9_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Academic/Advanced Placement/17 - Students who are taking at least one AP course.xlsx', sheet_name=None).values()
# df10_1112, df10_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Referral to law enforcement/W:O Disabilities/35-8 - Students Without Disabilities Referral to law enforcement.xlsx', sheet_name=None).values()
# df11_1112, df11_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Referral to law enforcement/With Disabilities/36-8 - Students With Disabilities Referral to law enforcement.xlsx', sheet_name=None).values()
# df12_1112, df12_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/School Related Arrest/W:O Disabilities/35-9 - Students Without Disabilities School-related arrest.xlsx', sheet_name=None).values()
# df13_1112, df13_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/School Related Arrest/With Disabilities/36-9 - Students With Disabilities School-related arrest.xlsx', sheet_name=None).values()
# df14_1112, df14_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/School Characteristics/02 - School Characteristics.xlsx', sheet_name=None).values()
# df15_1112, df15_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Staff/08-1 School Support and Security Staff (required elements).xlsx', sheet_name=None).values()
# df16_1112, df16_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/06 - Enrolled in Early Childhood and Prekindergarten.xlsx', sheet_name=None).values()

In [250]:
# ctr = 0
# for frame in desc_frames_1112:
#     frame.to_csv(f'/Users/cave/Desktop/discriminology/2011_12/descriptions/file_{ctr}.csv')
#     ctr += 1

# ctr = 0
# for frame in frames_1314:
#     frame.to_csv(f'~/Desktop/discriminology/2013_14/data/file_{ctr}.csv')
#     ctr += 1


In [251]:
# 2011-12 data extracts: df1..df15 are read from file_1..file_15, df16 from file_0.
_data_dir_1112 = '~/Desktop/discriminology/2011_12/data'
(df1_1112, df2_1112, df3_1112, df4_1112, df5_1112, df6_1112, df7_1112, df8_1112,
 df9_1112, df10_1112, df11_1112, df12_1112, df13_1112, df14_1112, df15_1112,
 df16_1112) = [
    pd.read_csv(f'{_data_dir_1112}/file_{file_no}.csv', dtype=universal_types)
    for file_no in [*range(1, 16), 0]
]

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [252]:
# 2011-12 description extracts: df1..df15 are read from file_1..file_15, df16 from file_0.
_desc_dir_1112 = '~/Desktop/discriminology/2011_12/descriptions'
(df1_desc_1112, df2_desc_1112, df3_desc_1112, df4_desc_1112, df5_desc_1112,
 df6_desc_1112, df7_desc_1112, df8_desc_1112, df9_desc_1112, df10_desc_1112,
 df11_desc_1112, df12_desc_1112, df13_desc_1112, df14_desc_1112, df15_desc_1112,
 df16_desc_1112) = [
    pd.read_csv(f'{_desc_dir_1112}/file_{file_no}.csv', index_col=0)
    for file_no in [*range(1, 16), 0]
]

In [253]:

# All 2011-12 data tables, in df1..df16 order; passed to aggregate_data below.
frames_1112 = [df1_1112, df2_1112, df3_1112, df4_1112, df5_1112, df6_1112,
               df7_1112, df8_1112, df9_1112, df10_1112, df11_1112, df12_1112,
               df13_1112, df14_1112, df15_1112, df16_1112]



# Matching 2011-12 description tables, in the same order.
desc_frames_1112 = [df1_desc_1112, df2_desc_1112, df3_desc_1112, df4_desc_1112,
                    df5_desc_1112, df6_desc_1112, df7_desc_1112, df8_desc_1112,
                    df9_desc_1112, df10_desc_1112, df11_desc_1112, df12_desc_1112,
                    df13_desc_1112, df14_desc_1112, df15_desc_1112, df16_desc_1112]


Use helper function to aggregate frames and clean up negative values, standardize length of zipcodes.

In [254]:
# # Careful, this cell runs for a while
df_1112_raw = aggregate_data(frames_1112, fields_1112)

In [260]:
df_1112 = df_1112_raw.copy()

In [261]:
df_1112['SCH_ZIP '] = df_1112['SCH_ZIP '].str.zfill(5) # pad zipcodes with leading zeroes

Change 0/1 indicator vars to Yes / No for selected columns

In [262]:
# Columns whose 0/1 values are relabelled Yes/No in the next cell.
int_cols_to_str = (
    ['PreK', 'K']
    + [f'G{grade}' for grade in range(1, 13)]
    + ['MG_SCH', 'CHARTER_SCH', 'ALT_SCH']
)

In [263]:
# Relabel both the string and the integer encodings of the flags, in the original order.
indicator_values = df_1112[int_cols_to_str]
for raw_value, label in (('0', 'No'), ('1', 'Yes'), (0, 'No'), (1, 'Yes')):
    indicator_values = indicator_values.replace(raw_value, label)
df_1112[int_cols_to_str] = indicator_values

In [266]:
# Occurrences of '-9' and then '-5' in the security indicator are rewritten as 'No'.
security_flag_1112 = df_1112['SCH_FTESECURITY_IND']
for code in ('-9', '-5'):
    security_flag_1112 = security_flag_1112.str.replace(code, 'No')
df_1112['SCH_FTESECURITY_IND'] = security_flag_1112

Make types consistent within each column

In [268]:
# A default makes this cell safe to re-run: without it a second run raises KeyError
# because the key is already gone.
# NOTE(review): final_type_dict_11_12 was derived from this dict in an earlier cell,
# so removing the key here does not alter the dict passed to astype() below.
type_dict_11_12.pop('COMBOKEY', None)

'str'

In [269]:
df_1112 = df_1112.astype(final_type_dict_11_12)

Custom case handling: Juvenile Justice facilities

In [270]:
# Occurrences of 'Z' and then 'X' in the JJ column are rewritten as 'Yes'.
jj_flag_1112 = df_1112['JJ']
for code in ('Z', 'X'):
    jj_flag_1112 = jj_flag_1112.str.replace(code, 'Yes')
df_1112['JJ'] = jj_flag_1112

Rename columns to match 13/14 and 15/16 data

In [272]:
df_1112.rename(columns=map_11_12, inplace=True)

In [273]:
# Walk the superset in its own order and keep the names present in the 2011-12 frame.
superset = field_mappings['col_superset'].values
ordered_cols_1112 = []
for column in superset:
    if column in df_1112.columns:
        ordered_cols_1112.append(column)

In [587]:
df_1112.index.nunique()

101133

In [274]:
df_1112 = df_1112[ordered_cols_1112]
df_1112_final = join_col_descriptions(df_1112, '2011-12')

In [275]:
df_1112_final[::100].to_csv('~/Desktop/discriminology/output/11_12_sample.csv')

In [276]:
df_1112_final.shape

(101133, 220)

### Load all data + col description files for 2013-14

Collect column name descriptions from each spreadsheet

In [358]:
# # Read originals from excel

# df1_1314, df1_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/01 School Characteristics.xlsx', sheet_name=None).values()
# df2_1314, df2_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/03 Enrollment.xlsx', sheet_name=None).values()
# df3_1314, df3_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/04-1 Gifted and Talented Enrollment.xlsx', sheet_name=None).values()
# df4_1314, df4_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/06 Advanced Placement and International Baccalaureate Diploma Programme Enrollment.xlsx', sheet_name=None).values()
# df5_1314, df5_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/07-2 Advanced Placement Exams.xlsx', sheet_name=None).values()
# df6_1314, df6_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/08-1 School Support and Security Staff (required elements).xlsx', sheet_name=None).values()
# df7_1314, df7_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/09-1 Chronic Absenteeism.xlsx', sheet_name=None).values()
# df8_1314, df8_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/11-2 Suspensions (required elements).xlsx', sheet_name=None).values()
# df9_1314, df9_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/11-3 Expulsions.xlsx', sheet_name=None).values()
# df10_1314, df10_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/12 Student Referrals and Arrests.xlsx', sheet_name=None).values()
# df11_1314, df11_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/16 School Expenditures.xlsx', sheet_name=None).values()
# df12_1314, df12_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/17 Justice Facilities.xlsx', sheet_name=None).values()

In [359]:
# ctr = 0
# for frame in desc_frames_1314:
#     frame.to_csv(f'~/Desktop/discriminology/2013_14/descriptions/file_{ctr}.csv')
#     ctr += 1


# ctr = 0
# for frame in frames_1314:
#     frame.to_csv(f'~/Desktop/discriminology/2013_14/data/file_{ctr}.csv')
#     ctr += 1


In [211]:
# 2013-14 data extracts: df1..df11 are read from file_1..file_11, df12 from file_0.
_data_dir_1314 = '~/Desktop/discriminology/2013_14/data'
(df1_1314, df2_1314, df3_1314, df4_1314, df5_1314, df6_1314,
 df7_1314, df8_1314, df9_1314, df10_1314, df11_1314, df12_1314) = [
    pd.read_csv(f'{_data_dir_1314}/file_{file_no}.csv', dtype=universal_types)
    for file_no in [*range(1, 12), 0]
]

In [213]:
# 2013-14 description extracts: df1..df11 are read from file_1..file_11, df12 from file_0.
_desc_dir_1314 = '~/Desktop/discriminology/2013_14/descriptions'
(df1_desc_1314, df2_desc_1314, df3_desc_1314, df4_desc_1314, df5_desc_1314,
 df6_desc_1314, df7_desc_1314, df8_desc_1314, df9_desc_1314, df10_desc_1314,
 df11_desc_1314, df12_desc_1314) = [
    pd.read_csv(f'{_desc_dir_1314}/file_{file_no}.csv', dtype=universal_types, index_col=0)
    for file_no in [*range(1, 12), 0]
]


In [214]:
# All 2013-14 data tables, in df1..df12 order; passed to aggregate_data below.
frames_1314 = [df1_1314, df2_1314, df3_1314, df4_1314, df5_1314,
               df6_1314, df7_1314, df8_1314, df9_1314, df10_1314,
               df11_1314, df12_1314]


# Matching 2013-14 description tables, in the same order.
desc_frames_1314 = [df1_desc_1314, df2_desc_1314, df3_desc_1314,
                    df4_desc_1314, df5_desc_1314, df6_desc_1314,
                    df7_desc_1314, df8_desc_1314, df9_desc_1314,
                    df10_desc_1314, df11_desc_1314, df12_desc_1314]


In [218]:
df_1314_raw = aggregate_data(frames_1314, fields_1314)

In [277]:
df_1314 = df_1314_raw.copy()

In [278]:
# In the security indicator, '-9' / '-5' mean 'No'; this must run before the blanket
# replacement below so the indicator keeps a Yes/No value.
df_1314['SCH_FTESECURITY_IND'] = df_1314['SCH_FTESECURITY_IND'].str.replace('-9','No').str.replace('-5','No')

# Everywhere else the '-5' / '-9' strings become missing values. The dict form is used
# because `replace('-5', None)` is version-dependent: on older pandas it forward-fills
# from the previous row instead of blanking the cell.
df_1314 = df_1314.replace({'-5': float('nan'), '-9': float('nan')})

In [589]:
df_1314.index.nunique()

95507

In [280]:
df_1314_final = join_col_descriptions(df_1314, '2013-14')

In [282]:
df_1314_final[::100].to_csv('~/Desktop/discriminology/output/13_14_sample.csv')

In [283]:
df_1314_final.shape

(95507, 218)

### Get column descriptions for the 2015-16 data

In [284]:
# Record layout for 2015-16, indexed by field name.
col_descr_1516 = pd.read_excel(
    '/Users/cave/Desktop/discriminology/OCR School data sample/2015:16/CRDC 2015-16 School Data Record Layout copy.xlsx',
    index_col='Field_Name',
)
# Descriptions of the requested 2015-16 fields, as a one-column frame named 'description'.
decoded_names = col_descr_1516.loc[fields_1516]['Field_Description'].to_frame(name='description')

In [285]:
decoded_names.reset_index().to_csv('/Users/cave/Desktop/2015_16_field_descriptions.csv')

### Isolate numeric columns in 15/16 and replace negative numbers with zeroes.

In [523]:
df1516_raw = pd.read_csv(
    '/Users/cave/Desktop/discriminology/OCR School data sample/2015:16/CRDC 2015-16 School Data copy.csv',
    encoding='iso-8859-1',
)
# Zero-pad the identifiers to fixed width (7 for LEAID, 5 for SCHID) and build
# COMBOKEY as their concatenation.
leaid_padded = df1516_raw['LEAID'].astype(str).str.zfill(7)
schid_padded = df1516_raw['SCHID'].astype(str).str.zfill(5)
df1516_raw['LEAID'] = leaid_padded
df1516_raw['COMBOKEY'] = leaid_padded + schid_padded
# Keep only the requested 2015-16 fields.
df1516_raw = df1516_raw[fields_1516]

  interactivity=interactivity, compiler=compiler, result=result)


### Replace negative values with zeroes.

In [524]:
# Negative numbers become 0. The clipped values are assigned back explicitly: the
# previous version mutated the result of the private `_get_numeric_data()` and relied
# on it being a view of df1516_raw, which pandas does not guarantee.
num = df1516_raw.select_dtypes(include='number').clip(lower=0)
if len(num.columns):
    df1516_raw[num.columns] = num

In [525]:
# The strings '-5' and '-9' become 'No' in every column of the frame.
# NOTE(review): the 2013-14 cell maps these codes to 'No' only in SCH_FTESECURITY_IND and
# blanks them elsewhere — confirm the frame-wide 'No' is intended for 2015-16.
for code in ('-5', '-9'):
    df1516_raw = df1516_raw.replace(code, 'No')

In [526]:
df1516_raw.set_index('COMBOKEY', inplace=True)

In [590]:
# Count distinct schools from df1516_raw: df_1516_final is only created in the next cell,
# so referencing it here fails on a fresh top-to-bottom run. Both frames share the same
# COMBOKEY index, so the count is the same.
df1516_raw.index.nunique()

96360

In [527]:
# Attach the description of each field code as a second column level, then tag the year.
coded_1516 = df1516_raw.T.merge(descriptions, left_index=True, right_index=True)
df_1516_final = coded_1516.set_index('description', append=True).T
df_1516_final[('YEAR', 'School Year')] = '2015-16'

In [528]:
df_1516_final[::100].to_csv('~/Desktop/discriminology/output/15_16_sample.csv')
df_1516_final.to_csv('~/Desktop/discriminology/output/final_data_2015-16.csv')

### Concatenate all three years of data together

In [609]:
df_1112_raw['LEAID'].nunique()

17342

In [610]:
df_1314_raw['LEAID'].nunique()

16758

In [611]:
df1516_raw['LEAID'].nunique()

17337

In [571]:
full_table = pd.concat([df_1516_final, df_1314_final, df_1112_final], axis=0)

In [572]:
# NOTE(review): full_table's columns are (field code, description) pairs here, while
# numeric_type_map is keyed by field code alone — confirm this cast actually changes
# any dtypes. The district cell further down repeats the cast after dropping the
# description level.
full_table = full_table.astype(numeric_type_map)

In [573]:
# Title-case the free-text name and address columns.
for column_key in (
    ('LEA_NAME', 'District Name'),
    ('SCH_NAME', 'School Name'),
    ('SCH_ADDRESS', 'School address'),
):
    full_table[column_key] = full_table[column_key].str.title()

# School IDs are stored as 5-character, zero-padded strings.
school_id_key = ('SCHID', '5 Digit School Identification Code')
full_table[school_id_key] = full_table[school_id_key].astype(str).str.zfill(5)

In [574]:
full_table.to_csv('/Users/cave/Desktop/discriminology/output/full_table_all_years.csv')

In [575]:
full_table[::100].to_csv('/Users/cave/Desktop/discriminology/output/full_table_sample.csv')

### Group by districts and sum over fields

In [563]:
# full_table = pd.read_csv('/Users/cave/Desktop/discriminology/output/full_table_all_years.csv', header=[0,1], index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [632]:
district = full_table.copy()

In [636]:
district.columns = district.columns.droplevel(1)
district = district.astype(numeric_type_map)

In [639]:
agg_cols = ['LEA_NAME', 'LEA_STATE_NAME',
'SCH_DISCWODIS_MULTOOS_HI_M',
 'SCH_DISCWDIS_ARR_IDEA_AM_M',
 'SCH_GTENR_IDEA_F',
 'SCH_DISCWODIS_SINGOOS_HP_F',
 'SCH_FTECOUNSELORS',
 'SCH_DISCWODIS_ARR_WH_M',
 'TOT_DISCWDIS_ARR_IDEA_M',
 'SCH_GTENR_TR_M',
 'SCH_DISCWODIS_MULTOOS_AS_M',
 'SCH_APENR_AM_F',
 'SCH_DISCWODIS_REF_BL_F',
 'SCH_DISCWODIS_ARR_AM_F',
 'TOT_DISCWODIS_REF_M',
 'SCH_IDEAENR_HI_M',
 'SCH_FTESERVICES_PSY',
 'SCH_DISCWDIS_MULTOOS_IDEA_BL_F',
 'TOT_APENR_F',
 'SCH_DISCWODIS_ARR_WH_F',
 'SCH_IDEAENR_HP_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_TR_F',
 'SCH_DISCWDIS_REF_IDEA_WH_F',
 'SCH_DISCWODIS_MULTOOS_WH_M',
 'SCH_FTESECURITY_GUA',
 'SCH_DISCWODIS_SINGOOS_TR_M',
 'SCH_DISCWDIS_REF_IDEA_WH_M',
 'SCH_DISCWODIS_ARR_TR_M',
 'SCH_DISCWDIS_ARR_IDEA_BL_F',
 'SCH_DISCWDIS_ARR_IDEA_TR_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_BL_M',
 'SCH_DISCWODIS_ARR_AM_M',
 'SCH_DISCWDIS_ARR_IDEA_AS_M',
 'SCH_DISCWDIS_REF_IDEA_BL_M',
 'SCH_GTENR_IDEA_M',
 'TOT_DISCWODIS_MULTOOS_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_HI_F',
 'TOT_DISCWODIS_SINGOOS_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_AM_F',
 'SCH_DISCWODIS_MULTOOS_TR_F',
 'TOT_DISCWODIS_SINGOOS_M',
 'SCH_ENR_HI_M',
 'SCH_APENR_HI_F',
 'SCH_IDEAENR_HI_F',
 'SCH_DISCWDIS_REF_IDEA_BL_F',
 'SCH_DISCWODIS_MULTOOS_AM_F',
 'TOT_DISCWDIS_MULTOOS_IDEA_M',
 'SCH_DISCWDIS_REF_IDEA_AS_M',
 'SCH_GTENR_HI_M',
 'SCH_DISCWODIS_REF_HI_M',
 'SCH_FTESERVICES_SOC',
 'SCH_DISCWDIS_SINGOOS_IDEA_AM_F',
 'SCH_DISCWODIS_REF_TR_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_HP_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_BL_M',
 'TOT_ENR_F',
 'SCH_GTENR_AS_F',
 'SCH_GTENR_HP_F',
 'SCH_ENR_BL_F',
 'SCH_DISCWODIS_SINGOOS_BL_M',
 'SCH_DISCWODIS_SINGOOS_TR_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_WH_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_AS_F',
 'SCH_DISCWODIS_SINGOOS_HP_M',
 'SCH_GTENR_WH_M',
 'SCH_DISCWODIS_REF_AM_F',
 'SCH_DISCWODIS_SINGOOS_AS_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_HI_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_HI_M',
 'SCH_DISCWODIS_SINGOOS_HI_M',
 'SCH_APENR_TR_M',
 'SCH_DISCWODIS_REF_HI_F',
 'SCH_DISCWODIS_ARR_BL_F',
 'SCH_ENR_TR_F',
 'SCH_DISCWODIS_SINGOOS_BL_F',
 'SCH_DISCWODIS_ARR_HP_F',
 'SCH_APENR_AS_M',
 'SCH_GTENR_LEP_M',
 'SCH_DISCWDIS_REF_IDEA_HI_F',
 'SCH_DISCWODIS_MULTOOS_BL_F',
 'SCH_DISCWODIS_REF_HP_M',
 'SCH_APENR_BL_F',
 'SCH_IDEAENR_AS_M',
 'TOT_DISCWODIS_ARR_M',
 'SCH_DISCWODIS_SINGOOS_HI_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_WH_F',
 'SCH_IDEAENR_AM_F',
 'TOT_APENR_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_TR_M',
 'SCH_DISCWODIS_REF_TR_F',
 'SCH_DISCWDIS_REF_IDEA_AM_M',
 'SCH_GTENR_BL_F',
 'SCH_IDEAENR_TR_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_AM_M',
 'SCH_GTENR_HI_F',
 'SCH_DISCWODIS_ARR_AS_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_TR_F',
 'TOT_DISCWODIS_MULTOOS_F',
 'SCH_DISCWODIS_REF_AS_M',
 'SCH_DISCWDIS_ARR_IDEA_WH_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_BL_F',
 'TOT_DISCWDIS_SINGOOS_IDEA_M',
 'SCH_APENR_HI_M',
 'SCH_DISCWODIS_SINGOOS_WH_F',
 'SCH_DISCWDIS_ARR_IDEA_HP_M',
 'SCH_IDEAENR_AM_M',
 'SCH_APENR_AM_M',
 'SCH_GTENR_TR_F',
 'TOT_GTENR_F',
 'SCH_DISCWODIS_SINGOOS_AM_F',
 'SCH_APENR_WH_M',
 'SCH_DISCWODIS_REF_AM_M',
 'TOT_DISCWODIS_REF_F',
 'SCH_IDEAENR_BL_F',
 'SCH_DISCWODIS_SINGOOS_AS_M',
 'TOT_DISCWDIS_SINGOOS_IDEA_F',
 'SCH_DISCWODIS_ARR_HI_M',
 'SCH_DISCWDIS_ARR_IDEA_HP_F',
 'SCH_DISCWDIS_REF_IDEA_HI_M',
 'SCH_ENR_TR_M',
 'SCH_GTENR_BL_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_AM_M',
 'SCH_DISCWODIS_ARR_HI_F',
 'SCH_IDEAENR_AS_F',
 'SCH_DISCWDIS_ARR_IDEA_HI_F',
 'SCH_APENR_HP_F',
 'TOT_IDEAENR_M',
 'SCH_GTENR_AM_M',
 'SCH_ENR_BL_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_AS_M',
 'SCH_ENR_AM_M',
 'SCH_DISCWODIS_MULTOOS_WH_F',
 'SCH_ENR_HP_M',
 'SCH_IDEAENR_BL_M',
 'SCH_IDEAENR_TR_M',
 'SCH_APENR_WH_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_AS_F',
 'SCH_ENR_AM_F',
 'SCH_DISCWDIS_REF_IDEA_TR_M',
 'SCH_DISCWODIS_REF_AS_F',
 'SCH_DISCWDIS_ARR_IDEA_AM_F',
 'SCH_ENR_WH_M',
 'SCH_DISCWODIS_REF_BL_M',
 'SCH_DISCWODIS_MULTOOS_AM_M',
 'SCH_DISCWDIS_ARR_IDEA_HI_M',
 'SCH_DISCWODIS_REF_HP_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_AS_M',
 'SCH_DISCWODIS_ARR_AS_M',
 'SCH_IDEAENR_WH_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_HP_F',
 'TOT_DISCWDIS_REF_IDEA_M',
 'SCH_FTESECURITY_LEO',
 'SCH_DISCWODIS_MULTOOS_TR_M',
 'SCH_IDEAENR_HP_F',
 'SCH_DISCWODIS_MULTOOS_HI_F',
 'SCH_DISCWDIS_ARR_IDEA_WH_F',
 'SCH_GTENR_HP_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_TR_M',
 'SCH_DISCWDIS_ARR_IDEA_AS_F',
 'SCH_DISCWODIS_REF_WH_M',
 'SCH_DISCWODIS_ARR_BL_M',
 'TOT_IDEAENR_F',
 'TOT_DISCWDIS_ARR_IDEA_F',
 'SCH_DISCWODIS_MULTOOS_AS_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_WH_M',
 'SCH_APENR_TR_F',
 'SCH_ENR_WH_F',
 'SCH_DISCWODIS_REF_WH_F',
 'SCH_APENR_HP_M',
 'SCH_DISCWODIS_MULTOOS_BL_M',
 'SCH_GTENR_AM_F',
 'TOT_DISCWDIS_MULTOOS_IDEA_F',
 'SCH_DISCWDIS_REF_IDEA_HP_M',
 'SCH_ENR_HI_F',
 'SCH_DISCWDIS_REF_IDEA_HP_F',
 'TOT_ENR_M',
 'SCH_ENR_AS_F',
 'SCH_DISCWDIS_REF_IDEA_AM_F',
 'SCH_DISCWODIS_ARR_HP_M',
 'SCH_APENR_AS_F',
 'TOT_DISCWODIS_ARR_F',
 'SCH_ENR_AS_M',
 'SCH_GTENR_WH_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_HP_F',
 'SCH_GTENR_LEP_F',
 'SCH_IDEAENR_WH_F',
 'SCH_DISCWODIS_SINGOOS_AM_M',
 'SCH_DISCWODIS_SINGOOS_WH_M',
 'SCH_DISCWDIS_ARR_IDEA_TR_F',
 'SCH_ENR_HP_F',
 'SCH_DISCWODIS_MULTOOS_HP_M',
 'SCH_DISCWDIS_REF_IDEA_TR_F',
 'SCH_GTENR_AS_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_WH_M',
 'TOT_GTENR_M',
 'SCH_DISCWODIS_ARR_TR_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_HP_M',
 'SCH_DISCWODIS_MULTOOS_HP_F',
 'SCH_DISCWDIS_ARR_IDEA_BL_M',
 'SCH_DISCWDIS_REF_IDEA_AS_F',
 'SCH_APENR_BL_M',
 'TOT_DISCWDIS_REF_IDEA_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_HI_M']

Custom aggregation dictionary - for string values, take the first non-null value.  For numeric fields, take the sum

In [640]:
# The two text columns keep their first value per group; every other column is summed.
text_cols = {'LEA_NAME', 'LEA_STATE_NAME'}
agg_dict = {}
for col in agg_cols:
    agg_dict[col] = 'first' if col in text_cols else sum

In [641]:
len(agg_dict) == len(agg_cols)

True

In [642]:
# One row per (district, year): aggregate, restore the agg_cols column order, turn the
# group keys back into columns, and sort by district then year.
grouped_by_distyr = (
    district.groupby(['LEAID', 'YEAR'])[agg_cols]
    .agg(agg_dict)[agg_cols]
    .reset_index()
    .sort_values(by=['LEAID', 'YEAR'])
)

Fill in missing district states for 11/12 and 13/14 using the 15/16 value

In [643]:
# Back-fill within each district only. Rows are sorted by LEAID then YEAR, so a missing
# state is taken from a later year of the SAME district. The previous frame-wide bfill
# could copy the state of the next district when a district had no later year with a
# value. Districts with no state in any year stay missing.
grouped_by_distyr['LEA_STATE_NAME'] = grouped_by_distyr.groupby('LEAID')['LEA_STATE_NAME'].bfill()

Read geodata from file and join to district aggregation.

In [651]:
geo_data = pd.read_excel('/Users/cave/Desktop/discriminology/LEA Profile Info.xlsx', dtype={'LEAID': str})

In [652]:
len(geo_data)

16758

In [656]:
geo_data['LEAID'] = geo_data['LEAID'].str.zfill(7)

In [658]:
geo_data['LEAID'].nunique()

16758

In [654]:
len(set(grouped_by_distyr['LEAID'].values))

18135

In [659]:
len(set(geo_data['LEAID'].values).intersection(set(grouped_by_distyr['LEAID'].values)))

16758

In [393]:
district_geo = geo_data[['LEAID', 'LEA_ADDRESS', 'LEA_CITY', 'LEA_ZIP',
       'CJJ', 'LEA_ENR', 'LEA_SCHOOLS', 'Latitude', 'Longitude']]

In [394]:
district_geo.head()

Unnamed: 0,LEAID,LEA_ADDRESS,LEA_CITY,LEA_ZIP,CJJ,LEA_ENR,LEA_SCHOOLS,Latitude,Longitude
0,100002,1000 INDUSTRIAL SCHOOL ROAD,MT. MEIGS,36057,Yes,3674,3,32.371901,-86.083791
1,100005,107 WEST MAIN STREET,ALBERTVILLE,35950,No,4712,6,34.268415,-86.209216
2,100006,12380 US HIGHWAY 431 S,GUNTERSVILLE,35976,No,5624,14,34.305366,-86.287334
3,100007,2810 METROPOLITAN WAY,HOOVER,35243,No,14054,16,33.445932,-86.750203
4,100008,211 CELTIC DRIVE,MADISON,35758,Yes,9206,11,34.686749,-86.745762


In [396]:
# Left join: every district-year row is kept; geo columns are empty where LEAID has no match.
district_group_w_geo = grouped_by_distyr.merge(district_geo, on='LEAID', how='left')

In [397]:
district_group_w_geo.head()

Unnamed: 0,LEAID,YEAR,LEA_NAME,LEA_STATE_NAME,SCH_DISCWODIS_MULTOOS_HI_M,SCH_DISCWDIS_ARR_IDEA_AM_M,SCH_GTENR_IDEA_F,SCH_DISCWODIS_SINGOOS_HP_F,SCH_FTECOUNSELORS,SCH_DISCWODIS_ARR_WH_M,...,TOT_DISCWDIS_REF_IDEA_F,SCH_DISCWDIS_MULTOOS_IDEA_HI_M,LEA_ADDRESS,LEA_CITY,LEA_ZIP,CJJ,LEA_ENR,LEA_SCHOOLS,Latitude,Longitude
0,100002,2011-12,Alabama Youth Services,ALABAMA,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,1000 INDUSTRIAL SCHOOL ROAD,MT. MEIGS,36057.0,Yes,3674.0,3.0,32.371901,-86.083791
1,100002,2013-14,Alabama Youth Services,ALABAMA,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,1000 INDUSTRIAL SCHOOL ROAD,MT. MEIGS,36057.0,Yes,3674.0,3.0,32.371901,-86.083791
2,100002,2015-16,Alabama Youth Services,ALABAMA,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,1000 INDUSTRIAL SCHOOL ROAD,MT. MEIGS,36057.0,Yes,3674.0,3.0,32.371901,-86.083791
3,100005,2011-12,Albertville City,ALABAMA,8.0,0.0,0.0,0.0,10.0,2.0,...,2.0,2.0,107 WEST MAIN STREET,ALBERTVILLE,35950.0,No,4712.0,6.0,34.268415,-86.209216
4,100005,2013-14,Albertville City,ALABAMA,9.0,0.0,0.0,0.0,10.0,0.0,...,0.0,4.0,107 WEST MAIN STREET,ALBERTVILLE,35950.0,No,4712.0,6.0,34.268415,-86.209216


In [380]:
# Attach descriptions as a second column level; the left join keeps columns that have
# no description.
district_described = district_group_w_geo.T.merge(
    descriptions, left_index=True, right_index=True, how='left'
)
district_merge = district_described.set_index('description', append=True).T

In [386]:
# Keep rows that have a latitude, then export the last 600 of them as a sample.
has_latitude = district_merge[('Latitude', None)].notna()
sample = district_merge[has_latitude]
sample[-600:].to_csv('~/Desktop/discriminology/output/district_level_sample.csv')

In [401]:
district_merge[district_merge['Latitude', None].isna()]

Unnamed: 0_level_0,LEAID,YEAR,LEA_NAME,LEA_STATE_NAME,SCH_DISCWODIS_MULTOOS_HI_M,SCH_DISCWDIS_ARR_IDEA_AM_M,SCH_GTENR_IDEA_F,SCH_DISCWODIS_SINGOOS_HP_F,SCH_FTECOUNSELORS,SCH_DISCWODIS_ARR_WH_M,...,TOT_DISCWDIS_REF_IDEA_F,SCH_DISCWDIS_MULTOOS_IDEA_HI_M,LEA_ADDRESS,LEA_CITY,LEA_ZIP,CJJ,LEA_ENR,LEA_SCHOOLS,Latitude,Longitude
description,7 Digit LEAID District Identification Code,NaN,District Name,District State Name,Students without disabilities who received more than one out-of-school suspension: Hispanic Male,Students with disabilities who received a school-related arrest: American Indian/Alaska Native Male,Gifted and Talented Student Enrollment: IDEA Female,Students without disabilities who received only one out-of-school suspension: Native Hawaiian/Pacific Islander Female,School Counselors: Number of FTE school counselors,Students without disabilities who received a school-related arrest: White Male,...,Students with disabilities who were referred to a law enforcement agency or official: Calculated Female Total,Students with disabilities who received more than one out-of-school suspension: Hispanic Male,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
69,100040,2011-12,Brantwood Children'S Home,ALABAMA,0,0,0,0,0,0,...,0,0,,,,,,,,
97,100179,2011-12,The Bridge Ii,ALABAMA,0,0,0,0,0,0,...,0,0,,,,,,,,
115,100189,2015-16,Satsuma City,ALABAMA,0,0,2,0,2.5,0,...,0,0,,,,,,,,
119,100194,2015-16,Pelham City,ALABAMA,6,0,0,0,7,0,...,0,0,,,,,,,,
120,100195,2015-16,Pike Road,ALABAMA,2,0,0,0,1,0,...,0,0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51453,804680,2015-16,Holly School District No. Re-3,COLORADO,0,0,0,0,0,0,...,0,0,,,,,,,,
51454,804710,2015-16,Holyoke School District No. Re-1J,COLORADO,2,0,0,0,2,0,...,0,0,,,,,,,,
51455,804740,2015-16,Genoa-Hugo School District No. C-113,COLORADO,0,0,0,0,0,0,...,0,0,,,,,,,,
51456,804770,2015-16,Ignacio School District No. 11Jt,COLORADO,4,0,2,0,4,0,...,0,4,,,,,,,,


In [398]:
len(sample)

28726

In [342]:
district_merge.to_csv('~/Desktop/discriminology/output/district_level_aggregates.csv')

### Code Sandbox - everything below only needs to be run once

In [None]:
# descriptions = pd.concat(desc_frames_1112, axis=0)
# descriptions.drop_duplicates(inplace=True)
# descriptions.set_index('Field Name', inplace=True)
# descriptions = descriptions.loc[fields_1112]
# descriptions.columns = ['2011_12_description']
# descriptions.reset_index().to_csv('/Users/cave/Desktop/2011_12_field_descriptions.csv')

# descriptions = pd.concat(desc_frames_1314, axis=0)
# descriptions.drop_duplicates(inplace=True)
# descriptions.set_index('Field Name', inplace=True)
# descriptions = descriptions.loc[fields_1314]
# descriptions.columns = ['2013_14_description']
# descriptions.reset_index().to_csv('/Users/cave/Desktop/2013_14_field_descriptions.csv')