In [32]:
import pandas as pd

### Extract methodology fields for each year

In [33]:
mf = pd.read_excel('/Users/cave/Desktop/discriminology/methodology_fields.xlsx')

In [34]:
def extract_field_names(column):
    """Flatten one methodology column into an array of unique field names.

    Each cell of `column` may list several field names separated by commas
    and/or newlines. Newlines are turned into commas, every cell is split on
    commas, the pieces are stacked into one long Series and repeated names
    are removed (first occurrence wins).

    Names are deliberately NOT stripped of whitespace: the notebook later
    indexes df_1112['SCH_ZIP '] with a trailing space, so the raw spelling
    has to survive.

    Returns a numpy array of field-name strings.
    """
    return (
        column.str.replace('\n', ',')
        .str.split(',')
        .apply(pd.Series)
        .stack()
        .drop_duplicates()
        .values
    )


fields_1112 = extract_field_names(mf['Field_Name 2011/12'])
fields_1314 = extract_field_names(mf['Field_Name 2013/14'])
fields_1516 = extract_field_names(mf['Field_Name 2015/16'])


Check that there are no duplicates in the desired fields

In [35]:
assert len(fields_1112) == pd.Series(fields_1112).nunique()
assert len(fields_1314) == pd.Series(fields_1314).nunique()
assert len(fields_1516) == pd.Series(fields_1516).nunique()

In [36]:
print(f"{len(fields_1112)} desired columns from 11/12 data")
print(f"{len(fields_1314)} desired columns from 13/14 data")
print(f"{len(fields_1516)} desired columns from 15/16 data")

221 desired columns from 11/12 data
219 desired columns from 13/14 data
231 desired columns from 15/16 data


Read field mappings and data types

In [37]:
field_mappings = pd.read_excel('/Users/cave/Desktop/discriminology/field_mapping.xlsx')

In [38]:
# Rows that have a 2011/12 field name (the 13/14 name and type may be missing).
map_ = field_mappings[['11_12_field', '13_14_field', 'type']].dropna(subset=['11_12_field'])
# Rows where both the 2011/12 and the 2013/14 field names are present.
map_2 = field_mappings[['11_12_field', '13_14_field']].dropna(subset=['11_12_field', '13_14_field'])
# {11/12 field name -> 13/14 field name}; used further down to rename df_1112 columns.
map_11_12 = pd.Series(map_2['13_14_field'].values, index=map_2['11_12_field']).to_dict()
# {11/12 field name -> value of the 'type' column}.
type_dict_11_12 = pd.Series(map_['type'].values, index=map_['11_12_field']).to_dict()
# Same mapping without the entries whose type is the string 'str';
# this is the dict later handed to df_1112.astype(...).
final_type_dict_11_12 = {k:v for k,v in type_dict_11_12.items() if v !='str'}

In [39]:
# One canonical name per field: the 15/16 name where present, otherwise the 11/12 name.
field_mappings['col_superset'] = field_mappings['15_16_field'].combine_first(field_mappings['11_12_field'])
# {canonical field name -> value of the 'type' column}.
type_map = pd.Series(field_mappings.type.values, index=field_mappings.col_superset).to_dict()
# Same mapping restricted to entries whose type is not the string 'str'.
numeric_type_map = {k:v for k,v in type_map.items() if v !='str'}

In [40]:
# Lookup table: canonical field code (index) -> 'description' column.
# The description merges further down join against this frame on its index.
descriptions = field_mappings[['col_superset', 'description']].set_index('col_superset')

In [41]:
print(len(descriptions))

230


In [42]:
# Columns that read_csv must parse as text; passed as dtype= when the
# per-year CSV files are loaded. Note that the 'SCH_ZIP ' key carries a
# trailing space.
universal_types = {
    'SCH_ZIP ': str,
    'SCHID': str,
    'COMBOKEY': str,
    'LEAID': str,
}

### Create helper functions to aggregate dataframes and label coded columns

In [82]:
def aggregate_data(frame_array, desired_fields):
    """Combine per-topic school tables into one wide frame keyed by COMBOKEY.

    INPUTS

    frame_array (list of DataFrames): tables that each contain a 'COMBOKEY'
        column plus any number of coded data columns
    desired_fields (list-like of str): column names to keep; must include
        'COMBOKEY'

    Cleaning applied to every table before combining:
      * negative numeric values become 0
      * the string '<=2' becomes '1' (following ProPublica convention)
      * the symbol '‡' becomes a missing value

    RETURNS

    DataFrame indexed by COMBOKEY (as str) with one column per distinct field
    name, columns sorted by name. When the same field name occurs in several
    tables, the last non-null value per school is kept. The input frames are
    not modified.
    """
    clean_frames = []

    for df in frame_array:
        # Explicit copy: the clean-up below must neither touch the caller's
        # frame nor depend on whether pandas hands back a view.
        subset = df.loc[:, df.columns.isin(desired_fields)].copy()

        # replace negative numbers with 0 (NaN stays NaN)
        numeric_cols = subset.select_dtypes(include='number').columns
        if len(numeric_cols):
            subset[numeric_cols] = subset[numeric_cols].clip(lower=0)

        # round <=2 to 1, following ProPublica convention
        subset = subset.replace('<=2', '1')
        # Blank out the misc symbol with an explicit NaN. replace('‡', None)
        # is avoided: on some pandas versions a None value means "pad from
        # the previous row", which would copy another school's number in.
        subset = subset.replace('‡', float('nan'))

        subset['COMBOKEY'] = subset['COMBOKEY'].astype(str)
        clean_frames.append(subset.set_index('COMBOKEY'))

    combined = pd.concat(clean_frames, axis=1, sort=True)

    # Columns shared by several files (e.g. identifiers) now appear more than
    # once. Collapse them BY NAME, keeping the last non-null value; comparing
    # by content alone would also discard distinct fields that merely happen
    # to hold identical values.
    return combined.T.groupby(level=0).last().T

In [83]:
def join_col_descriptions(agg_data, year):
    '''
    Attach human-readable descriptions to coded columns and save the result.

    INPUTS

    agg_data (DataFrame): Aggregated data with coded fields as columns
    year (STR): year range of data e.g. '2015-16'

    Relies on the module-level `descriptions` frame (field code -> description).
    Columns with no entry in `descriptions` are dropped by the inner merge.

    RETURNS

    DataFrame whose columns are (field code, description) pairs plus a
    ('YEAR', 'School Year') column holding `year`. The same frame is also
    written to output/final_data_{year}.csv.
    '''
    by_field = agg_data.T
    described = pd.merge(by_field, descriptions, how='inner',
                         left_index=True, right_index=True)
    labelled = described.set_index('description', append=True)
    final = labelled.T
    final[('YEAR', 'School Year')] = year
    out_path = f'~/Desktop/discriminology/output/final_data_{year}.csv'
    final.to_csv(out_path)
    return final
    

### Load all data + col description files for 2011-12

In [84]:
## Reads originals from excel

# df1_1112, df1_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/05 - Overall Enrollment.xlsx', sheet_name=None).values()
# df2_1112, df2_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/08 - Students enrolled in Gifted-Talented Programs.xlsx', sheet_name=None).values()
# df3_1112, df3_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/10-1 - Students with Disabilities Served under IDEA Enrollment.xlsx', sheet_name=None).values()
# df4_1112, df4_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/10-2 - Students with Disabilities Served under 504 Enrollment.xlsx', sheet_name=None).values()
# df5_1112, df5_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/W:O Disabilities/35-3 - Students WO Disab Receiving only one out-of-school suspension.xlsx', sheet_name=None).values()
# df6_1112, df6_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/W:O Disabilities/35-4 - Students WO Disab Rec more than one out-of-school suspension.xlsx', sheet_name=None).values()
# df7_1112, df7_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/With Disabilities/36-3 - Students With Disabilities Receiving only one out-of-school suspension.xlsx', sheet_name=None).values()
# df8_1112, df8_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Out of School Suspensions/With Disabilities/36-4 - Students With Disab Receiving more than one out-of-school suspension.xlsx', sheet_name=None).values()
# df9_1112, df9_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Academic/Advanced Placement/17 - Students who are taking at least one AP course.xlsx', sheet_name=None).values()
# df10_1112, df10_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Referral to law enforcement/W:O Disabilities/35-8 - Students Without Disabilities Referral to law enforcement.xlsx', sheet_name=None).values()
# df11_1112, df11_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/Referral to law enforcement/With Disabilities/36-8 - Students With Disabilities Referral to law enforcement.xlsx', sheet_name=None).values()
# df12_1112, df12_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/School Related Arrest/W:O Disabilities/35-9 - Students Without Disabilities School-related arrest.xlsx', sheet_name=None).values()
# df13_1112, df13_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Discipline/School Related Arrest/With Disabilities/36-9 - Students With Disabilities School-related arrest.xlsx', sheet_name=None).values()
# df14_1112, df14_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/School Characteristics/02 - School Characteristics.xlsx', sheet_name=None).values()
# df15_1112, df15_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Staff/08-1 School Support and Security Staff (required elements).xlsx', sheet_name=None).values()
# df16_1112, df16_desc_1112 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2011:12/Enrollment/06 - Enrolled in Early Childhood and Prekindergarten.xlsx', sheet_name=None).values()

In [85]:
# ctr = 0
# for frame in desc_frames_1112:
#     frame.to_csv(f'/Users/cave/Desktop/discriminology/2011_12/descriptions/file_{ctr}.csv')
#     ctr += 1

# ctr = 0
# for frame in frames_1314:
#     frame.to_csv(f'~/Desktop/discriminology/2013_14/data/file_{ctr}.csv')
#     ctr += 1


In [47]:
df1_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_1.csv', dtype=universal_types)
df2_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_2.csv', dtype=universal_types)
df3_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_3.csv', dtype=universal_types)
df4_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_4.csv', dtype=universal_types)
df5_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_5.csv', dtype=universal_types)
df6_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_6.csv', dtype=universal_types)
df7_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_7.csv', dtype=universal_types)
df8_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_8.csv', dtype=universal_types)
df9_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_9.csv', dtype=universal_types)
df10_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_10.csv', dtype=universal_types)
df11_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_11.csv', dtype=universal_types)
df12_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_12.csv', dtype=universal_types)
df13_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_13.csv', dtype=universal_types)
df14_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_14.csv', dtype=universal_types)
df15_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_15.csv', dtype=universal_types)
df16_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/data/file_0.csv', dtype=universal_types)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [48]:
df1_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_1.csv', index_col=0)
df2_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_2.csv', index_col=0)
df3_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_3.csv', index_col=0)
df4_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_4.csv', index_col=0)
df5_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_5.csv', index_col=0)
df6_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_6.csv', index_col=0)
df7_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_7.csv', index_col=0)
df8_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_8.csv', index_col=0)
df9_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_9.csv', index_col=0)
df10_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_10.csv', index_col=0)
df11_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_11.csv', index_col=0)
df12_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_12.csv', index_col=0)
df13_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_13.csv', index_col=0)
df14_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_14.csv', index_col=0)
df15_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_15.csv', index_col=0)
df16_desc_1112 = pd.read_csv('~/Desktop/discriminology/2011_12/descriptions/file_0.csv', index_col=0)

In [49]:

frames_1112 = [df1_1112, df2_1112, df3_1112, df4_1112, df5_1112, df6_1112,
               df7_1112, df8_1112, df9_1112, df10_1112, df11_1112, df12_1112,
               df13_1112, df14_1112, df15_1112, df16_1112]



desc_frames_1112 = [df1_desc_1112, df2_desc_1112, df3_desc_1112, df4_desc_1112,
                    df5_desc_1112, df6_desc_1112, df7_desc_1112, df8_desc_1112,
                    df9_desc_1112, df10_desc_1112, df11_desc_1112, df12_desc_1112,
                    df13_desc_1112, df14_desc_1112, df15_desc_1112, df16_desc_1112]


Use the helper function to aggregate the frames and clean up negative values, then standardize the length of zip codes.

In [92]:
df2_1112.head()

Unnamed: 0.1,Unnamed: 0,LEA_STATE,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,M_AME_7_IDEA,M_ASI_7_IDEA,...,F_AME_7_IDEA,F_ASI_7_IDEA,F_HIS_7_IDEA,F_BLA_7_IDEA,F_WHI_7_IDEA,F_HI_PAC_7_IDEA,F_2_OR_MORE_7_IDEA,F_TOT_7_IDEA,F_LEP_7_IDEA,Incomplete
0,0,AL,100002,ALABAMA YOUTH SERVICES,1705,WALLACE SCH - MT MEIGS CAMPUS,10000201705,Z,<=2,<=2,...,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,
1,1,AL,100002,ALABAMA YOUTH SERVICES,1706,MCNEEL SCH - VACCA CAMPUS,10000201706,Z,<=2,<=2,...,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,
2,2,AL,100002,ALABAMA YOUTH SERVICES,99995,AUTAUGA CAMPUS,10000299995,X,<=2,<=2,...,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,
3,3,AL,100002,ALABAMA YOUTH SERVICES,99996,WALLACE ANNEX III,10000299996,Z,<=2,<=2,...,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,<=2,
4,4,AL,100005,ALBERTVILLE CITY,870,ALBERTVILLE MIDDLE SCHOOL,10000500870,,<=2,<=2,...,<=2,<=2,<=2,<=2,13,<=2,<=2,13,<=2,


In [95]:
df_1314_raw[df_1314_raw['LEAID']=='0100005']['SCH_IDEAENR_AM_M']

010000500870    0
010000500871    0
010000500879    0
010000500889    0
010000501616    0
010000502150    0
Name: SCH_IDEAENR_AM_M, dtype: object

In [50]:
# Careful, this cell runs for a while
df_1112_raw = aggregate_data(frames_1112, fields_1112)

In [90]:
df_1112 = df_1112_raw.copy()

In [91]:
[x for x in df_1112.columns if 'IDEA' in x]

['F_2_OR_MORE_7_IDEA',
 'F_AME_7_IDEA',
 'F_ASI_7_IDEA',
 'F_BLA_7_IDEA',
 'F_HIS_7_IDEA',
 'F_HI_PAC_7_IDEA',
 'F_TOT_7_IDEA',
 'F_TOT_IDEA_7_ARREST_DIS',
 'F_TOT_IDEA_7_LAW_DIS',
 'F_TOT_IDEA_7_MULT_SUS_DIS',
 'F_TOT_IDEA_7_SINGLE_SUS_DIS',
 'F_WHI_7_IDEA',
 'M_2_OR_MORE_7_IDEA',
 'M_AME_7_IDEA',
 'M_ASI_7_IDEA',
 'M_BLA_7_IDEA',
 'M_HIS_7_IDEA',
 'M_HI_PAC_7_IDEA',
 'M_TOT_7_IDEA',
 'M_TOT_IDEA_7_ARREST_DIS',
 'M_TOT_IDEA_7_LAW_DIS',
 'M_TOT_IDEA_7_MULT_SUS_DIS',
 'M_TOT_IDEA_7_SINGLE_SUS_DIS',
 'M_WHI_7_IDEA']

In [52]:
df_1112['SCH_ZIP '] = df_1112['SCH_ZIP '].str.zfill(5) # pad zipcodes with leading zeroes

Change 0/1 indicator vars to Yes / No for selected columns

In [53]:
int_cols_to_str = [
    'PreK'
    ,'K'
    ,'G1'
    ,'G2'
    ,'G3'
    ,'G4'
    ,'G5'
    ,'G6'
    ,'G7'
    ,'G8' 
    ,'G9'
    ,'G10'
    ,'G11'
    ,'G12'
    ,'MG_SCH'
    ,'CHARTER_SCH'
    ,'ALT_SCH'
                  ]

In [54]:
df_1112[int_cols_to_str] =  df_1112[int_cols_to_str].replace('0', 'No').replace('1', 'Yes').replace(0, 'No').replace(1, 'Yes')

In [55]:
df_1112['SCH_FTESECURITY_IND'] = df_1112['SCH_FTESECURITY_IND'].str.replace('-9','No').str.replace('-5','No')

Make types consistent within each column

In [56]:
# The default keeps this cell re-runnable: a second run no longer raises
# KeyError once the key is gone. final_type_dict_11_12 was built before this
# pop and only holds entries whose type is not 'str'.
type_dict_11_12.pop('COMBOKEY', None)

'str'

In [57]:
df_1112 = df_1112.astype(final_type_dict_11_12)

Custom case handling: Juvenile Justice facilities

In [58]:
df_1112['JJ'] = df_1112['JJ'].str.replace('Z', 'Yes').str.replace('X', 'Yes')

Rename columns to match 13/14 and 15/16 data

In [59]:
df_1112.rename(columns=map_11_12, inplace=True)

In [60]:
superset = field_mappings['col_superset'].values
ordered_cols_1112 = [c for c in superset if c in df_1112.columns]

In [61]:
df_1112.index.nunique()

101133

In [62]:
df_1112 = df_1112[ordered_cols_1112]
df_1112_final = join_col_descriptions(df_1112, '2011-12')

In [63]:
df_1112_final[::100].to_csv('~/Desktop/discriminology/output/11_12_sample.csv')

In [64]:
df_1112_final.shape

(101133, 220)

### Load all data + col description files for 2013-14

Collect column name descriptions from each spreadsheet

In [65]:
# # Read originals from excel

# df1_1314, df1_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/01 School Characteristics.xlsx', sheet_name=None).values()
# df2_1314, df2_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/03 Enrollment.xlsx', sheet_name=None).values()
# df3_1314, df3_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/04-1 Gifted and Talented Enrollment.xlsx', sheet_name=None).values()
# df4_1314, df4_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/06 Advanced Placement and International Baccalaureate Diploma Programme Enrollment.xlsx', sheet_name=None).values()
# df5_1314, df5_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/07-2 Advanced Placement Exams.xlsx', sheet_name=None).values()
# df6_1314, df6_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/08-1 School Support and Security Staff (required elements).xlsx', sheet_name=None).values()
# df7_1314, df7_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/09-1 Chronic Absenteeism.xlsx', sheet_name=None).values()
# df8_1314, df8_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/11-2 Suspensions (required elements).xlsx', sheet_name=None).values()
# df9_1314, df9_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/11-3 Expulsions.xlsx', sheet_name=None).values()
# df10_1314, df10_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/12 Student Referrals and Arrests.xlsx', sheet_name=None).values()
# df11_1314, df11_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/16 School Expenditures.xlsx', sheet_name=None).values()
# df12_1314, df12_desc_1314 = pd.read_excel('~/Desktop/discriminology/OCR School data sample/2013:14/CRDC-collected data file for Schools/17 Justice Facilities.xlsx', sheet_name=None).values()

In [66]:
# ctr = 0
# for frame in desc_frames_1314:
#     frame.to_csv(f'~/Desktop/discriminology/2013_14/descriptions/file_{ctr}.csv')
#     ctr += 1


# ctr = 0
# for frame in frames_1314:
#     frame.to_csv(f'~/Desktop/discriminology/2013_14/data/file_{ctr}.csv')
#     ctr += 1


In [67]:
df1_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_1.csv', dtype=universal_types)
df2_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_2.csv', dtype=universal_types)
df3_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_3.csv', dtype=universal_types)
df4_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_4.csv', dtype=universal_types)
df5_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_5.csv', dtype=universal_types)
df6_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_6.csv', dtype=universal_types)
df7_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_7.csv', dtype=universal_types)
df8_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_8.csv', dtype=universal_types)
df9_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_9.csv', dtype=universal_types)
df10_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_10.csv', dtype=universal_types)
df11_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_11.csv', dtype=universal_types)
df12_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/data/file_0.csv', dtype=universal_types)

In [68]:
df1_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_1.csv', dtype=universal_types, index_col=0)
df2_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_2.csv', dtype=universal_types, index_col=0)
df3_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_3.csv', dtype=universal_types, index_col=0)
df4_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_4.csv', dtype=universal_types, index_col=0)
df5_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_5.csv', dtype=universal_types, index_col=0)
df6_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_6.csv', dtype=universal_types, index_col=0)
df7_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_7.csv', dtype=universal_types, index_col=0)
df8_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_8.csv', dtype=universal_types, index_col=0)
df9_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_9.csv', dtype=universal_types, index_col=0)
df10_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_10.csv', dtype=universal_types, index_col=0)
df11_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_11.csv', dtype=universal_types, index_col=0)
df12_desc_1314 = pd.read_csv('~/Desktop/discriminology/2013_14/descriptions/file_0.csv', dtype=universal_types, index_col=0)


In [69]:
frames_1314 = [df1_1314, df2_1314, df3_1314, df4_1314, df5_1314,
               df6_1314, df7_1314, df8_1314, df9_1314, df10_1314,
               df11_1314, df12_1314]


desc_frames_1314 = [df1_desc_1314, df2_desc_1314, df3_desc_1314,
                    df4_desc_1314, df5_desc_1314, df6_desc_1314,
                    df7_desc_1314, df8_desc_1314, df9_desc_1314,
                    df10_desc_1314, df11_desc_1314, df12_desc_1314]


In [70]:
df_1314_raw = aggregate_data(frames_1314, fields_1314)

In [71]:
df_1314 = df_1314_raw.copy()

In [73]:
df_1314['SCH_FTESECURITY_IND'] = df_1314['SCH_FTESECURITY_IND'].str.replace('-9','No').str.replace('-5','No')
# Turn the remaining '-5' / '-9' strings into real missing values.
# replace(x, None) is avoided on purpose: depending on the pandas version a
# None value means "pad from the previous row" rather than "insert a null".
df_1314 = df_1314.replace(['-5', '-9'], float('nan'))

In [74]:
df_1314.index.nunique()

95507

In [75]:
df_1314_final = join_col_descriptions(df_1314, '2013-14')

In [76]:
df_1314_final[::100].to_csv('~/Desktop/discriminology/output/13_14_sample.csv')

### Get column descriptions for the 2015-16 data

In [78]:
col_descr_1516 = pd.read_excel('/Users/cave/Desktop/discriminology/OCR School data sample/2015:16/CRDC 2015-16 School Data Record Layout copy.xlsx', index_col='Field_Name')
decoded_names = pd.DataFrame(col_descr_1516.loc[fields_1516]['Field_Description'])
decoded_names.columns = ['description']

In [79]:
decoded_names.reset_index().to_csv('/Users/cave/Desktop/2015_16_field_descriptions.csv')

### Isolate numeric columns in 15/16 and replace negative numbers with zeroes.

In [81]:
df1516_raw = pd.read_csv('/Users/cave/Desktop/discriminology/OCR School data sample/2015:16/CRDC 2015-16 School Data copy.csv'
                      , encoding='iso-8859-1'
                     )
df1516_raw['LEAID'] = df1516_raw['LEAID'].astype(str).str.zfill(7)
df1516_raw['COMBOKEY'] = df1516_raw['LEAID'] + df1516_raw['SCHID'].astype(str).str.zfill(5)
df1516_raw = df1516_raw[fields_1516]

### Replace negative values with zeroes.

In [99]:
# Clamp negative numeric values to zero (NaN stays NaN). The result is
# assigned back by column label instead of mutating the frame returned by the
# private _get_numeric_data(), so the change is guaranteed to land in
# df1516_raw whether or not pandas returns a view.
numeric_cols = df1516_raw.select_dtypes(include='number').columns
num = df1516_raw[numeric_cols].clip(lower=0)
if len(numeric_cols):
    df1516_raw[numeric_cols] = num

In [100]:
df1516_raw = df1516_raw.replace('-5', 'No').replace('-9', 'No')

In [101]:
df1516_raw.set_index('COMBOKEY', inplace=True)

In [103]:
# Pair every coded column with its description (default inner merge on the
# field code), producing two-level (code, description) column labels, then
# tag each row with its school year.
df_1516_final = (
    pd.merge(df1516_raw.T, descriptions, left_index=True, right_index=True)
    .set_index('description', append=True)
    .T
)
df_1516_final[('YEAR', 'School Year')] = '2015-16'

In [104]:
df_1516_final[::100].to_csv('~/Desktop/discriminology/output/15_16_sample.csv')
df_1516_final.to_csv('~/Desktop/discriminology/output/final_data_2015-16.csv')

### Concatenate all three years of data together

In [105]:
df_1112_raw['LEAID'].nunique()

17342

In [106]:
df_1314_raw['LEAID'].nunique()

16758

In [107]:
df1516_raw['LEAID'].nunique()

17337

In [243]:
full_table = pd.concat([df_1516_final, df_1314_final, df_1112_final], axis=0)

In [244]:
full_table = full_table.astype(numeric_type_map)

In [245]:
full_table['LEA_NAME', 'District Name'] = full_table['LEA_NAME', 'District Name'].str.title()
full_table['SCH_NAME', 'School Name'] = full_table['SCH_NAME', 'School Name'].str.title()
full_table['SCH_ADDRESS', 'School address'] = full_table['SCH_ADDRESS', 'School address'].str.title()
full_table['SCHID', '5 Digit School Identification Code'] = full_table['SCHID', '5 Digit School Identification Code'].astype(str).str.zfill(5)

In [246]:
full_table.to_csv('/Users/cave/Desktop/discriminology/output/full_table_all_years.csv')

In [247]:
full_table.sort_index(inplace=True)

In [262]:
def categorize_school_level(SCH_GRADE_KG, SCH_GRADE_G01,
                            SCH_GRADE_G02, SCH_GRADE_G03, SCH_GRADE_G04, SCH_GRADE_G05,
                            SCH_GRADE_G06, SCH_GRADE_G07, SCH_GRADE_G08, SCH_GRADE_G09,
                            SCH_GRADE_G10, SCH_GRADE_G11, SCH_GRADE_G12):
    """Label a school by the grades it offers.

    Each argument is the 'Yes'/'No' indicator for one grade. It may be a
    plain string, or a one-element container (list, tuple, or the
    one-element Series obtained when a row of a two-level-column frame is
    indexed by field code). Anything that is not 'Yes', including missing
    values, counts as "grade not offered".

    Rules, checked in this order:
      * 'Elementary School': at least two of K-6 offered and neither G7 nor G8
      * 'Middle School': at least three of G5-G9 offered, or G6 offered
      * 'High School': at least three of G9-G12 offered, or G9 offered
      * 'Other': none of the above

    Returns one of the four label strings.
    """

    def _offered(flag):
        # Plain strings are compared whole; indexing them would compare only
        # the first character.
        if isinstance(flag, str):
            return flag == 'Yes'
        try:
            # .iloc gives positional access on a Series regardless of how it
            # is labelled; other containers fall back to ordinary [0].
            first = flag.iloc[0] if hasattr(flag, 'iloc') else flag[0]
        except (TypeError, IndexError, KeyError):
            # NaN / None / empty container: treat as not offered.
            return False
        return bool(first == 'Yes')

    kg, g01, g02, g03, g04, g05, g06, g07, g08, g09, g10, g11, g12 = (
        _offered(flag) for flag in (
            SCH_GRADE_KG, SCH_GRADE_G01, SCH_GRADE_G02, SCH_GRADE_G03,
            SCH_GRADE_G04, SCH_GRADE_G05, SCH_GRADE_G06, SCH_GRADE_G07,
            SCH_GRADE_G08, SCH_GRADE_G09, SCH_GRADE_G10, SCH_GRADE_G11,
            SCH_GRADE_G12)
    )

    if sum([kg, g01, g02, g03, g04, g05, g06]) >= 2 and not (g07 or g08):
        return 'Elementary School'

    if sum([g05, g06, g07, g08, g09]) >= 3 or g06:
        return 'Middle School'

    if sum([g09, g10, g11, g12]) >= 3 or g09:
        return 'High School'

    return 'Other'

In [None]:
# Classify every row by the grades it offers. The resulting Series is only
# displayed here; it is not stored back on full_table.
full_table.apply(
    lambda row: categorize_school_level(
        *(row[grade] for grade in (
            'SCH_GRADE_KG', 'SCH_GRADE_G01', 'SCH_GRADE_G02', 'SCH_GRADE_G03',
            'SCH_GRADE_G04', 'SCH_GRADE_G05', 'SCH_GRADE_G06', 'SCH_GRADE_G07',
            'SCH_GRADE_G08', 'SCH_GRADE_G09', 'SCH_GRADE_G10', 'SCH_GRADE_G11',
            'SCH_GRADE_G12',
        ))
    ),
    axis=1,
)

In [663]:
full_table[:2000].to_csv('/Users/cave/Desktop/discriminology/output/full_table_sample.csv')

### Group by districts and sum over fields

In [563]:
# full_table = pd.read_csv('/Users/cave/Desktop/discriminology/output/full_table_all_years.csv', header=[0,1], index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [632]:
district = full_table.copy()

In [636]:
district.columns = district.columns.droplevel(1)
district = district.astype(numeric_type_map)

In [689]:
district['TOTAL_SCHOOLS'] = 1.0
district['TOTAL_ENROLLMENT'] = district['TOT_ENR_F'] + district['TOT_ENR_M']

In [690]:
agg_cols = ['LEA_NAME', 'LEA_STATE_NAME', 'TOTAL_SCHOOLS', 'TOTAL_ENROLLMENT',
'SCH_DISCWODIS_MULTOOS_HI_M',
 'SCH_DISCWDIS_ARR_IDEA_AM_M',
 'SCH_GTENR_IDEA_F',
 'SCH_DISCWODIS_SINGOOS_HP_F',
 'SCH_FTECOUNSELORS',
 'SCH_DISCWODIS_ARR_WH_M',
 'TOT_DISCWDIS_ARR_IDEA_M',
 'SCH_GTENR_TR_M',
 'SCH_DISCWODIS_MULTOOS_AS_M',
 'SCH_APENR_AM_F',
 'SCH_DISCWODIS_REF_BL_F',
 'SCH_DISCWODIS_ARR_AM_F',
 'TOT_DISCWODIS_REF_M',
 'SCH_IDEAENR_HI_M',
 'SCH_FTESERVICES_PSY',
 'SCH_DISCWDIS_MULTOOS_IDEA_BL_F',
 'TOT_APENR_F',
 'SCH_DISCWODIS_ARR_WH_F',
 'SCH_IDEAENR_HP_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_TR_F',
 'SCH_DISCWDIS_REF_IDEA_WH_F',
 'SCH_DISCWODIS_MULTOOS_WH_M',
 'SCH_FTESECURITY_GUA',
 'SCH_DISCWODIS_SINGOOS_TR_M',
 'SCH_DISCWDIS_REF_IDEA_WH_M',
 'SCH_DISCWODIS_ARR_TR_M',
 'SCH_DISCWDIS_ARR_IDEA_BL_F',
 'SCH_DISCWDIS_ARR_IDEA_TR_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_BL_M',
 'SCH_DISCWODIS_ARR_AM_M',
 'SCH_DISCWDIS_ARR_IDEA_AS_M',
 'SCH_DISCWDIS_REF_IDEA_BL_M',
 'SCH_GTENR_IDEA_M',
 'TOT_DISCWODIS_MULTOOS_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_HI_F',
 'TOT_DISCWODIS_SINGOOS_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_AM_F',
 'SCH_DISCWODIS_MULTOOS_TR_F',
 'TOT_DISCWODIS_SINGOOS_M',
 'SCH_ENR_HI_M',
 'SCH_APENR_HI_F',
 'SCH_IDEAENR_HI_F',
 'SCH_DISCWDIS_REF_IDEA_BL_F',
 'SCH_DISCWODIS_MULTOOS_AM_F',
 'TOT_DISCWDIS_MULTOOS_IDEA_M',
 'SCH_DISCWDIS_REF_IDEA_AS_M',
 'SCH_GTENR_HI_M',
 'SCH_DISCWODIS_REF_HI_M',
 'SCH_FTESERVICES_SOC',
 'SCH_DISCWDIS_SINGOOS_IDEA_AM_F',
 'SCH_DISCWODIS_REF_TR_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_HP_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_BL_M',
 'TOT_ENR_F',
 'SCH_GTENR_AS_F',
 'SCH_GTENR_HP_F',
 'SCH_ENR_BL_F',
 'SCH_DISCWODIS_SINGOOS_BL_M',
 'SCH_DISCWODIS_SINGOOS_TR_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_WH_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_AS_F',
 'SCH_DISCWODIS_SINGOOS_HP_M',
 'SCH_GTENR_WH_M',
 'SCH_DISCWODIS_REF_AM_F',
 'SCH_DISCWODIS_SINGOOS_AS_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_HI_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_HI_M',
 'SCH_DISCWODIS_SINGOOS_HI_M',
 'SCH_APENR_TR_M',
 'SCH_DISCWODIS_REF_HI_F',
 'SCH_DISCWODIS_ARR_BL_F',
 'SCH_ENR_TR_F',
 'SCH_DISCWODIS_SINGOOS_BL_F',
 'SCH_DISCWODIS_ARR_HP_F',
 'SCH_APENR_AS_M',
 'SCH_GTENR_LEP_M',
 'SCH_DISCWDIS_REF_IDEA_HI_F',
 'SCH_DISCWODIS_MULTOOS_BL_F',
 'SCH_DISCWODIS_REF_HP_M',
 'SCH_APENR_BL_F',
 'SCH_IDEAENR_AS_M',
 'TOT_DISCWODIS_ARR_M',
 'SCH_DISCWODIS_SINGOOS_HI_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_WH_F',
 'SCH_IDEAENR_AM_F',
 'TOT_APENR_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_TR_M',
 'SCH_DISCWODIS_REF_TR_F',
 'SCH_DISCWDIS_REF_IDEA_AM_M',
 'SCH_GTENR_BL_F',
 'SCH_IDEAENR_TR_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_AM_M',
 'SCH_GTENR_HI_F',
 'SCH_DISCWODIS_ARR_AS_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_TR_F',
 'TOT_DISCWODIS_MULTOOS_F',
 'SCH_DISCWODIS_REF_AS_M',
 'SCH_DISCWDIS_ARR_IDEA_WH_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_BL_F',
 'TOT_DISCWDIS_SINGOOS_IDEA_M',
 'SCH_APENR_HI_M',
 'SCH_DISCWODIS_SINGOOS_WH_F',
 'SCH_DISCWDIS_ARR_IDEA_HP_M',
 'SCH_IDEAENR_AM_M',
 'SCH_APENR_AM_M',
 'SCH_GTENR_TR_F',
 'TOT_GTENR_F',
 'SCH_DISCWODIS_SINGOOS_AM_F',
 'SCH_APENR_WH_M',
 'SCH_DISCWODIS_REF_AM_M',
 'TOT_DISCWODIS_REF_F',
 'SCH_IDEAENR_BL_F',
 'SCH_DISCWODIS_SINGOOS_AS_M',
 'TOT_DISCWDIS_SINGOOS_IDEA_F',
 'SCH_DISCWODIS_ARR_HI_M',
 'SCH_DISCWDIS_ARR_IDEA_HP_F',
 'SCH_DISCWDIS_REF_IDEA_HI_M',
 'SCH_ENR_TR_M',
 'SCH_GTENR_BL_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_AM_M',
 'SCH_DISCWODIS_ARR_HI_F',
 'SCH_IDEAENR_AS_F',
 'SCH_DISCWDIS_ARR_IDEA_HI_F',
 'SCH_APENR_HP_F',
 'TOT_IDEAENR_M',
 'SCH_GTENR_AM_M',
 'SCH_ENR_BL_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_AS_M',
 'SCH_ENR_AM_M',
 'SCH_DISCWODIS_MULTOOS_WH_F',
 'SCH_ENR_HP_M',
 'SCH_IDEAENR_BL_M',
 'SCH_IDEAENR_TR_M',
 'SCH_APENR_WH_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_AS_F',
 'SCH_ENR_AM_F',
 'SCH_DISCWDIS_REF_IDEA_TR_M',
 'SCH_DISCWODIS_REF_AS_F',
 'SCH_DISCWDIS_ARR_IDEA_AM_F',
 'SCH_ENR_WH_M',
 'SCH_DISCWODIS_REF_BL_M',
 'SCH_DISCWODIS_MULTOOS_AM_M',
 'SCH_DISCWDIS_ARR_IDEA_HI_M',
 'SCH_DISCWODIS_REF_HP_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_AS_M',
 'SCH_DISCWODIS_ARR_AS_M',
 'SCH_IDEAENR_WH_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_HP_F',
 'TOT_DISCWDIS_REF_IDEA_M',
 'SCH_FTESECURITY_LEO',
 'SCH_DISCWODIS_MULTOOS_TR_M',
 'SCH_IDEAENR_HP_F',
 'SCH_DISCWODIS_MULTOOS_HI_F',
 'SCH_DISCWDIS_ARR_IDEA_WH_F',
 'SCH_GTENR_HP_M',
 'SCH_DISCWDIS_MULTOOS_IDEA_TR_M',
 'SCH_DISCWDIS_ARR_IDEA_AS_F',
 'SCH_DISCWODIS_REF_WH_M',
 'SCH_DISCWODIS_ARR_BL_M',
 'TOT_IDEAENR_F',
 'TOT_DISCWDIS_ARR_IDEA_F',
 'SCH_DISCWODIS_MULTOOS_AS_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_WH_M',
 'SCH_APENR_TR_F',
 'SCH_ENR_WH_F',
 'SCH_DISCWODIS_REF_WH_F',
 'SCH_APENR_HP_M',
 'SCH_DISCWODIS_MULTOOS_BL_M',
 'SCH_GTENR_AM_F',
 'TOT_DISCWDIS_MULTOOS_IDEA_F',
 'SCH_DISCWDIS_REF_IDEA_HP_M',
 'SCH_ENR_HI_F',
 'SCH_DISCWDIS_REF_IDEA_HP_F',
 'TOT_ENR_M',
 'SCH_ENR_AS_F',
 'SCH_DISCWDIS_REF_IDEA_AM_F',
 'SCH_DISCWODIS_ARR_HP_M',
 'SCH_APENR_AS_F',
 'TOT_DISCWODIS_ARR_F',
 'SCH_ENR_AS_M',
 'SCH_GTENR_WH_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_HP_F',
 'SCH_GTENR_LEP_F',
 'SCH_IDEAENR_WH_F',
 'SCH_DISCWODIS_SINGOOS_AM_M',
 'SCH_DISCWODIS_SINGOOS_WH_M',
 'SCH_DISCWDIS_ARR_IDEA_TR_F',
 'SCH_ENR_HP_F',
 'SCH_DISCWODIS_MULTOOS_HP_M',
 'SCH_DISCWDIS_REF_IDEA_TR_F',
 'SCH_GTENR_AS_M',
 'SCH_DISCWDIS_SINGOOS_IDEA_WH_M',
 'TOT_GTENR_M',
 'SCH_DISCWODIS_ARR_TR_F',
 'SCH_DISCWDIS_SINGOOS_IDEA_HP_M',
 'SCH_DISCWODIS_MULTOOS_HP_F',
 'SCH_DISCWDIS_ARR_IDEA_BL_M',
 'SCH_DISCWDIS_REF_IDEA_AS_F',
 'SCH_APENR_BL_M',
 'TOT_DISCWDIS_REF_IDEA_F',
 'SCH_DISCWDIS_MULTOOS_IDEA_HI_M']

Custom aggregation dictionary: for string fields (`LEA_NAME`, `LEA_STATE_NAME`), take the first non-null value; for numeric fields, take the sum.

In [691]:
# Per-column aggregation spec for the district/year groupby.
# The two descriptive string columns keep their first non-null value; every
# other column in agg_cols is summed. The string 'sum' is used instead of the
# Python builtin `sum`: pandas deprecates passing the builtin (it currently
# substitutes its own groupby sum and warns that this will stop).
string_cols = {'LEA_NAME', 'LEA_STATE_NAME'}
agg_dict = {col: 'first' if col in string_cols else 'sum' for col in agg_cols}

In [692]:
# A dict silently collapses repeated keys, so equal lengths mean agg_cols has no duplicates.
len(agg_cols) == len(agg_dict)

True

In [693]:
# Collapse the school-level rows to one row per district (LEAID) and school
# year, restore the agg_cols column order, turn the group keys back into
# ordinary columns, and order the rows by district then year.
grouped_by_distyr = (
    district
    .groupby(['LEAID', 'YEAR'])[agg_cols]
    .agg(agg_dict)
    .loc[:, agg_cols]
    .reset_index()
    .sort_values(by=['LEAID', 'YEAR'])
)

Fill in missing district states for 11/12 and 13/14 using the 15/16 value

In [694]:
# Fill missing state names without leaking values between unrelated districts.
# A plain column-wide backfill copies the *next row's* state into a gap even
# when that row belongs to a different district (and possibly another state).
#
# Step 1: backfill inside each district only. Rows are already ordered by
#         LEAID then YEAR, so an earlier year picks up that district's later value.
state_filled = grouped_by_distyr.groupby('LEAID')['LEA_STATE_NAME'].bfill()

# Step 2: districts with no state name in any year fall back to the name used by
#         other districts sharing the same two-character LEAID prefix.
# NOTE(review): assumes LEAID is a zero-padded string whose first two characters
# identify the state (NCES convention) -- confirm for this data.
leaid_prefix = grouped_by_distyr['LEAID'].str[:2]
state_by_prefix = state_filled.groupby(leaid_prefix).first()

grouped_by_distyr['LEA_STATE_NAME'] = state_filled.fillna(leaid_prefix.map(state_by_prefix))

Read geodata from file and join to district aggregation.

In [174]:
# 2015/16 EDGE geocode files. The identifier columns are read as strings so
# they are not parsed as numbers.
sch_geo_1516 = pd.read_excel(
    '/Users/cave/Desktop/discriminology/Geocoded Schools:Districts/2015:16/Schools_EDGE_GEOCODE_PUBLICSCH_1516.xlsx',
    dtype={'NCESSCH': str},
)
dist_geo_1516 = pd.read_excel(
    '/Users/cave/Desktop/discriminology/Geocoded Schools:Districts/2015:16/Districts_EDGE_GEOCODE_PUBLICLEA_1516.xlsx',
    dtype={'LEAID': str},
)


In [175]:
# Left-pad the identifiers with zeros to their fixed widths
# (12 characters for NCESSCH, 7 for LEAID).
for _frame, _id_col, _width in ((sch_geo_1516, 'NCESSCH', 12), (dist_geo_1516, 'LEAID', 7)):
    _frame[_id_col] = _frame[_id_col].str.zfill(_width)

# Number of distinct schools in the 2015/16 school geocode file.
sch_geo_1516['NCESSCH'].nunique()

102209

In [176]:
# 2018/19 EDGE geocode files. The identifier columns are read as strings so
# they are not parsed as numbers.
sch_geo_1819 = pd.read_excel(
    '/Users/cave/Desktop/discriminology/Geocoded Schools:Districts/2018:19/Schools_EDGE_GEOCODE_PUBLICSCH_1819.xlsx',
    dtype={'NCESSCH': str},
)
dist_geo_1819 = pd.read_excel(
    '/Users/cave/Desktop/discriminology/Geocoded Schools:Districts/2018:19/Districts_EDGE_GEOCODE_PUBLICLEA_1819.xlsx',
    dtype={'LEAID': str},
)

In [177]:
# Left-pad the identifiers with zeros to their fixed widths
# (12 characters for NCESSCH, 7 for LEAID).
for _frame, _id_col, _width in ((sch_geo_1819, 'NCESSCH', 12), (dist_geo_1819, 'LEAID', 7)):
    _frame[_id_col] = _frame[_id_col].str.zfill(_width)

# Number of distinct schools in the 2018/19 school geocode file.
sch_geo_1819['NCESSCH'].nunique()

102176

In [178]:
# Number of distinct district IDs in the 2018/19 district geocode file.
dist_geo_1819['LEAID'].nunique()

19840

In [179]:
# Number of distinct district IDs in the 2015/16 district geocode file.
dist_geo_1516['LEAID'].nunique()

18862

In [186]:
# Keep only the 2015/16 school rows whose district also appears in the
# 2018/19 district geocode file (inner join on LEAID).
df1516_raw.merge(dist_geo_1819, on='LEAID', how='inner')

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,PCT_CITY13,PCT_SUB21,PCT_SUB22,PCT_SUB23,PCT_TOWN31,PCT_TOWN32,PCT_TOWN33,PCT_RURAL41,PCT_RURAL42,PCT_RURAL43
0,AL,ALABAMA,0100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,Yes,No,No,No,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.00,-2.0,-2.00,-2.0
1,AL,ALABAMA,0100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,Yes,No,No,No,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.00,-2.0,-2.00,-2.0
2,AL,ALABAMA,0100002,Alabama Youth Services,1876,Alabama Youth Services,No,No,No,No,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.00,-2.0,-2.00,-2.0
3,AL,ALABAMA,0100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,Yes,No,No,No,...,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.00,-2.0,-2.00,-2.0
4,AL,ALABAMA,0100005,Albertville City,870,Albertville Middle School,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,100.0,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95355,WY,WYOMING,5680250,Region V BOCES,48,C-Bar-V Ranch,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,100.00,0.0
95356,WY,WYOMING,5680251,Wyoming Department of Family Services,534,Wyoming Girls School,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,36.21,0.0,63.79,0.0
95357,WY,WYOMING,5680251,Wyoming Department of Family Services,538,Wyoming Boys School,Yes,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,36.21,0.0,63.79,0.0
95358,WY,WYOMING,5680252,Youth Emergency Services Inc. - Administration...,350,Youth Emergency Services Inc.,No,No,No,No,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00,100.0,0.00,0.0


In [187]:
# Build the joined frame explicitly instead of reading IPython's `_`
# (last displayed output), which only works when this cell is run immediately
# after the merge cell and breaks under Restart & Run All.
# Inner join: 2015/16 school rows whose district is in the 2018/19 geocode file.
dist_grouped_1516 = pd.merge(df1516_raw, dist_geo_1819, on='LEAID', how='inner')

In [188]:
# Number of distinct districts that survived the inner join with the geocode data.
dist_grouped_1516['LEAID'].nunique()

16553

Districts that are in the school data but not in the geodata (and vice versa).

In [190]:
# Index the 2015/16 school rows by district ID.
# Guarded so the cell can be re-run: once LEAID is the index it is no longer a
# column, and an unconditional set_index would raise KeyError on a second run.
if df1516_raw.index.name != 'LEAID':
    df1516_raw.set_index('LEAID', inplace=True)

In [201]:
# District IDs present on each side (school data is indexed by LEAID here;
# the district geocode frame still has LEAID as a column).
leaids_in_schools = set(df1516_raw.index.values)
leaids_in_geo = set(dist_geo_1516['LEAID'].values)

# Districts that report schools but have no geocode record, and the reverse.
missing_in_geo = list(leaids_in_schools - leaids_in_geo)
in_geo_not_original = list(leaids_in_geo - leaids_in_schools)

In [222]:
# Distinct California districts in the 2015/16 school data (LEAID is the index).
df1516_raw.loc[df1516_raw['LEA_STATE'] == 'CA'].index.nunique()

1557

In [225]:
# Inspect the index: one LEAID entry per school row, so district IDs repeat.
df1516_raw.index

Index(['0100002', '0100002', '0100002', '0100002', '0100005', '0100005',
       '0100005', '0100005', '0100005', '0100005',
       ...
       '5606240', '5606240', '5606240', '5606240', '5680180', '5680250',
       '5680251', '5680251', '5680252', '5680254'],
      dtype='object', name='LEAID', length=96360)

In [224]:
# Distinct California districts in the 2015/16 district geocode file.
dist_geo_1516.loc[dist_geo_1516['LSTATE'] == 'CA', 'LEAID'].nunique()

1168

In [226]:
# How many geocoded districts have no rows in the 2015/16 school data.
len(in_geo_not_original)

2111

In [228]:
# Index the 2015/16 district geocode frame by district ID.
# Guarded so the cell can be re-run: once LEAID is the index it is no longer a
# column, and an unconditional set_index would raise KeyError on a second run.
if dist_geo_1516.index.name != 'LEAID':
    dist_geo_1516.set_index('LEAID', inplace=True)

In [233]:
# Names of geocoded districts whose LSTATE value is 'M'.
# NOTE(review): 'M' looks like a missing-value marker rather than a state -- confirm.
dist_geo_1516.loc[dist_geo_1516['LSTATE'] == 'M', 'NAME']

LEAID
0100193                      St. Marys Home
0100192                      McInnis School
0100191                    Evergreen School
5900004    Arizona Navajo Central Education
5900009    Arizona Navajo South Education L
                         ...               
6300011                    Okinawa District
6300023                      Isles District
6300024             Kaiserslautern District
6300025              Mediterranean District
6300026                      Japan District
Name: NAME, Length: 1374, dtype: object

In [235]:
# Spot check: is this geocoded district ID present in the 2015/16 school data index?
'6300026' in df1516_raw.index

False

In [242]:
# Geocode records for districts that have no rows in the 2015/16 school data
# (requires dist_geo_1516 to be indexed by LEAID).
dist_geo_1516.loc[in_geo_not_original]

Unnamed: 0_level_0,NAME,OPSTFIPS,LSTREE,LCITY,LSTATE,LZIP,LZIP4,STFIP15,CNTY15,NMCNTY15,...,CBSA15,NMCBSA15,CBSATYPE15,CSA15,NMCSA15,NECTA15,NMNECTA15,CD15,SLDL15,SLDU15
LEAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4503470,Spartanburg 80,45,M,M,M,M,M,45,45083,Spartanburg County,...,43900,"Spartanburg, SC",1,273,"Greenville-Spartanburg-Anderson, SC",N,N,4504,032,013
4680280,Northwest Area Schools ED Cooper,46,503 N Main,Isabel,SD,57633,0035,46,46041,Dewey County,...,N,N,N,N,N,N,N,4600,28A,028
2000024,Department of Corrections,20,815 SE Rice Rd.,Topeka,KS,66607,M,20,20177,Shawnee County,...,45820,"Topeka, KS",1,N,N,N,N,2002,057,019
3500159,LA JICARITA COMMUNITY SCHOOL,35,15025 STATE RD 75,PENASCO,NM,87553,M,35,35055,Taos County,...,45340,"Taos, NM",2,N,N,N,N,3503,042,006
0406930,Redington Elementary District,4,130 W Congress St.,Tucson,AZ,85701,M,04,04019,Pima County,...,46060,"Tucson, AZ",1,536,"Tucson-Nogales, AZ",N,N,0403,003,003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0601365,School Project Utility Rate Redu,6,313 W. Winton Ave.,Hayward,CA,94544,1136,06,06001,Alameda County,...,41860,"San Francisco-Oakland-Hayward, CA",1,488,"San Jose-San Francisco-Oakland, CA",N,N,0615,020,010
2200142,Shreveport Charter School Inc.,22,401 West 70th Street,Shreveport,LA,71106,3034,22,22017,Caddo Parish,...,43340,"Shreveport-Bossier City, LA",1,N,N,N,N,2204,003,039
4033360,YALE,40,315 East Chicago Avenue,Yale,OK,74085,3513,40,40119,Payne County,...,44660,"Stillwater, OK",2,N,N,N,N,4003,033,021
3880500,SHEYENNE VALLEY SPECIAL ED UNIT,38,232 3rd St NE,Valley City,ND,58072,M,38,38003,Barnes County,...,N,N,N,N,N,N,N,3800,024,024


In [231]:
# State breakdown of the geocoded districts that are absent from the school data.
dist_geo_1516.loc[in_geo_not_original, 'LSTATE'].value_counts()

M     366
CA    146
VT    135
NH    121
MT     91
PA     91
OH     84
CO     79
AZ     77
ME     69
OK     66
MN     65
VA     58
TX     56
NY     54
ND     49
IN     49
LA     47
MI     45
AR     29
NC     27
WA     25
NJ     24
KS     23
NE     20
GA     20
SD     19
AL     18
WI     17
UT     15
MO     15
OR     15
IA     12
KY     11
DC     11
MA     10
NM      8
CT      8
DE      7
MS      6
WY      5
SC      5
RI      4
ID      3
FL      2
VI      2
PR      1
IL      1
Name: LSTATE, dtype: int64

In [200]:
# Number of districts in the 2015/16 school data with no geocode record.
# Uses the list built earlier instead of recomputing it from
# dist_geo_1516['LEAID']: once LEAID has been moved into that frame's index the
# column lookup raises KeyError.
len(missing_in_geo)

586

In [241]:
# Share of charter vs. non-charter schools among the school rows whose
# district has no geocode record.
charter_flags = df1516_raw.loc[missing_in_geo, 'SCH_STATUS_CHARTER']
charter_flags.value_counts() / len(charter_flags)

Yes    0.877828
No     0.122172
Name: SCH_STATUS_CHARTER, dtype: float64

In [184]:
# Number of district IDs shared by the 2015/16 school data and the district
# geocode file.
# Other cells move LEAID into the index of both frames in place, after which
# frame['LEAID'] raises KeyError; read the IDs from wherever they currently live.
school_leaids = df1516_raw.index if df1516_raw.index.name == 'LEAID' else df1516_raw['LEAID']
geo_leaids = dist_geo_1516.index if dist_geo_1516.index.name == 'LEAID' else dist_geo_1516['LEAID']
len(set(school_leaids) & set(geo_leaids))

16751

In [185]:
# Distinct district IDs in dist_1516.
# NOTE(review): dist_1516 is not created in the visible part of the notebook -- confirm where it comes from.
dist_1516['LEAID'].nunique()

16751

In [108]:
# Spot check: is this 12-character ID present in the df_1516_final index?
# NOTE(review): df_1516_final is not created in the visible part of the notebook -- confirm where it comes from.
'010000201706' in df_1516_final.index

True

In [110]:
# Preview the first rows of the 2015/16 school geocode data.
sch_geo_1516.head()

Unnamed: 0,NCESSCH,NAME,OPSTFIPS,LSTREE,LCITY,LSTATE,LZIP,LZIP4,STFIP15,CNTY15,...,CBSA15,NMCBSA15,CBSATYPE15,CSA15,NMCSA15,NECTA15,NMNECTA15,CD15,SLDL15,SLDU15
0,10000200277,Sequoyah Sch - Chalkville Campus,1,1000 Industrial School Road,Birmingham,AL,35220,M,1,1073,...,13820,"Birmingham-Hoover, AL",1,142,"Birmingham-Hoover-Talladega, AL",N,N,106,44,20
1,10000201667,Camps,1,1601 County Rd. 57,Prattville,AL,36067,M,1,1001,...,33860,"Montgomery, AL",1,N,N,N,N,102,42,30
2,10000201670,Det Ctr,1,2109 Bashi Rd Bldg 509,Thomasville,AL,36784,M,1,1025,...,N,N,N,N,N,N,N,107,68,24
3,10000201705,Wallace Sch - Mt Meigs Campus,1,1000 Industrial School Road,Mount Meigs,AL,36057,M,1,1101,...,33860,"Montgomery, AL",1,N,N,N,N,103,75,25
4,10000201706,McNeel Sch - Vacca Campus,1,8950 Roebuck Blvd,Birmingham,AL,35206,M,1,1073,...,13820,"Birmingham-Hoover, AL",1,142,"Birmingham-Hoover-Talladega, AL",N,N,107,58,20


In [23]:
# District profile workbook: read LEAID as text, then left-pad it with zeros
# to 7 characters.
geo_data = (
    pd.read_excel('/Users/cave/Desktop/discriminology/LEA Profile Info.xlsx', dtype={'LEAID': str})
    .assign(LEAID=lambda frame: frame['LEAID'].str.zfill(7))
)

In [24]:
# Row count of the district profile data.
len(geo_data)

16758

In [25]:
# Distinct district IDs; a value equal to the row count means one row per district.
geo_data['LEAID'].nunique()

16758

In [26]:
# Distinct district IDs in the district/year aggregation.
# Requires grouped_by_distyr to exist in the kernel; the NameError recorded
# below came from running this cell before that frame had been built.
len(set(grouped_by_distyr['LEAID'].values))

NameError: name 'grouped_by_distyr' is not defined

In [701]:
# Number of district IDs found in both the profile data and the district/year aggregation.
len(set(geo_data['LEAID'].values) & set(grouped_by_distyr['LEAID'].values))

16758

In [31]:
# Inspect the last 120 column names of the district profile data.
geo_data.columns[-120:]

Index(['Current Representative Address', 'Current Representative Phone',
       'Current Representative Twitter', 'Current Representative Facebook',
       'Current Representative Source', 'Current Senator #1 Last name',
       'Current Senator #1 First name', 'Current Senator #1 Party',
       'Current Senator #1 Url', 'Current Senator #1 Address',
       ...
       'Mayor Last name', 'Mayor email address', 'Mayor Phone',
       'Mayor Twitter ', 'Alderman #1 First name', 'Alderman #1 Last name',
       'Alderman #1 Ward/legislative district # ', 'Alderman #1 email address',
       'Alderman #1 Phone', 'Alderman #1 Twitter '],
      dtype='object', length=120)

In [702]:
# Keep only the district ID plus the address, size and coordinate columns
# that are joined onto the district/year aggregation.
district_geo = geo_data.loc[
    :,
    [
        'LEAID',
        'LEA_ADDRESS',
        'LEA_CITY',
        'LEA_ZIP',
        'CJJ',
        'LEA_ENR',
        'LEA_SCHOOLS',
        'Latitude',
        'Longitude',
    ],
]

In [703]:
# Preview the selected district profile columns.
district_geo.head()

Unnamed: 0,LEAID,LEA_ADDRESS,LEA_CITY,LEA_ZIP,CJJ,LEA_ENR,LEA_SCHOOLS,Latitude,Longitude
0,100002,1000 INDUSTRIAL SCHOOL ROAD,MT. MEIGS,36057,Yes,3674,3,32.371901,-86.083791
1,100005,107 WEST MAIN STREET,ALBERTVILLE,35950,No,4712,6,34.268415,-86.209216
2,100006,12380 US HIGHWAY 431 S,GUNTERSVILLE,35976,No,5624,14,34.305366,-86.287334
3,100007,2810 METROPOLITAN WAY,HOOVER,35243,No,14054,16,33.445932,-86.750203
4,100008,211 CELTIC DRIVE,MADISON,35758,Yes,9206,11,34.686749,-86.745762


In [704]:
# Left join: every district/year row is kept; profile columns are NaN where
# the district has no profile record.
district_group_w_geo = grouped_by_distyr.merge(district_geo, on='LEAID', how='left')

In [717]:
# Sanity check: total 2011/12 enrollment (female + male) for district 0100002.
lea_rows_1112 = df_1112_raw.loc[df_1112_raw['LEAID'] == '0100002']
(lea_rows_1112['F_TOT_7_ENROL'] + lea_rows_1112['M_TOT_7_ENROL']).sum()

1098.0

In [719]:
# Sanity check: total 2013/14 enrollment (female + male) for district 0100002.
lea_rows_1314 = df_1314_raw.loc[df_1314_raw['LEAID'] == '0100002']
(lea_rows_1314['TOT_ENR_F'] + lea_rows_1314['TOT_ENR_M']).sum()



3702

In [720]:
# Sanity check: total 2015/16 enrollment (female + male) for district 0100002.
# An earlier cell moves LEAID into df1516_raw's index in place, after which
# df1516_raw['LEAID'] raises KeyError; read the IDs from wherever they currently live.
leaid_1516 = df1516_raw.index if df1516_raw.index.name == 'LEAID' else df1516_raw['LEAID']
lea_rows_1516 = df1516_raw[leaid_1516 == '0100002']
(lea_rows_1516['TOT_ENR_F'] + lea_rows_1516['TOT_ENR_M']).sum()



1126

In [726]:
# District/year rows that have a profile enrollment figure, restricted to the
# columns needed to compare aggregated totals against the profile values.
# One .loc call plus an explicit .copy() gives an independent frame, so the
# columns added to totals_compare afterwards do not trigger
# SettingWithCopyWarning or depend on chained-indexing behaviour.
totals_compare = district_group_w_geo.loc[
    district_group_w_geo['LEA_ENR'].notna(),
    ['LEAID', 'YEAR', 'LEA_ENR', 'TOTAL_ENROLLMENT', 'TOTAL_SCHOOLS', 'LEA_SCHOOLS'],
].copy()



In [738]:
# Absolute gap between the aggregated enrollment and the profile figure.
enrollment_gap = totals_compare['TOTAL_ENROLLMENT'] - totals_compare['LEA_ENR']
totals_compare['enrollment_discrepancy'] = enrollment_gap

# Gap as a percentage of aggregated enrollment.
# NOTE(review): the +1 in the denominator presumably guards against division by
# zero for districts with no enrollment; it also shrinks every percentage slightly -- confirm intent.
totals_compare['enrollment_discrepancy_percentage'] = 100 * (
    enrollment_gap / (totals_compare['TOTAL_ENROLLMENT'] + 1)
)

In [742]:
# Enrollment comparison for the 2015-16 school year only.
totals_compare.loc[totals_compare['YEAR'] == '2015-16']

Unnamed: 0,LEAID,YEAR,LEA_ENR,TOTAL_ENROLLMENT,TOTAL_SCHOOLS,LEA_SCHOOLS,enrollment_discrepancy,enrollment_discrepancy_percentage
2,0100002,2015-16,3674.0,1126.0,4.0,3.0,-2548.0,-226.086957
5,0100005,2015-16,4712.0,5203.0,6.0,6.0,491.0,9.435050
8,0100006,2015-16,5624.0,5671.0,14.0,14.0,47.0,0.828632
11,0100007,2015-16,14054.0,14479.0,16.0,16.0,425.0,2.935083
14,0100008,2015-16,9206.0,10021.0,11.0,11.0,815.0,8.132109
...,...,...,...,...,...,...,...,...
51345,5606240,2015-16,1388.0,1359.0,5.0,5.0,-29.0,-2.132353
51348,5680180,2015-16,32.0,41.0,1.0,16.0,9.0,21.428571
51351,5680250,2015-16,41.0,50.0,1.0,3.0,9.0,17.647059
51354,5680251,2015-16,95.0,269.0,2.0,2.0,174.0,64.444444


In [743]:
# Enrollment comparison for the 2013-14 school year only.
totals_compare.loc[totals_compare['YEAR'] == '2013-14']

Unnamed: 0,LEAID,YEAR,LEA_ENR,TOTAL_ENROLLMENT,TOTAL_SCHOOLS,LEA_SCHOOLS,enrollment_discrepancy,enrollment_discrepancy_percentage
1,0100002,2013-14,3674.0,3702.0,3.0,3.0,28.0,0.756144
4,0100005,2013-14,4712.0,4723.0,6.0,6.0,11.0,0.232854
7,0100006,2013-14,5624.0,5658.0,14.0,14.0,34.0,0.600813
10,0100007,2013-14,14054.0,13916.0,16.0,16.0,-138.0,-0.991593
13,0100008,2013-14,9206.0,9618.0,11.0,11.0,412.0,4.283190
...,...,...,...,...,...,...,...,...
51344,5606240,2013-14,1388.0,1385.0,5.0,5.0,-3.0,-0.216450
51347,5680180,2013-14,32.0,34.0,1.0,16.0,2.0,5.714286
51350,5680250,2013-14,41.0,36.0,1.0,3.0,-5.0,-13.513514
51353,5680251,2013-14,95.0,111.0,2.0,2.0,16.0,14.285714


In [744]:
# Raw 2013/14 school rows for district 5680250, to inspect one of the discrepancies.
df_1314_raw.loc[df_1314_raw['LEAID'] == '5680250']

Unnamed: 0,JJ,LEAID,LEA_NAME,LEA_STATE,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,...,TOT_DISCWODIS_REF_F,TOT_DISCWODIS_REF_M,TOT_DISCWODIS_SINGOOS_F,TOT_DISCWODIS_SINGOOS_M,TOT_ENR_F,TOT_ENR_M,TOT_GTENR_F,TOT_GTENR_M,TOT_IDEAENR_F,TOT_IDEAENR_M
568025000048,No,5680250,REGION V BOCES,WY,48,0,0,0,0,0,...,0,0,0,0,8,28,0,0,7,28


In [740]:
# Attach a human-readable description to every column:
#   1. transpose so the column names become the row index,
#   2. left-join `descriptions` on that index,
#   3. append 'description' to the index as a second level,
#   4. transpose back.
# The result has two-level column labels (field name, description); fields
# with no match in `descriptions` get NaN as their description.
# NOTE(review): presumably `descriptions` is indexed by field name and has a
# 'description' column -- it is not defined in the visible part of the notebook.
# NOTE(review): transposing a mixed-type frame likely turns every column into
# object dtype -- confirm this is acceptable for the exported CSV.
district_merge = pd.merge(district_group_w_geo.T, descriptions, left_index=True, right_index=True, how='left').set_index('description', append=True).T

In [386]:
# Rows that have coordinates ('Latitude' carries no description, hence the None level).
sample = district_merge.loc[district_merge['Latitude', None].notna()]

# Export the last 600 of those rows as a small sample file.
sample.iloc[-600:].to_csv('~/Desktop/discriminology/output/district_level_sample.csv')

In [401]:
# District/year rows with no coordinates, i.e. no match in the district profile data.
district_merge[district_merge['Latitude', None].isna()]

Unnamed: 0_level_0,LEAID,YEAR,LEA_NAME,LEA_STATE_NAME,SCH_DISCWODIS_MULTOOS_HI_M,SCH_DISCWDIS_ARR_IDEA_AM_M,SCH_GTENR_IDEA_F,SCH_DISCWODIS_SINGOOS_HP_F,SCH_FTECOUNSELORS,SCH_DISCWODIS_ARR_WH_M,...,TOT_DISCWDIS_REF_IDEA_F,SCH_DISCWDIS_MULTOOS_IDEA_HI_M,LEA_ADDRESS,LEA_CITY,LEA_ZIP,CJJ,LEA_ENR,LEA_SCHOOLS,Latitude,Longitude
description,7 Digit LEAID District Identification Code,NaN,District Name,District State Name,Students without disabilities who received more than one out-of-school suspension: Hispanic Male,Students with disabilities who received a school-related arrest: American Indian/Alaska Native Male,Gifted and Talented Student Enrollment: IDEA Female,Students without disabilities who received only one out-of-school suspension: Native Hawaiian/Pacific Islander Female,School Counselors: Number of FTE school counselors,Students without disabilities who received a school-related arrest: White Male,...,Students with disabilities who were referred to a law enforcement agency or official: Calculated Female Total,Students with disabilities who received more than one out-of-school suspension: Hispanic Male,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
69,100040,2011-12,Brantwood Children'S Home,ALABAMA,0,0,0,0,0,0,...,0,0,,,,,,,,
97,100179,2011-12,The Bridge Ii,ALABAMA,0,0,0,0,0,0,...,0,0,,,,,,,,
115,100189,2015-16,Satsuma City,ALABAMA,0,0,2,0,2.5,0,...,0,0,,,,,,,,
119,100194,2015-16,Pelham City,ALABAMA,6,0,0,0,7,0,...,0,0,,,,,,,,
120,100195,2015-16,Pike Road,ALABAMA,2,0,0,0,1,0,...,0,0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51453,804680,2015-16,Holly School District No. Re-3,COLORADO,0,0,0,0,0,0,...,0,0,,,,,,,,
51454,804710,2015-16,Holyoke School District No. Re-1J,COLORADO,2,0,0,0,2,0,...,0,0,,,,,,,,
51455,804740,2015-16,Genoa-Hugo School District No. C-113,COLORADO,0,0,0,0,0,0,...,0,0,,,,,,,,
51456,804770,2015-16,Ignacio School District No. 11Jt,COLORADO,4,0,2,0,4,0,...,0,4,,,,,,,,


In [398]:
# Number of district/year rows that have coordinates.
len(sample)

28726

In [342]:
# Write the full district-level table (two-level column header: field name and description) to CSV.
district_merge.to_csv('~/Desktop/discriminology/output/district_level_aggregates.csv')

### Code Sandbox - everything below only needs to be run once

In [None]:
# descriptions = pd.concat(desc_frames_1112, axis=0)
# descriptions.drop_duplicates(inplace=True)
# descriptions.set_index('Field Name', inplace=True)
# descriptions = descriptions.loc[fields_1112]
# descriptions.columns = ['2011_12_description']
# descriptions.reset_index().to_csv('/Users/cave/Desktop/2011_12_field_descriptions.csv')

# descriptions = pd.concat(desc_frames_1314, axis=0)
# descriptions.drop_duplicates(inplace=True)
# descriptions.set_index('Field Name', inplace=True)
# descriptions = descriptions.loc[fields_1314]
# descriptions.columns = ['2013_14_description']
# descriptions.reset_index().to_csv('/Users/cave/Desktop/2013_14_field_descriptions.csv')