In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Set Global Variables and Create Dataframes

Dataframe Name | Table Name
| --- | --- |
df_epp | Expenditures Per Pupil
df_nxp | Inexperienced Teachers and Principals
df_ocert | Teachers Teaching Out of Certification
df_y | Annual Regents Exams


In [2]:
# Read the four workbooks under ../Data/SRC2022; the paths are listed in the
# same order as the dataframe names on the left-hand side.
df_epp, df_nxp, df_ocert, df_y = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "../Data/SRC2022/Expenditures Per Pupil.xlsx",
        "../Data/SRC2022/Inexperienced Teachers and Principals.xlsx",
        "../Data/SRC2022/Teachers Teaching Out of Certification.xlsx",
        "../Data/SRC2022/Annual Regents Exams.xlsx",
    )
)

# The only standardized tests we want to look at.
chosen_text_subjects = (
    'Regents Common Core Algebra I',
    'Regents Common Core English Language Art',
    'Regents Phy Set/Earth Sci',
)

### Trimming Dataframes
#### Defining a Trimming Function that will only keep the columns we wish and only keep the rows that have our desired YEAR and SUBGROUP_NAME of "All Students"

In [3]:
def trimmer(df: pd.DataFrame, *column_names, year: int=2022, year_col_name: str="YEAR", subjects=None)->pd.DataFrame:
    """
    Return a trimmed version of ``df``. This is NOT an in-place method.

    Row filters, each applied only when the corresponding column exists in ``df``:
      * the ``year_col_name`` column must equal ``year``;
      * ``SUBGROUP_NAME`` must equal "All Students";
      * ``SUBJECT`` must be one of ``subjects``.
    Afterwards only the columns named in ``column_names`` are kept, in that order.

    Parameters
    ----------
    df : the dataframe to trim.
    *column_names : names of the columns to keep.
    year : value the year column must have for a row to be kept.
    year_col_name : name of the year column.
    subjects : iterable of SUBJECT values to keep (a single string is treated as
        one subject). When omitted, the notebook-level ``chosen_text_subjects``
        is used, which is what the function always did before this parameter existed.

    Raises
    ------
    KeyError : if any of ``column_names`` is not a column of ``df``.
    """
    # Trimming Rows
    if year_col_name in df.columns:
        df = df.loc[df[year_col_name] == year]
    if "SUBGROUP_NAME" in df.columns:
        # Drops every row whose SUBGROUP_NAME is not "All Students".
        df = df.loc[df["SUBGROUP_NAME"] == "All Students"]
    if "SUBJECT" in df.columns:
        if subjects is None:
            # Looked up lazily so frames without a SUBJECT column never need the global.
            subjects = chosen_text_subjects
        elif isinstance(subjects, str):
            # Series.isin rejects a bare string; wrap it so a single subject works.
            subjects = [subjects]
        df = df.loc[df["SUBJECT"].isin(list(subjects))]
    # Trimming Columns
    return df[list(column_names)]

Note that we no longer have to look for ENTITY_CD ENTRIES that don't end in 0000.

#### Applying the Trimming Function

In [13]:
# Expenditures Per Pupil: 2022 rows only, keeping school name, school ID and normalized expenditures
df_epp_t = trimmer(
    df_epp,
    'ENTITY_NAME', 'ENTITY_CD', 'PER_FED_STATE_LOCAL_EXP',
    year=2022,
)
# Inexperienced staff: 2022 rows only, keeping school ID and PER_TEACH_INEXP
df_nxp_t = trimmer(
    df_nxp,
    'ENTITY_CD', 'PER_TEACH_INEXP',
    year=2022,
)
# Out-of-certification teaching: 2022 rows only, keeping school ID and PER_OUT_CERT
df_ocert_t = trimmer(
    df_ocert,
    'ENTITY_CD', "PER_OUT_CERT",
    year=2022,
)

# df_y: discard rows with a missing INSTITUTION_ID and renumber the index from 0
df_y_c = df_y[df_y['INSTITUTION_ID'].notna()].reset_index(drop=True)

# df_y: trim to 2022 / "All Students" / chosen subjects (the pivot happens in a later cell)
df_y_t = trimmer(
    df_y_c,
    'INSTITUTION_ID', 'ENTITY_CD', 'ENTITY_NAME', 'SUBJECT', "PER_PROF",
    year=2022,
)


We had a lot of trouble with pivoting before. Now, hopefully the code all works well and we can just pivot. 
#### Checking for duplicates. 

In [14]:

# Number of trimmed exam rows per school code.
df_y_t["ENTITY_CD"].value_counts()

ENTITY_CD
42302040000     3
22401040003     3
60601040003     3
121701040001    3
120301040000    3
               ..
50301040002     1
50301040000     1
43011020001     1
43001040002     1
131701060004    1
Name: count, Length: 218, dtype: int64

Success! We have at most 3 instances for each ENTITY_CD, one for each type of exam we decided on. 

But the last line of the previous cell's output says "Length: 218", so there are only 218 distinct ENTITY_CD values, i.e. only 218 schools left in the trimmed list. We get the same number when using the INSTITUTION_ID column, so it's not the choice of id:

In [17]:
# Distinct INSTITUTION_ID values in the trimmed exam table (a missing value would count as one).
df_y_t["INSTITUTION_ID"].nunique(dropna=False)

218

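But if we go back to the original Regents Exams dataframe, we get almost 3,000 schools. Does this mean that more than 90% of them had no exam entry for our chosen tests? Not necessarily: besides the SUBJECT filter, the trimmed frame also keeps only YEAR 2022 rows, only the "All Students" subgroup, and only rows with a non-missing INSTITUTION_ID, so any of those filters could account for the drop.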

In [16]:
# Number of rows per school code in the untrimmed exam table.
df_y["ENTITY_CD"].value_counts()

ENTITY_CD
60601040000     214
60601040003     212
62901040000     195
62901040002     195
10100860907     193
               ... 
320900010323      1
320900010328      1
261600010045      1
331700011524      1
342400010113      1
Name: count, Length: 2974, dtype: int64

#### Pivoting the exam results

In [9]:
# Wide view of the exam results: one row per INSTITUTION_ID and one
# (PER_PROF, <subject>) column per SUBJECT.
# The result is stored under its own name instead of overwriting df_y_t:
# the pivoted frame no longer has ENTITY_CD or SUBJECT as columns, so reusing
# the name would break the merge on ENTITY_CD in the next section and would
# make this cell fail when it is run a second time.
df_y_p = df_y_t.pivot(index=['INSTITUTION_ID'],
                      values=["PER_PROF"],
                      columns=["SUBJECT"]
                      )

### Merging the Tables

In [51]:
from functools import reduce

# Fold the trimmed tables together left to right, inner-joining each one on ENTITY_CD.
trimmed_data_frames = [df_epp_t, df_nxp_t, df_ocert_t, df_y_t]
df = reduce(
    lambda merged_so_far, next_df: merged_so_far.merge(next_df, on='ENTITY_CD', how='inner'),
    trimmed_data_frames,
)

In [63]:
# Wide layout: one row per ENTITY_CD, one column per SUBJECT, holding PER_PROF.
# A plain df.pivot(...) was the first attempt (it accepts no aggfunc argument);
# groupby + first + unstack yields the same layout and also tolerates repeated
# (ENTITY_CD, SUBJECT) pairs by keeping the first value of each pair.
df_p = df.groupby(["ENTITY_CD", "SUBJECT"])["PER_PROF"].first().unstack()

df_p.head()

SUBJECT,Regents Common Core Algebra I,Regents Common Core English Language Art,Regents Phy Set/Earth Sci
ENTITY_CD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10100010030,83,,
10100010034,32,71.0,23.0
10100010045,75,,
10100010051,57,,
10100860867,8,,


In [14]:
# NOTE(review): the recorded NameError shows df_p did not exist when this cell last ran.
# Even with df_p defined, it is built with ENTITY_CD as its index and one column per
# SUBJECT, so it has no ENTITY_NAME column. This check probably belongs on a frame
# that still carries ENTITY_NAME; confirm which one was intended.
df_p.ENTITY_NAME.value_counts()

NameError: name 'df_p' is not defined

In [38]:
# How often each school name occurs in the trimmed expenditures table.
df_epp_t["ENTITY_NAME"].value_counts()

ENTITY_NAME
PARK AVENUE SCHOOL                      4
JOHN F KENNEDY MIDDLE SCHOOL            4
JOHN F KENNEDY SCHOOL                   4
GEORGE WASHINGTON SCHOOL                3
JEFFERSON ELEMENTARY SCHOOL             3
                                       ..
WORLD VIEW HIGH SCHOOL                  1
BRONX COLLABORATIVE HIGH SCHOOL         1
INTERNATIONAL SCHOOL FOR LIBERAL ART    1
PULSE HIGH SCHOOL                       1
MARION ELEMENTARY SCHOOL                1
Name: count, Length: 5370, dtype: int64

In [46]:
# All untrimmed expenditure rows for two of the most frequently repeated school names.
df_epp[df_epp["ENTITY_NAME"].isin(["PARK AVENUE SCHOOL", "JOHN F KENNEDY MIDDLE SCHOOL"])]

Unnamed: 0,INSTITUTION_ID,ENTITY_CD,ENTITY_NAME,YEAR,PUPIL_COUNT_TOT,FEDERAL_EXP,PER_FEDERAL_EXP,STATE_LOCAL_EXP,PER_STATE_LOCAL_EXP,FED_STATE_LOCAL_EXP,PER_FED_STATE_LOCAL_EXP,DATA_REPORTED_ENR,DATA_REPORTED_EXP
770,800000052300,140709030003,JOHN F KENNEDY MIDDLE SCHOOL,2021,287.0,210876.0,735.0,4532700.0,15793.0,4743576.0,16528.0,Y,Y
1127,800000048763,280521030008,JOHN F KENNEDY MIDDLE SCHOOL,2021,679.0,67955.0,100.0,15722849.0,23156.0,15790804.0,23256.0,Y,Y
1207,800000049733,280204020004,PARK AVENUE SCHOOL,2021,312.0,84026.0,269.0,4738904.0,15189.0,4822930.0,15458.0,Y,Y
1559,800000034961,661904030005,PARK AVENUE SCHOOL,2021,426.0,216904.0,509.0,6098639.0,14316.0,6315543.0,14825.0,Y,Y
3318,800000049186,280401030001,PARK AVENUE SCHOOL,2021,653.0,413625.0,633.0,15759588.0,24134.0,16173212.0,24768.0,Y,Y
3687,800000037695,580203020004,JOHN F KENNEDY MIDDLE SCHOOL,2021,796.0,267591.0,336.0,14911613.0,18733.0,15179205.0,19069.0,Y,Y
4469,800000048763,280521030008,JOHN F KENNEDY MIDDLE SCHOOL,2022,695.0,188333.0,271.0,15182079.0,21845.0,15370412.0,22116.0,Y,Y
4781,800000041272,412300010022,JOHN F KENNEDY MIDDLE SCHOOL,2021,673.0,854626.0,1270.0,7344290.0,10913.0,8198916.0,12183.0,Y,Y
6246,800000052300,140709030003,JOHN F KENNEDY MIDDLE SCHOOL,2022,267.0,541691.0,2029.0,4307866.0,16134.0,4849557.0,18163.0,Y,Y
7220,800000049733,280204020004,PARK AVENUE SCHOOL,2022,300.0,240112.0,800.0,4951397.0,16505.0,5191509.0,17305.0,Y,Y


In [43]:
# List the column names of the untrimmed expenditures table.
df_epp.columns

Index(['INSTITUTION_ID', 'ENTITY_CD', 'ENTITY_NAME', 'YEAR', 'PUPIL_COUNT_TOT',
       'FEDERAL_EXP', 'PER_FEDERAL_EXP', 'STATE_LOCAL_EXP',
       'PER_STATE_LOCAL_EXP', 'FED_STATE_LOCAL_EXP', 'PER_FED_STATE_LOCAL_EXP',
       'DATA_REPORTED_ENR', 'DATA_REPORTED_EXP'],
      dtype='object')