In [2]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Creating the Dataframes

In [54]:
# Read the four SRC2022 workbooks in one pass; each path is matched by position
# with the dataframe name it is assigned to on the left-hand side.
df_epp, df_nxp, df_ocert, df_y = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "../Data/SRC2022/Expenditures Per Pupil.xlsx",
        "../Data/SRC2022/Inexperienced Teachers and Principals.xlsx",
        "../Data/SRC2022/Teachers Teaching Out of Certification.xlsx",
        "../Data/SRC2022/Annual Regents Exams.xlsx",
    )
)

### Trimming Dataframes
#### Defining a Trimming Function

In [50]:
def trimmer(df: pd.DataFrame, *column_names, year: int=2021, year_col_name: str="YEAR")->pd.DataFrame:
    """
    Return a reduced copy of `df`; the frame passed in is left untouched.

    Row filters (each applied only when its column is present at that point):
      * `year_col_name` must equal `year`
      * "SUBGROUP" must equal "All Students"

    After filtering, only the columns listed in `column_names` are kept,
    in the order they were given.
    """
    # (column to test, value a row must have in that column to be kept)
    row_filters = (
        (year_col_name, year),
        ("SUBGROUP", "All Students"),
    )
    trimmed = df
    for filter_col, required_value in row_filters:
        if filter_col in trimmed.columns:
            trimmed = trimmed.loc[trimmed[filter_col] == required_value]
    kept_columns = list(column_names)
    return trimmed[kept_columns]

### Applying the Trimming Function

In [55]:
# Reduce every table to its 2021 rows and to three columns: the school ID, the school
# name, and the one table-specific column listed next to the frame below.
# (trimmer also keeps only "All Students" rows whenever a SUBGROUP column is present.)
df_epp, df_nxp, df_ocert, df_y = (
    trimmer(frame, 'INSTITUTION_ID', 'ENTITY_NAME', extra_col, year=2021)
    for frame, extra_col in (
        (df_epp, 'PER_FED_STATE_LOCAL_EXP'),  # Expenditures Per Pupil
        (df_nxp, 'PER_TEACH_INEXP'),          # Inexperienced Teachers and Principals
        (df_ocert, "PER_OUT_CERT"),           # Teachers Teaching Out of Certification
        (df_y, 'SUBJECT'),                    # Annual Regents Exams
    )
)


## Original Code Line without the trimmer function
## # df_epp = df_epp.loc[df_epp["YEAR"]==2021][['INSTITUTION_ID', 'ENTITY_NAME', 'PER_FED_STATE_LOCAL_EXP']]

#### Observation: Compare the shapes of the four dataframes. The first three have nearly the same number of rows, but the last one has almost four times as many.

In [56]:
# Gather the four trimmed frames so they can be inspected together.
dataframes_list = [df_epp, df_nxp, df_ocert, df_y]
# Print each frame's (rows, columns) tuple, one per line, in list order.
for df in dataframes_list:
    print(df.shape)

(5440, 3)
(5473, 3)
(5473, 3)
(19999, 3)


#### Let's see how well the school names and IDs overlap.

In [68]:
def _ids_missing_from(source: pd.DataFrame, reference: pd.DataFrame, id_col: str = "INSTITUTION_ID") -> list:
    """
    Return the distinct values of source[id_col] that never occur in reference[id_col],
    in the order in which they first appear in source.
    """
    # Build the lookup once. Calling .unique() and scanning the resulting array inside the
    # comprehension would repeat that work for every candidate ID.
    reference_ids = set(reference[id_col].unique())
    return [school_id for school_id in source[id_col].unique() if school_id not in reference_ids]


# The first dataframe is smaller. Check that it has no school IDs that the other two lack.
extra_epp_ids_list_1 = _ids_missing_from(df_epp, df_nxp)
extra_epp_ids_list_2 = _ids_missing_from(df_epp, df_ocert)
print("School IDs in Expenditures Per Pupil but not in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively):")
print(extra_epp_ids_list_1, extra_epp_ids_list_2)
print("\n \n")

# Now the other direction: IDs present in the other two tables but absent from Expenditures Per Pupil.
missing_epp_ids_list_1 = _ids_missing_from(df_nxp, df_epp)
missing_epp_ids_list_2 = _ids_missing_from(df_ocert, df_epp)
print("School IDs in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively) but not in Expenditures Per Pupil:")
# Sorted, one list per line, so the two lists can be compared by eye.
print(sorted(missing_epp_ids_list_1), sorted(missing_epp_ids_list_2), sep="\n")
print("\n \n")

School IDs in Expenditures Per Pupil but not in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively):
[] []

 

School IDs in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively) but not in Expenditures Per Pupil:
[800000042056, 800000042344, 800000042522, 800000042781, 800000042958, 800000043080, 800000043248, 800000043407, 800000043653, 800000043733, 800000043961, 800000044121, 800000044422, 800000044521, 800000044661, 800000044895, 800000044965, 800000045191, 800000045345, 800000045563, 800000045779, 800000046011, 800000046274, 800000046428, 800000046547, 800000046647, 800000047032, 800000047161, 800000047245, 800000047476, 800000048410, 800000048532, 800000057444]
[800000042056, 800000042344, 800000042522, 800000042781, 800000042958, 800000043080, 800000043248, 800000043407, 800000043653, 800000043733, 800000043961, 800000044121, 800000044422, 800000044521, 800000044661, 800000044895, 800000044965, 

#### So far so good! Among the first three dataframes, Expenditures Per Pupil is missing only a few schools (33), and those schools match perfectly between the two tables that have them.