In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Creating the Dataframes

In [2]:
# Load the four SRC2022 workbooks. The targets on the left are matched
# positionally with the paths below, so keep the two orderings in sync.
df_epp, df_nxp, df_ocert, df_y = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "../Data/SRC2022/Expenditures Per Pupil.xlsx",                  # -> df_epp
        "../Data/SRC2022/Inexperienced Teachers and Principals.xlsx",   # -> df_nxp
        "../Data/SRC2022/Teachers Teaching Out of Certification.xlsx",  # -> df_ocert
        "../Data/SRC2022/Annual Regents Exams.xlsx",                    # -> df_y
    )
)

### Trimming Dataframes
#### Defining a Trimming Function

In [3]:
def trimmer(df: pd.DataFrame, *column_names, year: int=2022, year_col_name: str="YEAR",
            subgroup: str="All Students", subgroup_col_name: str="SUBGROUP")->pd.DataFrame:
    """
    Return a trimmed copy of ``df``; the input frame is never modified.

    Row filters (each one is skipped when its column is absent from ``df``):
      * keep rows whose ``year_col_name`` value equals ``year``;
      * keep rows whose ``subgroup_col_name`` value equals ``subgroup``.

    Column filter: only the columns listed in ``column_names`` are returned,
    in the order given.

    Parameters
    ----------
    df : the table to trim.
    *column_names : labels of the columns to keep.
    year : value a row must have in ``year_col_name`` to be kept.
    year_col_name : label of the year column.
    subgroup : value a row must have in ``subgroup_col_name`` to be kept.
    subgroup_col_name : label of the subgroup column.

    Returns
    -------
    A new DataFrame, independent of ``df``.

    Raises
    ------
    KeyError : if any label in ``column_names`` is not a column of ``df``.
    """
    # Trimming Rows
    if year_col_name in df.columns:
        df = df.loc[df[year_col_name] == year]
    if subgroup_col_name in df.columns:
        df = df.loc[df[subgroup_col_name] == subgroup]
    # Trimming Columns. The explicit copy detaches the result from the source
    # frame, so assigning into it later cannot raise SettingWithCopyWarning.
    return df[list(column_names)].copy()

### Applying the Trimming Function and Selecting the Regents Subjects (the Annual Regents Exams table is pivoted to one row per school after the merge)

In [25]:
# Trim every table to its 2022 rows (and, where a SUBGROUP column exists, to
# the "All Students" rows), keeping the school name plus the one metric
# needed from each table.
df_epp_t = trimmer(df_epp, 'ENTITY_NAME', 'PER_FED_STATE_LOCAL_EXP', year=2022)  # expenditures per pupil
df_nxp_t = trimmer(df_nxp, 'ENTITY_NAME', 'PER_TEACH_INEXP', year=2022)          # inexperienced teachers
df_ocert_t = trimmer(df_ocert, 'ENTITY_NAME', "PER_OUT_CERT", year=2022)         # out-of-certification teachers

# Regents exams retained for the analysis.
REGENTS_SUBJECTS = [
    'Regents Common Core Algebra I',
    'Regents Common Core English Language Art',
    'Regents Phy Set/Earth Sci',
]

# Rows whose ENTITY_CD is a multiple of 10000 are dropped before trimming.
# NOTE(review): presumably these are district/aggregate rows rather than
# individual schools -- confirm against the SRC data dictionary.
df_y_schools = df_y.loc[df_y.ENTITY_CD % 10000 != 0]
df_y_t = trimmer(df_y_schools, 'ENTITY_NAME', 'SUBJECT', "PER_PROF", year=2022)

# Build the subject mask from the trimmed frame itself, so the filter does not
# depend on index alignment with the untrimmed df_y.
df_y_t = df_y_t.loc[df_y_t.SUBJECT.isin(REGENTS_SUBJECTS)]

# The pivot to one row per school is applied after the tables are merged.

### Merging the Tables

In [29]:
from functools import reduce


def _inner_join_on_entity_name(left_df, right_df):
    """Inner-join two trimmed tables on their shared ENTITY_NAME column."""
    return pd.merge(left_df, right_df, on='ENTITY_NAME', how='inner')


# Fold the trimmed tables together left to right into a single frame.
# NOTE(review): the value_counts output further down shows some ENTITY_NAME
# values occurring hundreds of times in `df`, which suggests the name is not a
# unique key in the inputs and the join multiplies rows -- verify.
trimmed_data_frames = [df_epp_t, df_nxp_t, df_ocert_t, df_y_t]
df = reduce(_inner_join_on_entity_name, trimmed_data_frames)

In [32]:
# Reshape the merged table: one row per ENTITY_NAME, one PER_PROF column per
# SUBJECT. When several rows share a name/subject pair, the first PER_PROF
# value encountered is the one kept.
df_p = pd.pivot_table(
    df,
    index=['ENTITY_NAME'],
    columns=["SUBJECT"],
    values=["PER_PROF"],
    aggfunc="first",
)
df_p.head()

Unnamed: 0_level_0,PER_PROF,PER_PROF,PER_PROF
SUBJECT,Regents Common Core Algebra I,Regents Common Core English Language Art,Regents Phy Set/Earth Sci
ENTITY_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A A KINGSTON MIDDLE SCHOOL,100,,
A D OLIVER MIDDLE SCHOOL,100,,100.0
A F PALMER ES / WINDSOR CENTRAL MS,100,,
A M COSGROVE MIDDLE SCHOOL,100,,100.0
A MACARTHUR BARR MS,100,,100.0


In [37]:
# Number of rows in the merged table for each school name.
df["ENTITY_NAME"].value_counts()

ENTITY_NAME
WEST MIDDLE SCHOOL                1026
EAST MIDDLE SCHOOL                 837
JOHN F KENNEDY MIDDLE SCHOOL       384
BEACON HIGH SCHOOL                  96
IROQUOIS MIDDLE SCHOOL              81
                                  ... 
HARLEM RENAISSANCE HIGH SCHOOL       1
MONROE LOWER SCHOOL                  1
GREAT NECK NORTH HIGH SCHOOL         1
IRWIN ALTMAN MIDDLE SCHOOL 172       1
LYONS MIDDLE SCHOOL                  1
Name: count, Length: 1982, dtype: int64

In [24]:
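# NOTE: disabled step-by-step version of the reduce-based merge above, kept for reference only.
# The MergeError shown below this cell ("1 levels on the left, 2 on the right") presumably came
# from merging while df_y_t was still pivoted (two column levels) -- verify before re-enabling.
# df_first_two = pd.merge(df_epp_t, 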
#                         df_nxp_t, 
#                         on='ENTITY_NAME', 
#                         how="inner"
#                         )
# df_next_two = pd.merge(df_first_two, 
#                        df_ocert_t, 
#                         on='ENTITY_NAME', 
#                         how="inner"
#                         )
# df = pd.merge(df_next_two, 
#                        df_y_t, 
#                         on='ENTITY_NAME', 
#                         how="inner"
#                         )



MergeError: Not allowed to merge between different levels. (1 levels on the left, 2 on the right)