In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Creating the Dataframes

In [2]:
df_epp = pd.read_excel("../Data/SRC2022/Expenditures Per Pupil.xlsx")
df_nxp = pd.read_excel("../Data/SRC2022/Inexperienced Teachers and Principals.xlsx")
df_ocert = pd.read_excel("../Data/SRC2022/Teachers Teaching Out of Certification.xlsx")
df_y = pd.read_excel("../Data/SRC2022/Annual Regents Exams.xlsx")

### Trimming Dataframes
#### Defining a Trimming Function

In [3]:
def trimmer(df: pd.DataFrame, *column_names, year: int=2021, year_col_name: str="YEAR")->pd.DataFrame:
    """
    Keeps only the columns specified, only the rows whose year agrees with the year argument, 
    and if there is a SUBGROUP column then it keeps only rows with the subgroup classification 
    of "All Students". 
    This is NOT an in-place method. 
    """
    # Trimming Rows
    if year_col_name in df.columns:
        df = df.loc[df[year_col_name]==year]
    if "SUBGROUP" in df.columns:
        df = df.loc[df["SUBGROUP"]=="All Students"] #<-----This drops all rows that do not have "All Students" as the "SUBGROUP" column entry. 
    # Trimming Columns
    return df[list(column_names)]

### Applying the Trimming Function

In [4]:
# Expenditures Per Pupil: Keep only rows with the year 2021 and keeping only school ID, Name and Normalized Expenditures
df_epp = trimmer(df_epp, 'INSTITUTION_ID', 'ENTITY_NAME', 'PER_FED_STATE_LOCAL_EXP', year=2021)
df_nxp = trimmer(df_nxp, 'INSTITUTION_ID', 'ENTITY_NAME', 'PER_TEACH_INEXP', year=2021)
df_ocert = trimmer(df_ocert, 'INSTITUTION_ID', 'ENTITY_NAME', "PER_OUT_CERT", year=2021)
df_y = trimmer(df_y, 'ENTITY_CD', 'ENTITY_NAME', 'SUBJECT', year=2021)


## Original Code Line without the trimmer function
## # df_epp = df_epp.loc[df_epp["YEAR"]==2021][['INSTITUTION_ID', 'ENTITY_NAME', 'PER_FED_STATE_LOCAL_EXP']]

#### Observation: Look at the shapes of each dataframe. The first three are close but the last is almost 4 times as big as the first 3. 

In [5]:
dataframes_list = [df_epp, df_nxp, df_ocert, df_y]
for df in dataframes_list:
    print(df.shape)

(5440, 3)
(5473, 3)
(5473, 3)
(19999, 3)


#### Let's look to see how well the school names and id's overlap. 

In [6]:
# The first dataframe is smaller. Let's check to make sure it doesn't have any school names or ids that the other two don't have.
extra_epp_ids_list_1 = [id for id in df_epp.INSTITUTION_ID.unique() if not id in df_nxp.INSTITUTION_ID.unique()]
extra_epp_ids_list_2 = [id for id in df_epp.INSTITUTION_ID.unique() if not id in df_ocert.INSTITUTION_ID.unique()]
print("School IDs in Expenditures Per Pupil but not in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively):")
print(extra_epp_ids_list_1, extra_epp_ids_list_2)
print("\n \n")

# Now let's take a look at the missing ids missing the other way around. 
missing_epp_ids_list_1 = [id for id in df_nxp.INSTITUTION_ID.unique() if not id in df_epp.INSTITUTION_ID.unique()]
missing_epp_ids_list_2 = [id for id in df_ocert.INSTITUTION_ID.unique() if not id in df_epp.INSTITUTION_ID.unique()]
print("School IDs in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively) but not in Expenditures Per Pupil:")
print(sorted(missing_epp_ids_list_1), sorted(missing_epp_ids_list_2), sep="\n")
print("\n \n")

School IDs in Expenditures Per Pupil but not in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively):
[] []

 

School IDs in Inexperienced Teachers or Principles and Teachers Teaching Out of Certification (respectively) but not in Expenditures Per Pupil:
[800000042056, 800000042344, 800000042522, 800000042781, 800000042958, 800000043080, 800000043248, 800000043407, 800000043653, 800000043733, 800000043961, 800000044121, 800000044422, 800000044521, 800000044661, 800000044895, 800000044965, 800000045191, 800000045345, 800000045563, 800000045779, 800000046011, 800000046274, 800000046428, 800000046547, 800000046647, 800000047032, 800000047161, 800000047245, 800000047476, 800000048410, 800000048532, 800000057444]
[800000042056, 800000042344, 800000042522, 800000042781, 800000042958, 800000043080, 800000043248, 800000043407, 800000043653, 800000043733, 800000043961, 800000044121, 800000044422, 800000044521, 800000044661, 800000044895, 800000044965, 

#### So far so good! Out of the first three dataframes, we are only missing a few schools and those schools match perfectly between the two tables that have them. 

In [11]:
len(df_y.ENTITY_CD.unique())

98

In [39]:
df_y = pd.read_excel("../Data/SRC2022/Annual Regents Exams.xlsx")

In [40]:
df_y.columns

Index(['INSTITUTION_ID', 'ENTITY_CD', 'ENTITY_NAME', 'YEAR', 'SUBJECT',
       'SUBGROUP_NAME', 'TESTED', 'NUM_LEVEL1', 'PER_LEVEL1', 'NUM_LEVEL2',
       'PER_LEVEL2', 'NUM_LEVEL3', 'PER_LEVEL3', 'NUM_LEVEL4', 'PER_LEVEL4',
       'NUM_LEVEL5', 'PER_LEVEL5', 'NUM_PROF', 'PER_PROF', 'TOTAL_EXEMPT',
       'NUM_EXEMPT_NTEST', 'PCT_EXEMPT_NTEST', 'NUM_EXEMPT_TEST',
       'PCT_EXEMPT_TEST', 'ASSMNT_FLAG'],
      dtype='object')

In [41]:
print(df_y.loc[df_y.ENTITY_CD%10000 != 0].ENTITY_NAME.value_counts(), sep="\n", end="\n \n")
print(df_y.loc[df_y.ENTITY_CD%10000 != 0].ENTITY_CD.value_counts(), sep="\n", end="\n \n")


# df_y.iloc[].SUBGROUP_NAME.value_counts()

ENTITY_NAME
PINE VALLEY CENTRAL JR-SR HIGH SCH     212
WESTFIELD HIGH SCHOOL                  195
GREEN TECH HIGH CHARTER SCHOOL         193
COLONIE CENTRAL HIGH SCHOOL            192
MAINE-ENDWELL SENIOR HS                183
                                      ... 
PS/MS 280 MOSHOLU PARKWAY                1
RIVERTON STREET CHARTER SCHOOL           1
PS 279 CAPT MANUEL RIVERA JR             1
NEW SCHOOL-LEADERSHIP & JOURNAL          1
BEDFORD STUY NEW BEGINNINGS CHARTER      1
Name: count, Length: 2332, dtype: int64
 
ENTITY_CD
60601040003     212
62901040002     195
10100860907     193
10601060008     192
31101060006     192
               ... 
320700011495      1
342900010355      1
332300010668      1
332300010671      1
320800010337      1
Name: count, Length: 2221, dtype: int64
 


In [42]:
print(df_y.loc[df_y.ENTITY_CD%10000 != 0].SUBGROUP_NAME.value_counts(), sep="\n", end="\n \n")

SUBGROUP_NAME
Small Group Total                                  14323
Not Economically Disadvantaged                      1524
Female                                              1484
Economically Disadvantaged                          1457
Male                                                1439
Hispanic or Latino                                  1379
General Education Students                          1357
Multiracial                                         1349
Not Homeless                                        1315
Non-English Language Learner                        1315
All Students                                        1313
Parent Not in Armed Forces                          1312
Not Migrant                                         1311
Not in Foster Care                                  1305
White                                               1295
Students with Disabilities                          1241
Asian or Native Hawaiian/Other Pacific Islander     1230
Black or African 

In [43]:
print(df_y.loc[df_y.ENTITY_CD%10000 != 0].SUBJECT.value_counts(), sep="\n", end="\n \n")

SUBJECT
Regents Common Core Algebra I               10882
Regents Living Environment                   6811
Regents Phy Set/Earth Sci                    5516
Regents Common Core English Language Art     4636
Regents Common Core Geometry                 2409
Regents Phy Set/Chemistry                    2367
Regents Common Core Algebra II               2321
Regents NF Global History                    2198
Regents Phy Set/Physics                      1964
Name: count, dtype: int64
 


In [44]:
df_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65534 entries, 0 to 65533
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   INSTITUTION_ID    61558 non-null  float64
 1   ENTITY_CD         65534 non-null  int64  
 2   ENTITY_NAME       65534 non-null  object 
 3   YEAR              65534 non-null  int64  
 4   SUBJECT           65534 non-null  object 
 5   SUBGROUP_NAME     65534 non-null  object 
 6   TESTED            65534 non-null  int64  
 7   NUM_LEVEL1        65534 non-null  object 
 8   PER_LEVEL1        65534 non-null  object 
 9   NUM_LEVEL2        65534 non-null  object 
 10  PER_LEVEL2        65534 non-null  object 
 11  NUM_LEVEL3        65534 non-null  object 
 12  PER_LEVEL3        65534 non-null  object 
 13  NUM_LEVEL4        65534 non-null  object 
 14  PER_LEVEL4        65534 non-null  object 
 15  NUM_LEVEL5        65534 non-null  object 
 16  PER_LEVEL5        65534 non-null  object