# Teacher Salary Analysis

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from src.wrangle import read_and_filter_data
from datetime import datetime, timedelta

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Let pandas render up to 200 rows before truncating displayed frames.
pd.options.display.max_rows = 200

## Acquire + Prepare

In [47]:
# Read the salary-schedule CSV; the path is relative to the notebook's
# working directory.
schedule_csv = 'cleaned_salary_schedule_2024.csv'
data = pd.read_csv(schedule_csv)

In [56]:
# Display the loaded schedule: a Paystep label column followed by numeric
# salary columns.
data

Unnamed: 0,Paystep,BAC1,BA+30C2,C2+ID,C2+PD,C2+ID+PD,MA+30C6,MA+30C6+PD
0,1A,64789,66964,70983,72832,76848,75009,80873
1,1B,64789,66964,70983,72832,76848,75009,80873
2,2A,66078,68253,72272,74121,78137,76298,82162
3,2B,66078,68253,72272,74121,78137,76298,82162
4,3A,66623,68798,72817,74666,78682,76843,82707
5,3B,66623,68798,72817,74666,78682,76843,82707
6,4A,67588,69763,73782,75631,79647,77808,83672
7,4B,67588,69763,73782,75631,79647,77808,83672
8,5A,68422,70597,74616,76465,80481,78642,84506
9,5B,68422,70597,74616,76465,80481,78642,84506


In [54]:
# Row-over-row percentage change of every numeric column, restricted to rows
# whose Paystep label contains "A" (each row is compared with the previous
# matching row).
a_step_mask = data['Paystep'].str.contains('A')
a_step_salaries = data.loc[a_step_mask].select_dtypes(include='number')
a_step_salaries.pct_change(axis='index').round(2)

Unnamed: 0,BAC1,BA+30C2,C2+ID,C2+PD,C2+ID+PD,MA+30C6,MA+30C6+PD
0,,,,,,,
2,0.02,0.02,0.02,0.02,0.02,0.02,0.02
4,0.01,0.01,0.01,0.01,0.01,0.01,0.01
6,0.01,0.01,0.01,0.01,0.01,0.01,0.01
8,0.01,0.01,0.01,0.01,0.01,0.01,0.01
10,0.01,0.01,0.01,0.01,0.01,0.01,0.01
11,0.02,0.02,0.02,0.02,0.02,0.02,0.02
14,0.03,0.02,0.02,0.02,0.02,0.02,0.02
15,0.02,0.02,0.02,0.02,0.02,0.02,0.02
18,0.1,0.09,0.09,0.09,0.08,0.09,0.08


In [53]:
# Column-over-column percentage change for rows whose Paystep label contains
# "A". Each column is compared with the column immediately to its left, so
# the sign of a value depends on the column order of the table.
a_step_mask = data['Paystep'].str.contains('A')
a_step_salaries = data.loc[a_step_mask].select_dtypes(include='number')
a_step_salaries.pct_change(axis='columns').round(2)

Unnamed: 0,BAC1,BA+30C2,C2+ID,C2+PD,C2+ID+PD,MA+30C6,MA+30C6+PD
0,,0.03,0.06,0.03,0.06,-0.02,0.08
2,,0.03,0.06,0.03,0.05,-0.02,0.08
4,,0.03,0.06,0.03,0.05,-0.02,0.08
6,,0.03,0.06,0.03,0.05,-0.02,0.08
8,,0.03,0.06,0.02,0.05,-0.02,0.07
10,,0.03,0.06,0.02,0.05,-0.02,0.07
11,,0.03,0.06,0.02,0.05,-0.02,0.07
14,,0.03,0.05,0.02,0.05,-0.02,0.07
15,,0.03,0.05,0.02,0.05,-0.02,0.07
18,,0.03,0.05,0.02,0.05,-0.02,0.06


In [55]:
# Row-over-row percentage change of every numeric column, restricted to rows
# whose Paystep label contains "B".
b_step_mask = data['Paystep'].str.contains('B')
b_step_salaries = data.loc[b_step_mask].select_dtypes(include='number')
b_step_salaries.pct_change(axis='index').round(2)

Unnamed: 0,BAC1,BA+30C2,C2+ID,C2+PD,C2+ID+PD,MA+30C6,MA+30C6+PD
1,,,,,,,
3,0.02,0.02,0.02,0.02,0.02,0.02,0.02
5,0.01,0.01,0.01,0.01,0.01,0.01,0.01
7,0.01,0.01,0.01,0.01,0.01,0.01,0.01
9,0.01,0.01,0.01,0.01,0.01,0.01,0.01
12,0.03,0.03,0.03,0.03,0.03,0.03,0.03
13,0.02,0.02,0.02,0.02,0.02,0.02,0.02
16,0.07,0.07,0.06,0.06,0.06,0.06,0.06
17,0.02,0.02,0.02,0.02,0.02,0.02,0.01
20,0.1,0.1,0.09,0.09,0.09,0.09,0.08


In [52]:
# Column-over-column percentage change for rows whose Paystep label contains
# "B". Each column is compared with the column immediately to its left, so
# the sign of a value depends on the column order of the table.
b_step_mask = data['Paystep'].str.contains('B')
b_step_salaries = data.loc[b_step_mask].select_dtypes(include='number')
b_step_salaries.pct_change(axis='columns').round(2)

Unnamed: 0,BAC1,BA+30C2,C2+ID,C2+PD,C2+ID+PD,MA+30C6,MA+30C6+PD
1,,0.03,0.06,0.03,0.06,-0.02,0.08
3,,0.03,0.06,0.03,0.05,-0.02,0.08
5,,0.03,0.06,0.03,0.05,-0.02,0.08
7,,0.03,0.06,0.03,0.05,-0.02,0.08
9,,0.03,0.06,0.02,0.05,-0.02,0.07
12,,0.03,0.06,0.02,0.05,-0.02,0.07
13,,0.03,0.05,0.02,0.05,-0.02,0.07
16,,0.03,0.05,0.02,0.05,-0.02,0.07
17,,0.03,0.05,0.02,0.05,-0.02,0.07
20,,0.03,0.05,0.02,0.04,-0.02,0.06


In [25]:
# Build the working DataFrame with the project helper imported from
# src.wrangle.
# NOTE(review): the helper's source is not visible in this notebook — confirm
# which file(s) it reads and what rows it filters out.
df = read_and_filter_data()

In [57]:
# Five lowest-salary rows among records whose Paystep equals 1.
# NOTE(review): the displayed rows include the FirstMidLastStart identifier
# (employee names) — consider dropping it before sharing the notebook.
paystep_one = df.loc[df['Paystep'].eq(1)]
paystep_one.sort_values(by='Salary').head(5)

Unnamed: 0,Fiscal Year,FirstMidLastStart,Hire Date,Hire Year,Fiscal Year of Hire,Years of Employment,Employment Category,Paystep,Paystep Letter,Salary,Salary Category,Salary Delta,Salary Monetary Diff,Salary Delta Category,Salary Monetary Diff Category
108261,2023,JerryOWilkie2023-02-06,2023-02-06,2023,2022,0,0-5,1,B,61070.0,60k-80k,0.0,0.0,,
134089,2023,KieranCreighton2023-05-31,2023-05-31,2023,2022,0,0-5,1,B,61070.0,60k-80k,0.0,0.0,,
235267,2023,ThaliaBolanos2023-01-09,2023-01-09,2023,2022,0,0-5,1,B,61070.0,60k-80k,0.0,0.0,,
129316,2023,KathyLQuattlebaum2023-02-27,2023-02-27,2023,2022,0,0-5,1,B,61070.0,60k-80k,0.0,0.0,,
124172,2023,KaitlinSenitt-Escobar2022-12-16,2022-12-16,2022,2022,1,0-5,1,A,61070.0,60k-80k,0.0,0.0,,


In [68]:
# Dollar difference between consecutive "B" paystep rows, minus the annual
# dues figure (the same 65.60 * 24 product this notebook prints as annual
# union dues). Negative values mean the step increase is smaller than that
# dues amount.
annual_dues = 65.60 * 24
b_step_rows = data.loc[data['Paystep'].str.contains('B')]
b_step_rows.select_dtypes(include='number').diff() - annual_dues

Unnamed: 0,BAC1,BA+30C2,C2+ID,C2+PD,C2+ID+PD,MA+30C6,MA+30C6+PD
1,,,,,,,
3,-285.4,-285.4,-285.4,-285.4,-285.4,-285.4,-285.4
5,-1029.4,-1029.4,-1029.4,-1029.4,-1029.4,-1029.4,-1029.4
7,-609.4,-609.4,-609.4,-609.4,-609.4,-609.4,-609.4
9,-740.4,-740.4,-740.4,-740.4,-740.4,-740.4,-740.4
12,595.6,595.6,595.6,595.6,595.6,595.6,595.6
13,-209.4,-209.4,-209.4,-209.4,-209.4,-209.4,-209.4
16,3373.6,3373.6,3373.6,3373.6,3373.6,3373.6,3373.6
17,-209.4,-209.4,-209.4,-209.4,-209.4,-209.4,-209.4
20,6112.6,6112.6,6112.6,6112.6,6112.6,6112.6,6112.6


In [63]:
# Annual dues total used in this notebook's step-raise comparison.
# NOTE(review): presumably 65.60 per paycheck over 24 pay periods — confirm.
annual_dues = 65.60 * 24
# Format with two fixed decimals so the amount reads as currency ($1574.40);
# round(..., 2) dropped the trailing zero and printed $1574.4.
dues_message = f'Annual United Teachers Federation Dues: ${annual_dues:.2f}'
print(dues_message)

Annual United Teachers Federation Dues: $1574.4


In [41]:
# Record counts for every (Paystep, Paystep Letter) combination: paysteps
# down the rows, letters across the columns.
pd.crosstab(index=df['Paystep'], columns=df['Paystep Letter'])

Paystep Letter,A,B
Paystep,Unnamed: 1_level_1,Unnamed: 2_level_1
1,15,114
2,1091,287
3,3657,351
4,1611,541
5,5354,799
6,8564,1322
7,9955,1750
8,10369,1695
9,10219,1664
10,9356,2928


In [None]:
# Frequency of each distinct value in the `Label` column.
# NOTE(review): the df preview earlier in this notebook lists no `Label`
# column and this cell has no recorded execution — it may raise
# AttributeError on a fresh run; confirm the intended column or remove.
df.Label.value_counts()

In [None]:
# First two records, transposed so each field is shown on its own row.
df.head(n=2).transpose()

In [None]:
# Last two records, transposed so each field is shown on its own row.
df.tail(n=2).transpose()

In [None]:
# Summary statistics rounded to whole numbers, one row per described column.
summary_stats = df.describe()
summary_stats.round(0).transpose()

In [None]:
# Grid of histograms produced by DataFrame.hist, with black bar outlines;
# tight_layout keeps neighbouring subplot titles and ticks from overlapping.
hist_options = dict(figsize=(13, 7), ec='black')
df.hist(**hist_options)
plt.tight_layout();

## Exploratory Data Analysis

In [None]:
# Split the column names by dtype: numeric columns vs. everything else.
numeric_frame = df.select_dtypes(include='number')
non_numeric_frame = df.select_dtypes(exclude='number')
numeric_cols = numeric_frame.columns.tolist()
category_cols = non_numeric_frame.columns.tolist()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True);

In [None]:
# Salary distribution for each fiscal year.
sns.boxplot(x='Fiscal Year', y='Salary', data=df);

In [None]:
# Salary distribution for each employment category.
sns.boxplot(x='Employment Category', y='Salary', data=df);

In [None]:
# Salary distribution for each salary-delta category.
sns.boxplot(x='Salary Delta Category', y='Salary', data=df);

In [None]:
# Salary distribution for each salary-monetary-diff category.
sns.boxplot(x='Salary Monetary Diff Category', y='Salary', data=df);

In [None]:
# NOTE(review): disabled chi-square contingency tests between the categorical
# columns. Nothing in this cell executes; either restore the tests and report
# their results, or delete the cell.
# res = stats.chi2_contingency(pd.crosstab(df['Employment Category'], df['Salary Category']))
# res = stats.chi2_contingency(pd.crosstab(df['Employment Category'], df['Salary Delta Category']))
# res = stats.chi2_contingency(pd.crosstab(df['Employment Category'], df['Salary Monetary Diff Category']))
# test = stats.chi2_contingency(pd.crosstab(df['Salary Category'], df['Salary Delta Category']))
# test = stats.chi2_contingency(pd.crosstab(df['Salary Category'], df['Salary Monetary Diff Category']))

In [None]:
# Pairwise plot grid of the numeric columns, coloured by employment category,
# drawn from a 25% random sample of the rows.
# random_state pins the sample so a Restart & Run All reproduces the same
# figure; without it every run drew different rows.
pairplot_sample = df.sample(frac=.25, random_state=42)
sns.pairplot(data=pairplot_sample, x_vars=numeric_cols, y_vars=numeric_cols, hue='Employment Category');