In [1]:
import pandas as pd
pd.set_option('display.max_rows', 200)

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Build 'teachers_payroll.csv': one row per teacher per fiscal year with hire
# information and the year-over-year change of the base salary.

# Load the raw payroll export. low_memory=False makes pandas infer every
# column's dtype from the whole file instead of chunk by chunk; the saved output
# of this cell shows a warning raised on this read_csv line (presumably the
# mixed-types DtypeWarning -- confirm on the next run). It costs more memory.
data = pd.read_csv('city_payroll_data.csv', low_memory=False)

# Keep rows for the 'DEPT OF ED PEDAGOGICAL' agency with the 'TEACHER' title
# that are ACTIVE as of June 30 and have a positive regular gross pay.
is_active_paid_teacher = (
    (data['Agency Name'] == 'DEPT OF ED PEDAGOGICAL')
    & (data['Title Description'] == 'TEACHER')
    & (data['Leave Status as of June 30'] == 'ACTIVE')
    & (data['Regular Gross Paid'] > 0)
)

# The unused columns are dropped *before* de-duplicating on purpose: rows that
# differ only in one of these columns collapse into a single row.
unused_columns = ['Payroll Number', 'Agency Name', 'Work Location Borough',
                  'Title Description', 'Pay Basis',
                  'Regular Hours', 'OT Hours', 'Total OT Paid',
                  'Leave Status as of June 30', 'Regular Gross Paid', 'Total Other Pay']

df = (
    data.loc[is_active_paid_teacher]
    .drop(columns=unused_columns)
    .drop_duplicates()
    .rename(columns={'Agency Start Date': 'Hire Date',
                     'Base Salary': 'Salary'})
)

# Parse 'Hire Date'; unparseable values become NaT. `.dt.year` yields a missing
# value for NaT, so no masking is needed before casting to nullable Int16.
df['Hire Date'] = pd.to_datetime(df['Hire Date'], errors='coerce')
df['Hire Year'] = df['Hire Date'].dt.year.astype('Int16')
df['Years of Employment'] = (df['Fiscal Year'] - df['Hire Year']).astype('Int16')

# Normalize strings; missing first names / middle initials become the literal
# text 'None' so they can still take part in the concatenated key.
# NOTE(review): 'Last Name' is not filled, so a row with a missing last name
# gets a NaN key and therefore NaN salary deltas -- confirm this is intended.
df['Last Name'] = df['Last Name'].str.strip().str.title()
df['First Name'] = df['First Name'].str.strip().str.title().fillna('None')
df['Mid Init'] = df['Mid Init'].str.strip().str.upper().fillna('None')

# Concatenated key used to follow one person across fiscal years.
df['FirstMidLastStart'] = (
    df['First Name'] + df['Mid Init'] + df['Last Name'] + df['Hire Date'].astype(str)
)

# Order chronologically so the per-person differences below compare each row
# with that person's previous fiscal year.
df = df.sort_values(by='Fiscal Year').reset_index(drop=True)

# Year-over-year salary change per person, in percent and in dollars.
# fill_method=None stops pct_change from forward-filling missing salaries, which
# keeps it consistent with diff() (diff never fills) and avoids the deprecated
# default fill behaviour of recent pandas versions.
salary_by_person = df.groupby('FirstMidLastStart')['Salary']
salary_pct_change = (salary_by_person.pct_change(fill_method=None) * 100).round(3)
salary_dollar_change = salary_by_person.diff().round(3)

df['Salary Delta'] = salary_pct_change
df['Salary Monetary Diff'] = salary_dollar_change

# Keep only the output columns (this also discards the raw name columns).
df = df[['Fiscal Year', 'Hire Date', 'Hire Year', 'Years of Employment', 'FirstMidLastStart',
         'Salary', 'Salary Delta', 'Salary Monetary Diff']]

df.to_csv('teachers_payroll.csv', index=False)

  data = pd.read_csv('city_payroll_data.csv')


In [7]:
# Reload the cleaned teacher payroll and order it so that each person's rows
# are contiguous and chronological, with a fresh 0..n-1 index.
# Note: the CSV round trip does not keep the nullable Int16 dtype -- the preview
# output shows 'Hire Year' / 'Years of Employment' coming back as floats.
df = (
    pd.read_csv('teachers_payroll.csv')
    .sort_values(['FirstMidLastStart', 'Fiscal Year'])
    .reset_index(drop=True)
)

In [8]:
# Preview the first rows of the sorted frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,Fiscal Year,Hire Date,Hire Year,Years of Employment,FirstMidLastStart,Salary,Salary Delta,Salary Monetary Diff
0,2018,2011-09-01,2011.0,7.0,A DilanganiNoneDilrukshi2011-09-01,71930.0,,
1,2019,2011-09-01,2011.0,8.0,A DilanganiNoneDilrukshi2011-09-01,79005.0,9.836,7075.0
2,2020,2011-09-01,2011.0,9.0,A DilanganiNoneDilrukshi2011-09-01,89263.0,12.984,10258.0
3,2021,2011-09-01,2011.0,10.0,A DilanganiNoneDilrukshi2011-09-01,91941.0,3.0,2678.0
4,2022,2011-09-01,2011.0,11.0,A DilanganiNoneDilrukshi2011-09-01,91941.0,0.0,0.0
