In [1]:
import pandas as pd
import warnings
import glob
from openpyxl import load_workbook

In [2]:
# Columns kept from each Active Employee report (the header row is the 5th row of the sheet).
ACTIVE_REPORT_COLUMNS = [
    'Worker',
    'Last Name',
    'First Name',
    'Employee ID',
    'Supervisory Organization',  # source of Department and Supervisor
    'Business Title',
    'Employee Type',
    'Supervisory Level',
    'Birthdate',                 # only used to derive Generation, then dropped
    'Age',
    'Ethnicity',                 # trimmed to the text before " ("
    'Gender',
    'Annual Rate',
    'Location',
    'FTE',                       # source of 'Time Type'
]

# Age buckets are left-closed: [18, 25), [25, 35), ..., [65, inf).
age_bins = [18, 25, 35, 45, 55, 65, float('inf')]
age_labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']


def determine_time_type(fte):
    """Return 'Full-time' when ``fte`` equals 1, otherwise 'Part-time' (including missing FTE)."""
    if fte == 1:
        return 'Full-time'
    return 'Part-time'


def _office_group(row):
    """Return 'District' for district departments or Regional Administrative Directors, else 'Central Office'."""
    department, title = row['Department'], row['Business Title']
    if isinstance(department, str) and 'District' in department:
        return 'District'
    if isinstance(title, str) and 'Regional Administrative Director' in title:
        return 'District'
    return 'Central Office'


def _minority_group(ethnicity):
    """Collapse a cleaned Ethnicity value into 'White', 'Not Specified' or 'Minority'.

    A missing ethnicity is reported as 'Not Specified'; previously it fell through
    to 'Minority', which misclassified rows with no ethnicity on file.
    """
    if pd.isna(ethnicity) or ethnicity == 'Not Specified':
        return 'Not Specified'
    if ethnicity == 'White':
        return 'White'
    return 'Minority'


def _prepare_active_report(file_name):
    """Read one Active Employee report and return it with the derived columns added.

    Parameters
    ----------
    file_name : str
        Path of an Excel report whose column headers are on the 5th row and whose
        report date is stored in cell B3 of the active sheet.

    Returns
    -------
    pandas.DataFrame
        The selected report columns (minus 'Birthdate') plus Department, Supervisor,
        direct-report flags, CO/District, Generation, Age Buckets, Minority, Time Type
        and the fiscal-year keys, with the excluded rows removed.

    Raises
    ------
    ValueError
        If cell B3 does not hold a usable date.
    """
    # Warnings raised while openpyxl parses the file are recorded and discarded.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        report = pd.read_excel(file_name, header=4, engine="openpyxl")
        report_date = pd.to_datetime(load_workbook(filename=file_name).active['B3'].value)

    if pd.isna(report_date):
        raise ValueError(f"Cell B3 of {file_name!r} does not contain a report date")

    # Work on an independent copy so the column assignments below are not
    # chained assignments against a slice of the raw frame.
    report = report[ACTIVE_REPORT_COLUMNS].copy()

    organization = report['Supervisory Organization']

    # Department is the text between "340 " and the first " (".
    report['Department'] = organization.str.extract(r'340\s(.*?)\s\(', expand=False)

    # Supervisor is the text after the first "(" up to the next parenthesis, which
    # also cuts off a nested "(Inherited)"-style suffix. Being a .str operation it
    # yields NaN for a missing organization instead of raising.
    report['Supervisor'] = organization.str.extract(r'\(([^()]*)', expand=False).str.strip()

    # A worker has direct reports when their name appears as a Supervisor in this report.
    supervisors = report['Supervisor'].dropna()
    report['Direct Reports'] = report['Worker'].isin(supervisors).map({True: 'Yes', False: 'No'})

    # Count of rows naming each worker as Supervisor; 0 for workers nobody reports to.
    report['Number of Direct Reports'] = report['Worker'].map(supervisors.value_counts()).fillna(0)

    report['CO or District'] = report.apply(_office_group, axis=1)

    # Generation from birth year; bins are right-closed, e.g. (1927, 1945] -> Traditionalist.
    birth_year = pd.to_datetime(report['Birthdate']).dt.year
    report['Generation'] = pd.cut(
        birth_year,
        bins=[1927, 1945, 1964, 1980, 1996, pd.to_datetime('now').year],
        labels=['Traditionalist', 'Baby Boomer', 'Generation X', 'Millennials', 'Generation Z'],
    )

    report['Age Buckets'] = pd.cut(report['Age'], bins=age_bins, labels=age_labels, right=False)

    # Keep only the ethnicity text before " (" and fold the opt-out answer into "Not Specified".
    report['Ethnicity'] = (
        report['Ethnicity']
        .str.split("(").str[0].str.strip()
        .replace('I do not wish to answer.', 'Not Specified')
    )
    report['Minority'] = report['Ethnicity'].apply(_minority_group)

    # Birthdate is identifying and no longer needed once Generation exists.
    report = report.drop(columns=['Birthdate'])

    report['Time Type'] = report['FTE'].apply(determine_time_type)

    # Exclusions (applied after the direct-report counts, as before):
    #   * Oklahoma State Athletic Commission department - not under Keith Reed
    #   * rows without an Employee Type - contract workers
    #   * the OSAC Administrator - not under Keith Reed
    keep = (
        report['Department'].ne('Okla State Athletic Commission')
        & report['Employee Type'].notna()
        & report['Business Title'].ne('OSAC Administrator')
    )
    report = report[keep].copy()

    # The fiscal year starts in July: July-December belong to the next calendar year's FY,
    # and the quarters run Jul-Sep = 1, Oct-Dec = 2, Jan-Mar = 3, Apr-Jun = 4.
    fiscal_year = report_date.year + 1 if report_date.month >= 7 else report_date.year
    fiscal_quarter = (report_date.month - 7) % 12 // 3 + 1
    report['FY'] = fiscal_year
    report['QTR (FY)'] = fiscal_quarter

    # Key used to join against the Employee Info reports.
    report['QTR FY'] = f"Q{fiscal_quarter} {fiscal_year}"

    # Fixing the Commissioner's department.
    report.loc[report['Business Title'] == 'Commissioner of Health', 'Department'] = 'Secretary of Health and Mental Health'

    return report


# Sorted so the row order of the combined frame does not depend on directory listing order.
active_files = sorted(glob.glob('./Active Employee Reports/*'))
if not active_files:
    raise FileNotFoundError("No files found in './Active Employee Reports/'")

active_dfs_list = []

for file_name in active_files:
    # `active` stays bound at module level to the most recently processed report.
    active = _prepare_active_report(file_name)
    active_dfs_list.append(active)

# All active employee reports stacked together
active_df = pd.concat(active_dfs_list)

#active_df

In [3]:
# Tenure buckets are left-closed: [0, 1), [1, 3), ..., [20, inf).
service_year_bins = [0, 1, 3, 5, 10, 20, float('inf')]
service_year_labels = ['<1 years', '1-3 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']


def _rif_cohort(date):
    """Return 'Endured RIF' for service dates before 2018-04-01, 'Post RIF' otherwise, None if missing."""
    if pd.isna(date):
        return None
    return 'Endured RIF' if date < pd.Timestamp('2018-04-01') else 'Post RIF'


def determine_pandemic_5_phases(date):
    """Label a service date with one of five pandemic phases; None when the date is missing."""
    if pd.isna(date):
        # A missing date used to fall through to "Post Pandemic".
        return None
    if date < pd.Timestamp('2020-03-14'):
        return "Pre-pandemic (Before March 14th, 2020)"
    elif date < pd.Timestamp('2021-03-14'):
        return "1st Year of Pandemic (March 14th, 2020 - March 13th, 2021)"
    elif date < pd.Timestamp('2022-03-14'):
        return "2nd Year of Pandemic (March 14th, 2021 - March 13th, 2022)"
    elif date < pd.Timestamp('2023-04-11'):
        return "3rd Year of Pandemic (March 14th, 2022 - April 11th, 2023)"
    else:
        return "Post Pandemic (April 11th, 2023 - Present)"


def determine_pandemic_3_phases(date):
    """Label a service date as pre-, during- or post-pandemic; None when the date is missing."""
    if pd.isna(date):
        # A missing date used to fall through to "Post Pandemic".
        return None
    if date < pd.Timestamp('2020-03-14'):
        return "Pre-pandemic (Before March 14th, 2020)"
    elif date < pd.Timestamp('2023-04-11'):
        return "Pandemic (March 14th, 2020 - April 11th, 2023)"
    else:
        return "Post Pandemic (April 11th, 2023 - Present)"


def _prepare_info_report(file_name):
    """Read one Employee Info report and return the service-date features keyed by 'QTR FY'.

    Parameters
    ----------
    file_name : str
        Path of an Excel report whose column headers are on the 5th row and whose
        report date is stored in cell B3 of the active sheet.

    Returns
    -------
    pandas.DataFrame
        'Employee ID', the two service dates, tenure in years and tenure buckets,
        the RIF and pandemic cohort labels, and the 'QTR FY' join key.

    Raises
    ------
    ValueError
        If cell B3 does not hold a usable date.
    """
    # Warnings raised while openpyxl parses the file are recorded and discarded.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        report = pd.read_excel(file_name, header=4, engine="openpyxl")

    # The day that the report reflects.
    report_date = pd.to_datetime(load_workbook(filename=file_name).active['B3'].value)
    if pd.isna(report_date):
        raise ValueError(f"Cell B3 of {file_name!r} does not contain a report date")

    # Independent copy so the assignments below do not target a slice of the raw frame.
    report = report[['Employee ID', 'Company Service Date', 'Continuous Service Date']].copy()

    report['Company Service Date'] = pd.to_datetime(report['Company Service Date'])
    report['Continuous Service Date'] = pd.to_datetime(report['Continuous Service Date'])

    # Agency and State tenure, measured as of the report date rather than the day the
    # notebook happens to be run, so older reports are not credited with extra years.
    report['Company Service Years'] = (report_date - report['Company Service Date']).dt.days / 365
    report['Years of Service'] = (report_date - report['Continuous Service Date']).dt.days / 365

    report['Company Service Year Buckets'] = pd.cut(
        report['Company Service Years'], bins=service_year_bins, labels=service_year_labels, right=False)
    report['Service Year Buckets'] = pd.cut(
        report['Years of Service'], bins=service_year_bins, labels=service_year_labels, right=False)

    report['FY2018 RIF'] = report['Company Service Date'].apply(_rif_cohort)
    report['Pandemic (5)'] = report['Company Service Date'].apply(determine_pandemic_5_phases)
    report['Pandemic (3)'] = report['Company Service Date'].apply(determine_pandemic_3_phases)

    # The fiscal year starts in July: July-December belong to the next calendar year's FY,
    # and the quarters run Jul-Sep = 1, Oct-Dec = 2, Jan-Mar = 3, Apr-Jun = 4.
    fiscal_year = report_date.year + 1 if report_date.month >= 7 else report_date.year
    fiscal_quarter = (report_date.month - 7) % 12 // 3 + 1

    # Only the combined key is kept; it is what the active-employee data is joined on.
    report['QTR FY'] = f"Q{fiscal_quarter} {fiscal_year}"

    return report


# Sorted so the row order of the combined frame does not depend on directory listing order.
info_files = sorted(glob.glob('./Employee Info Reports/*'))
if not info_files:
    raise FileNotFoundError("No files found in './Employee Info Reports/'")

info_dfs_list = []

for file_name in info_files:
    # `info` stays bound at module level to the most recently processed report.
    info = _prepare_info_report(file_name)
    info_dfs_list.append(info)


# All employee info reports stacked together
info_df = pd.concat(info_dfs_list)
#info_df

In [4]:
# Left-join the combined active-employee rows to the combined employee-info rows.
# `active_df` / `info_df` hold every report; the loop variables `active` / `info`
# only hold whichever file was processed last, so merging those dropped all other periods.
# NOTE(review): if an Employee ID can appear more than once per 'QTR FY' in the info
# reports, this join will duplicate active rows - confirm the key is unique there.
final_df = pd.merge(active_df, info_df, on=['Employee ID', "QTR FY"], how='left')
#final_df

In [5]:
# Headcount per fiscal year; every later aggregate is left-joined onto this frame.
wf_trend_df = final_df.groupby('FY').size().reset_index(name='Employee End Count')

# Per-FY means, added as "Average <field>" columns.
avg_fields = ['Years of Service', "Age"]

for field in avg_fields:
    fy_mean = final_df.groupby('FY')[field].mean().rename("Average " + field).reset_index()
    wf_trend_df = wf_trend_df.merge(fy_mean, on='FY', how='left')

# Per-FY headcount for every category of each demographic field (one column per category).
fields_to_pivot = ["Gender", "Ethnicity", "Generation"]

for field in fields_to_pivot:
    category_counts = final_df.pivot_table(index='FY', columns=field, aggfunc='size', fill_value=0).reset_index()
    category_counts.columns.name = None
    wf_trend_df = wf_trend_df.merge(category_counts, on='FY', how='left')

# Minority = row-wise sum of the ethnicity columns between the two endpoint labels (inclusive).
# NOTE(review): this label slice depends on the order of the pivoted ethnicity columns;
# any ethnicity column that lands outside the two endpoints is not counted - confirm intended.
first_minority_col = 'American Indian or Alaska Native'
last_minority_col = 'Native Hawaiian or Other Pacific Islander'
wf_trend_df['Minority'] = wf_trend_df.loc[:, first_minority_col:last_minority_col].sum(axis=1)

# Other = headcount not accounted for by the Female and Male columns.
wf_trend_df['Other'] = wf_trend_df['Employee End Count'].sub(wf_trend_df['Female']).sub(wf_trend_df['Male'])

# Rename FY to "Fiscal Year" so the column lines up with the historical workbook.
wf_trend_df = wf_trend_df.rename(columns={'FY': 'Fiscal Year'})

# Every column from 'Female' onward is a count; add its share of the headcount
# as a "Percent <column>" column (stored as a fraction, not multiplied by 100).
start_col = 'Female'
start_index = wf_trend_df.columns.get_loc(start_col)
columns_to_transform = wf_trend_df.columns[start_index:]

for count_col in columns_to_transform:
    wf_trend_df['Percent ' + count_col] = wf_trend_df[count_col].div(wf_trend_df['Employee End Count'])

# Historical FY2013-FY2022 rows, with the freshly computed fiscal years appended underneath.
FY_2013_2022 = pd.read_excel("OSDH Workforce Time Analysis/AgencyWorkforceSum/OSDH End Data FY2013-FY2022.xlsx")

FY_2013_present = pd.concat([FY_2013_2022, wf_trend_df], axis=0, ignore_index=True)

#pd.set_option('display.max_columns', None)
#FY_2013_present

In [6]:
# Export the FY trend table to Excel with an autofilter and readable column widths.
# The context manager closes the workbook even if writing or formatting raises.
trend_sheet_name = 'OSDH WF Trend Data by FY'

with pd.ExcelWriter('OSDH Workforce Trend Data by FY.xlsx', engine='xlsxwriter') as writer:
    FY_2013_present.to_excel(writer, sheet_name=trend_sheet_name, index=False)

    # Saves the sheet with filters turned on (header row through the last data row).
    worksheet = writer.sheets[trend_sheet_name]
    worksheet.autofilter(0, 0, FY_2013_present.shape[0], FY_2013_present.shape[1] - 1)

    # Size each column to its longest rendered value, or the header plus padding if longer.
    # Columns are addressed by position so duplicate or non-string labels cannot break this.
    for col_idx, column in enumerate(FY_2013_present.columns):
        longest_value = FY_2013_present.iloc[:, col_idx].astype(str).map(len).max()
        if pd.isna(longest_value):
            # Empty frame: there are no values to measure.
            longest_value = 0
        column_length = max(longest_value, len(str(column)) + 3)
        worksheet.set_column(col_idx, col_idx, column_length)

In [7]:
# Keep only the non-identifying columns of final_df for the employee-level export.
# NOTE(review): this rebinds `final_df`, so cells that need the dropped columns
# (e.g. 'Age', 'Ethnicity', 'Years of Service') cannot be re-run after this one
# without first re-running the merge cell.
final_df = final_df[["Employee Type", "FTE", "Direct Reports", "Number of Direct Reports", "Generation", "Gender",
                    "Age Buckets", "Minority", "Time Type", "FY", "QTR (FY)", "QTR FY", "Company Service Years", "Company Service Year Buckets",
                    "Pandemic (3)"]]

# Export to Excel with an autofilter and readable column widths.
# The context manager closes the workbook even if writing or formatting raises.
employee_sheet_name = 'OSDH Employee Data by FY'

with pd.ExcelWriter('FY End Data for OSDH Employees.xlsx', engine='xlsxwriter') as writer:
    final_df.to_excel(writer, sheet_name=employee_sheet_name, index=False)

    # Saves the sheet with filters turned on (header row through the last data row).
    worksheet = writer.sheets[employee_sheet_name]
    worksheet.autofilter(0, 0, final_df.shape[0], final_df.shape[1] - 1)

    # Size each column to its longest rendered value, or the header plus padding if longer.
    # Columns are addressed by position so duplicate labels cannot break this.
    for col_idx, column in enumerate(final_df.columns):
        longest_value = final_df.iloc[:, col_idx].astype(str).map(len).max()
        if pd.isna(longest_value):
            # Empty frame: there are no values to measure.
            longest_value = 0
        column_length = max(longest_value, len(str(column)) + 3)
        worksheet.set_column(col_idx, col_idx, column_length)