In [1]:
import pandas as pd
#from collections import OrderedDict

#Survey key: every statement/question together with the index it belongs to.
key_df = pd.read_excel('2024 OKSEES Survey Key.xlsx', sheet_name='Survey Key')

#Raw survey results from Qualtrics:
resp_df = pd.read_excel('Cleaned Data/OKSEES Clean Data 2024.xlsx')

#Running list of {'Excel Sheet': ..., 'Description': ...} records, one per output sheet.
    #There are many sheets, so each one gets a short description.
excel_sheets_list = []

#Blank descriptions table; it is filled in once every sheet has been registered.
excel_sheets_info = pd.DataFrame(columns=['Excel Sheet', 'Description'])

#Dictionary (sheet name -> dataframe) of everything that will be sent to the excel file.
    #The descriptions table is inserted first because we want it to appear first in the excel file.
excel_dfs = {"Excel Sheets Descriptions": excel_sheets_info}

# Data Preparation:

In [2]:
#Mask that keeps only the scored statements/questions, i.e. every row of the survey key
    #that is neither a demographic question nor the additional-comments question:
non_index = ~((key_df['Index'] == 'Respondent Demographics')
              | (key_df['Index'] == 'Additional Comments'))

#Survey-key rows for the scored statements/questions only:
sq_df = key_df[non_index]

#Statements/questions whose answers get remapped further down, as a plain list:
sq_list = sq_df['Statement/Question'].tolist()

#Strip leading/trailing white space from the response column names:
resp_df.columns = [name.strip() for name in resp_df.columns]

#Shorter names for the demographic columns, for easier analysis:
resp_df = resp_df.rename(columns={"Are you a full-time employee?": "FTE",
                                  "How long have you worked for your agency?": "Agency Tenure",
                                  "How would you describe your work setting?": "Work Setting",
                                  "In which generation were you born?": "Generation",
                                  "Which agency are you a part of?": "Agency"})

#Columns that are not needed for the analysis:
columns_to_remove = ['ID', 'Name', 'Start time', 'Completion time', 'Email']
resp_df = resp_df.drop(columns=columns_to_remove)

#The eNPS calculation needs the original answer wording, so keep a copy of the
    #dataframe from before the remapping below is applied:
eNPS_df = resp_df.copy()

#Remapping of the answer scales onto Favorable / Neutral / Unfavorable:
standard_mapping = {
    **dict.fromkeys(['Agree', 'Strongly Agree', 'Satisfied', 'Very Satisfied'], 'Favorable'),
    'Neutral': 'Neutral',
    **dict.fromkeys(['Disagree', 'Strongly Disagree', 'Dissatisfied', 'Very Dissatisfied'],
                    'Unfavorable'),
}

#Apply the remapping to every scored statement/question column:
for column in sq_list:
    resp_df[column] = resp_df[column].replace(standard_mapping)

In [3]:
#The free-text comments column is dropped first because it would cause unnecessary
    #duplicates in the pivot section:
comments = 'Do you have any additional comments?'

no_comments_resp_df = resp_df.drop(columns=[comments])

#Melt to long format so that groupby calculations can be done by index and statement/question.
    #After melting, one individual's survey is spread across multiple rows: one row per
    #statement/question, holding how they responded to it.
melted_df = pd.melt(frame=no_comments_resp_df,
                    value_name='Response',
                    var_name='Statement/Question',
                    id_vars=["OKSEES Year", "Agency", "Agency Tenure", "Generation",
                             "FTE", "Work Setting", "Cabinet"])

#Attach the survey key columns so that later cells can .groupby("Index"):
melted_df = pd.merge(left=melted_df, right=key_df, how='left', on='Statement/Question')

In [4]:
# Pivot the long (melted) table so that each distinct 'Response' value becomes its own column.
# aggfunc=len counts the melted rows in each index/Response combination, and fill_value=0
# puts a 0 where a combination never occurs.
# NOTE(review): no `values=` is passed, so this relies on every melted_df column being used
# in `index` or `columns` - presumably the survey key contributes only 'Index' and
# 'Employee Index' besides 'Statement/Question'; confirm if the key ever gains more columns.
unmelted_df = melted_df.pivot_table(index=["OKSEES Year",
                                            "Agency",
                                            "Agency Tenure",
                                            "Generation",
                                            "FTE",
                                            "Work Setting",
                                            "Cabinet",
                                            'Statement/Question',
                                            'Index',
                                            'Employee Index'], 
                                           columns='Response', aggfunc=len, fill_value=0)


# Turn the index levels back into ordinary columns so later cells can group by them.
unmelted_df = unmelted_df.reset_index()

# Clear the columns' axis name (set to 'Response' by the pivot).
unmelted_df.columns.name = None

#unmelted_df

# Functions for Calculations:

### Calculating Index Scores Function:

In [5]:
def calculate_index_percents(groups, excel_sheet, desc):
    """Compute Favorable / Neutral / Unfavorable counts and percents per group.

    The counts in the notebook-level ``unmelted_df`` are summed per "OKSEES Year" plus
    the given grouping columns, and each category is divided by the row total.

    Parameters:
        groups: list of column names to group by in addition to "OKSEES Year".
            It must be a list even when only one column name is passed.
        excel_sheet: key under which the result is stored in ``excel_dfs``.
        desc: description recorded for that sheet in ``excel_sheets_list``.

    Returns:
        None. The result is stored in ``excel_dfs[excel_sheet]`` and a
        {'Excel Sheet', 'Description'} record is appended to ``excel_sheets_list``.
    """
    categories = ['Favorable', 'Neutral', 'Unfavorable']

    #"OKSEES Year" is always the first grouping key, followed by the requested columns:
    keys = ["OKSEES Year", *groups]

    #Summed counts for the three categories per group:
    summed = unmelted_df.groupby(keys, as_index=False)[categories].sum()

    #Row total, used as the denominator of the percents:
    summed['Total'] = summed[categories].sum(axis=1)

    #One percent column per category:
    for category in categories:
        summed[category + ' %'] = summed[category] / summed['Total']

    #Rows that do not belong to an employee index are dropped when grouping by 'Employee Index':
    if 'Employee Index' in groups:
        summed = summed[summed['Employee Index'] != 'Not Available']

    #Store the result and register the sheet with its description:
    excel_dfs[excel_sheet] = summed
    excel_sheets_list.append({'Excel Sheet': excel_sheet, 'Description': desc})
    

### Calculating Demographics Function:

In [6]:
#Work on a separate copy of the cleaned responses, since the demographics function pivots it:
demo_graph_df = resp_df.copy(deep=True)

#The cleaned survey responses also go into the dictionary as their own sheet:
excel_dfs.update({"OKSEES Survey Responses": demo_graph_df})


def caculate_demographic_percents(groups, demo, excel_sheet, desc):
    """Count respondents per demographic category and compute each category's share.

    Reads the notebook-level ``demo_graph_df``. For every combination of the ``groups``
    columns the result holds one count column per category of ``demo``, a 'Total'
    column and one '<category> %' column per category.

    Parameters:
        groups: list of column names that define the rows of the result.
        demo: name of the demographic column whose categories become columns.
        excel_sheet: key under which the result is stored in ``excel_dfs``.
        desc: description recorded for that sheet in ``excel_sheets_list``.

    Returns:
        None. The result is stored in ``excel_dfs[excel_sheet]`` and a
        {'Excel Sheet', 'Description'} record is appended to ``excel_sheets_list``.
    """
    #Individual categories, used for the % calculation later on.
    #Missing answers are dropped first: value_counts() below ignores them anyway, and
    #sorting NaN/None together with strings would raise a TypeError.
    categories = sorted(demo_graph_df[demo].dropna().unique().tolist())

    #Count respondents per group and category, then spread the categories into columns:
    counts = demo_graph_df.groupby(groups)[demo].value_counts().reset_index(name='Count')
    df = counts.pivot_table(index=groups, columns=demo, values='Count', aggfunc='sum')
    #A category that never occurs within a group counts as 0:
    df = df.fillna(0)
    df.columns.name = None
    df = df.reset_index()

    #Summing Responses:
    df['Total'] = df[categories].sum(axis=1)
    #Calculating Percents (f-string so non-string category labels also work):
    for cat in categories:
        df[f'{cat} %'] = df[cat] / df['Total']

    #Store the result and register the sheet with its description:
    excel_dfs[excel_sheet] = df
    excel_sheets_list.append({'Excel Sheet': excel_sheet, 'Description': desc})

### Calculating eNPS Function:

In [7]:
#Setting up a dataframe to do eNPS calculations.
#The respondent-attribute columns are selected by name rather than as "the first 7 columns",
    #so the selection does not depend on the column order of the cleaned responses.
eNPS_id_cols = ["OKSEES Year",
                "Agency",
                "Agency Tenure",
                "Generation",
                "FTE",
                "Work Setting",
                "Cabinet"]
eNPS_cols = eNPS_id_cols + ["I would recommend my agency as a good place to work."]
eNPS_df = eNPS_df[eNPS_cols]
eNPS_df = eNPS_df.rename(columns={"I would recommend my agency as a good place to work.": "eNPS Question"})

#Setting up remapping of the answers onto the eNPS groups for easier calculations
eNPS_mapping = {
    'Strongly Agree':      'Promoters', 
    'Agree':               'Passives', 
    'Neutral':             'Detractors',   
    'Disagree':            'Detractors',
    'Strongly Disagree':   'Detractors'}

#Applying remapping:
eNPS_df['eNPS Question'] = eNPS_df['eNPS Question'].replace(eNPS_mapping)

#Pivoting Dataframe to make each eNPS group a column.
    #aggfunc=len counts the rows (respondents) in each combination; fill_value=0 puts a 0
    #where a combination never occurs.
eNPS_pivot_df = eNPS_df.pivot_table(index=eNPS_id_cols,
                                    columns='eNPS Question', aggfunc=len, fill_value=0)

# Reset the index so the grouping columns become ordinary columns again
eNPS_pivot_df = eNPS_pivot_df.reset_index()

# Remove the name of the columns to make it clean
eNPS_pivot_df.columns.name = None

#Setting up eNPS groups list
eNPS_groups = ['Promoters', 'Passives', 'Detractors']

def calculate_eNPS(groups, excel_sheet, desc):
    """Compute the Employee Net Promoter Score (eNPS) per group.

    The counts in the notebook-level ``eNPS_pivot_df`` are summed per ``groups``; each
    eNPS group is then expressed as a share of the row total, and the score is the
    Promoters share minus the Detractors share.

    Parameters:
        groups: list of column names to group by.
        excel_sheet: key under which the result is stored in ``excel_dfs``.
        desc: description recorded for that sheet in ``excel_sheets_list``.

    Returns:
        None. The result is stored in ``excel_dfs[excel_sheet]`` and a
        {'Excel Sheet', 'Description'} record is appended to ``excel_sheets_list``.
    """
    #Summed counts for every eNPS group per requested grouping:
    scores = eNPS_pivot_df.groupby(groups, as_index=False)[eNPS_groups].sum()

    #Row total, used as the denominator of the percents:
    scores['Total'] = scores[eNPS_groups].sum(axis=1)

    #Share of each eNPS group:
    for group_name in eNPS_groups:
        scores[f'{group_name} %'] = scores[group_name] / scores['Total']

    #eNPS = share of Promoters minus share of Detractors:
    scores['eNPS %'] = scores['Promoters %'] - scores['Detractors %']

    #Store the result and register the sheet with its description:
    excel_dfs[excel_sheet] = scores
    excel_sheets_list.append({'Excel Sheet': excel_sheet, 'Description': desc})

# Statewide Calculations:

In [8]:
#Calculating the statewide Employee Engagement and Satisfaction Scores:
calculate_index_percents(["Employee Index"], "sw_emp_idx_scr",
                         "Statewide Employee Index Scores")

#Calculating the statewide Employee Engagement and Satisfaction Scores by Work Setting:
calculate_index_percents(["Employee Index", "Work Setting"], "sw_emp_idx_scr_by_wrk_set",
                         "Statewide Employee Index Scores by Work Setting")

#Calculating the statewide Index Scores:
calculate_index_percents(["Index"], "sw_idx_scr",
                         "Statewide Index Scores")

#Calculating the statewide results for Statements/Questions
calculate_index_percents(["Index", "Statement/Question"], "sw_sq_scr",
                         "Statewide Statement and Question Scores")

#Calculating Statewide eNPS Score:
calculate_eNPS(['OKSEES Year'], 'sw_eNPS', 'Statewide Employee Net Promoter Score (eNPS)')

#Calculating Response Demographics for the State:
caculate_demographic_percents(["OKSEES Year"], "Generation", "sw_resp_by_gen", 
                              "Statewide Responses by Generation")

caculate_demographic_percents(["OKSEES Year"], "Work Setting", "sw_resp_by_wrk_set", 
                              "Statewide Responses by Work Setting")

#The description is written to the workbook, so its spelling matters ("Agency", not "Agnecy"):
caculate_demographic_percents(["OKSEES Year"], "Agency Tenure", "sw_resp_by_agcy_tenure",
                              "Statewide Responses by Agency Tenure")

#Reading in Statewide Response Rate file
excel_dfs["sw_resp_rate"] = pd.read_excel('OKSEES Response Rate.xlsx')
excel_sheets_list.append({'Excel Sheet': "sw_resp_rate", 'Description': 'Statewide Response Rate'})


# Calculating statewide error (currently disabled):
#sw_err_rate  = demo_graph_df.groupby('OKSEES Year')['Agency'].apply(lambda x: (x == 'Other').sum() / len(x)).reset_index()
# Rename the column to Error Rate % 
#sw_err_rate .rename(columns={'Agency': 'Response Error Rate %'}, inplace=True)
#Appending Information 
#excel_dfs["sw_resp_err_rate"] = sw_err_rate
#excel_sheets_list.append({'Excel Sheet': "sw_resp_err_rate", 'Description': 'Statewide Response Error Rate'})

# Cabinet Calculations:

In [9]:
#Calculating cabinet index scores
calculate_index_percents(groups=["Employee Index", "Cabinet"],
                         excel_sheet="cab_emp_idx_scr",
                         desc="Cabinet Employee Index Scores")

# Agency Calculations:

In [10]:
#Employee Engagement and Satisfaction Scores by Agency:
calculate_index_percents(groups=["Employee Index", "Agency", "Cabinet"],
                         excel_sheet="agcy_emp_idx_scr",
                         desc="Agencies Employee Index Scores")

#Index Scores by Agency:
calculate_index_percents(groups=["Index", "Agency", "Cabinet"],
                         excel_sheet="agcy_idx_scr",
                         desc="Agencies Index Scores")

#Statement/Questions Scores by Agency:
calculate_index_percents(groups=["Index", "Statement/Question", "Agency", "Cabinet"],
                         excel_sheet="agcy_sq_scr",
                         desc="Agencies Statement and Question Scores")

#Agency eNPS:
calculate_eNPS(groups=['OKSEES Year', "Agency", "Cabinet"],
               excel_sheet='agcy_eNPS',
               desc='Agencies Employee Net Promoter Score (eNPS)')

#Agency eNPS Score by Generation (currently disabled):
#calculate_eNPS(['OKSEES Year', "Agency", "Generation"], 'agcy_eNPS_by_gen', 
               #'Agencies Employee Net Promoter Score (eNPS) by Generation')

#Agency eNPS Score by Work Setting (currently disabled):
#calculate_eNPS(['OKSEES Year', "Agency", "Work Setting"], 'agcy_eNPS_wrk_set', 
               #'Agencies Employee Net Promoter Score (eNPS) by Work Setting')

#Response Demographics for Agencies:
caculate_demographic_percents(groups=["OKSEES Year", "Agency", "Cabinet"],
                              demo="Generation",
                              excel_sheet="agcy_resp_by_gen",
                              desc="Agencies Responses by Generation")

caculate_demographic_percents(groups=["OKSEES Year", "Agency", "Cabinet"],
                              demo="Work Setting",
                              excel_sheet="agcy_resp_by_wrk_set",
                              desc="Agencies Responses by Work Setting")

caculate_demographic_percents(groups=["OKSEES Year", "Agency", "Cabinet"],
                              demo="Agency Tenure",
                              excel_sheet="agcy_resp_by_agcy_tenure",
                              desc="Agencies Responses by Agency Tenure")


In [11]:
#Building one dataframe of all the Excel sheet names and their descriptions
    #and updating it in the dictionary.
    #The frame is rebuilt from excel_sheets_list rather than concatenated onto the stored one,
    #so re-running this cell does not duplicate the rows. Assigning to the existing key
    #keeps the sheet in its original (first) position in the dictionary.
excel_dfs["Excel Sheets Descriptions"] = pd.DataFrame(excel_sheets_list,
                                                      columns=['Excel Sheet', 'Description'])

In [12]:
#Writing Info to excel:

#Excel does not allow a column to be wider than 255 characters, so widths are capped there
    #(long free-text answers would otherwise ask for far more).
MAX_EXCEL_COL_WIDTH = 255

#The writer is used as a context manager so the workbook is always closed,
    #even if writing one of the sheets raises an error.
with pd.ExcelWriter('OKSEES Results.xlsx', engine='xlsxwriter') as writer:

    #Setting Percent Format Up:
    percent_fmt = {'num_format': '0.0%'}
    percent_format = writer.book.add_format(percent_fmt)

    # loop through `dict` of dataframes
    for sheet, df in excel_dfs.items():
        df.to_excel(writer, sheet_name=sheet, index=False)

        #Saves Dataframe with filters turned on (skipped for a frame without columns,
            #where the last-column index would be -1):
        worksheet = writer.sheets[sheet]
        if df.shape[1] > 0:
            worksheet.autofilter(0, 0, df.shape[0], df.shape[1] - 1)

        #Setting Tab Colors for easier separation:
        if sheet.startswith('sw_resp'):
            worksheet.set_tab_color('#A96728') #Prairie Gold - Dark
        elif sheet.startswith('sw'):
            worksheet.set_tab_color('#DE9027') #Prairie Gold - Bright
        elif sheet.startswith('agcy_resp'):
            worksheet.set_tab_color('#326820') #Woodland Green - Dark
        elif sheet.startswith('agcy'):
            worksheet.set_tab_color('#669B41') #Woodland Green - Bright
        elif sheet.startswith('cab'):
            worksheet.set_tab_color('#D15420') #Red Clay - Bright
        else:
            worksheet.set_tab_color('#1CA6DF') #Sky Blue - Bright

        #Adjusts sizes of columns for easier reading.
            #Columns are addressed by position so duplicate or non-string headers also work.
        for col_idx, column in enumerate(df.columns):
            header = str(column)
            cell_lengths = df.iloc[:, col_idx].astype(str).map(len)
            #An empty frame has no cells to measure; .max() would give NaN there:
            longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
            column_length = min(max(longest_cell, len(header) + 4), #4 for a bit of extra space
                                MAX_EXCEL_COL_WIDTH)
            if header.endswith('%'):
                worksheet.set_column(col_idx, col_idx, column_length, percent_format)
            else:
                worksheet.set_column(col_idx, col_idx, column_length)