#1: Packages, read in mapping files, and define folders

In [1]:
import pandas as pd
import os
import string

# ---------------------------------------------------------------------------
# Output containers
# ---------------------------------------------------------------------------
# excel_dfs maps sheet name -> DataFrame; every entry is written to the final
# workbook. The (still empty) descriptions sheet is registered first so that it
# is the first key in the dict and therefore the first sheet that gets written.
excel_sheets_info = pd.DataFrame(columns=['Excel Sheet', 'Description'])
excel_dfs = {"Excel Sheets Descriptions": excel_sheets_info}

# One {'Excel Sheet': ..., 'Description': ...} record is appended here for each
# exported sheet; the records are turned into the descriptions sheet at export time.
excel_sheets_list = []

# ---------------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------------
# Folder holding the OSDH survey data, one Excel workbook per survey.
# NOTE(review): the name is used verbatim as a path - presumably it matches the
# (misspelled) directory on disk, so do not "fix" the spelling here without
# renaming the folder as well.
folder_path = 'OSDH Histroical Survey Data Backups'

# Old survey item wording -> updated wording, used to harmonise column names
# across the individual survey workbooks.
survey_item_mapping_df = pd.read_excel("Survey Item Language Update List.xlsx")
survey_item_mapping = {
    old_wording: new_wording
    for old_wording, new_wording in zip(
        survey_item_mapping_df['Survey Item'],
        survey_item_mapping_df['Updated New Survey Item Language'],
    )
}

# Index information for the survey items ("Indexes" sheet).
survey_item_index_df = pd.read_excel("Survey Item Indexes.xlsx", sheet_name="Indexes")

#2: Loop through excel file and update survey items

In [2]:
# Collect one DataFrame per survey workbook, then stack them into a single frame.
dfs = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of
# the combined DataFrame reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    # Excel creates "~$<name>.xlsx" lock files while a workbook is open. They end
    # in .xlsx but are not readable workbooks, so skip them.
    if filename.startswith('~$'):
        continue

    # Compare the extension case-insensitively so ".XLSX" / ".XLS" are included.
    if not filename.lower().endswith(('.xlsx', '.xls')):
        continue

    file_path = os.path.join(folder_path, filename)

    # Read every column as text to avoid integer/text dtype clashes when the
    # DataFrames are concatenated.
    df = pd.read_excel(file_path, dtype=str)

    # Rename columns whose wording has an updated version in survey_item_mapping.
    df = df.rename(columns=survey_item_mapping)

    dfs.append(df)

# pd.concat raises a generic ValueError on an empty list; raise the same
# exception type with a message that points at the actual problem.
if not dfs:
    raise ValueError(f"No Excel workbooks (.xlsx/.xls) found in folder '{folder_path}'.")

# Concatenate all survey DataFrames into one DataFrame.
all_OSDH_survey_df = pd.concat(dfs, ignore_index=True)

#3: Minor Data cleaning and value remappings.

In [3]:
# Five-point items appear with inconsistent capitalisation; map the lower-case
# variants onto one canonical spelling.
five_point_value_remapping = {
    'Strongly agree':    'Strongly Agree',
    'Strongly disagree': 'Strongly Disagree',
    'Very satisfied':    'Very Satisfied',
    'Very dissatisfied': 'Very Dissatisfied',
}

# The DEI background question is shortened because its full text causes
# character-length issues in Tableau dataframe joins (and is awkward to reference).
dei_background_item_full = (
    "Employees at my agency can reach their highest potential, regardless of their background"
    " (e.g., all ages, cultural backgrounds, genders, races, religions, etc.)."
)
dei_background_item_short = (
    "Employees at my agency can reach their highest potential, regardless of their background."
)

all_OSDH_survey_df = (
    all_OSDH_survey_df
    # Only kept in the original files to verify the data was OSDH; no longer needed.
    .drop(columns=["Which agency are you a part of?"])
    # Applied across the entire DataFrame, not just the five-point columns.
    .replace(five_point_value_remapping)
    .rename(columns={dei_background_item_full: dei_background_item_short})
)

#4: Clean Burnout columns.

In [4]:
# Builds three burnout columns:
#   "Burnout Scale"               numeric 0-10 value
#   "Burnout Group"               Low / Moderate / High bucket of the scale
#   "I am experiencing burnout."  favorability recode of the bucket
#
# All replacements are assigned back to the DataFrame. Calling
# df[col].replace(..., inplace=True) acts on a column selection: it emits a
# FutureWarning in pandas 2.2 and no longer updates the DataFrame under
# Copy-on-Write, which would leave text labels that to_numeric coerces to NaN.

# Strip the text in parentheses from the end points of the 0-10 burnout scale.
burnout_mapping = {
    '0 (Never)':   '0',
    '10 (Always)': '10'}

# Rename the question to a shorter version to make referencing easier in Tableau.
all_OSDH_survey_df = all_OSDH_survey_df.rename(columns={
    ("Currently in your job, please rate how often you experience burnout on a scale of 0-10," +
     " where 0 is never experiencing burnout and 10 is always experiencing burnout."): "Burnout Scale"})

# Apply the burnout end-point mapping.
all_OSDH_survey_df["Burnout Scale"] = all_OSDH_survey_df["Burnout Scale"].replace(burnout_mapping)

# Put the OKSEES 2021 agreement answers onto the same 0-10 scale.
burnout_oksees_2021_mapping = {
    'Strongly Disagree': '0',    # Favorable,   Low (0-3)
    'Disagree':          '2.5',  # Favorable,   Low (0-3)
    'Neutral':           '5',    # Neutral,     Moderate (4-6)
    'Agree':             '7.5',  # Unfavorable, High (7-10)
    'Strongly Agree':    '10'}   # Unfavorable, High (7-10)

oksees_2021_burnout = all_OSDH_survey_df["I feel burned out from my work."].replace(
    burnout_oksees_2021_mapping)

# Fill gaps in the 0-10 column with the rescaled OKSEES 2021 answers.
all_OSDH_survey_df["Burnout Scale"] = all_OSDH_survey_df["Burnout Scale"].fillna(oksees_2021_burnout)

# The original agreement column is no longer needed.
all_OSDH_survey_df = all_OSDH_survey_df.drop(columns=["I feel burned out from my work."])

# Convert to numeric; anything that is not a number becomes NaN.
all_OSDH_survey_df["Burnout Scale"] = pd.to_numeric(all_OSDH_survey_df["Burnout Scale"], errors='coerce')

# Bucket the scale. Intervals are right-closed: (-inf, 3], (3, 6], (6, 10];
# NaN and values above 10 stay unassigned.
all_OSDH_survey_df['Burnout Group'] = pd.cut(
    all_OSDH_survey_df['Burnout Scale'],
    bins=[-float('inf'), 3, 6, 10],
    labels=['Low (0-3)', 'Moderate (4-6)', 'High (7-10)'])

# Favorability of each burnout group, used to graph burnout like a survey item.
burnout_favorability = {
    'Low (0-3)':      'Favorable',
    'Moderate (4-6)': 'Neutral',
    'High (7-10)':    'Unfavorable'}

all_OSDH_survey_df['I am experiencing burnout.'] = (
    all_OSDH_survey_df['Burnout Group'].map(burnout_favorability))

#5: Clean eNPS columns.

In [5]:
# Builds three eNPS columns:
#   "eNPS Scale"  numeric 0-10 value
#   "eNPS Group"  Detractors / Passives / Promoters bucket of the scale
#   "I would recommend my agency as a good place to work."
#                 favorability recode of the scale
#
# The replacement is assigned to a new Series instead of calling
# df[col].replace(..., inplace=True): the in-place form acts on a column
# selection, emits a FutureWarning in pandas 2.2 and no longer updates the
# DataFrame under Copy-on-Write.

# Rename the question to a shorter version to make referencing easier in Tableau.
all_OSDH_survey_df = all_OSDH_survey_df.rename(columns={
    ("On a scale of 0-10, how likely are you to recommend to others that OSDH is a great place to work, "
     + "where 0 is not at all likely to recommend and 10 is extremely likely to recommend?"): "eNPS Scale"})

# Put the OKSEES 2021 agreement answers onto the same 0-10 scale.
recommend_workplace_mapping = {
    'Strongly Disagree': '0',    # Detractors (0-6)
    'Disagree':          '2.5',  # Detractors (0-6)
    'Neutral':           '5',    # Detractors (0-6)
    'Agree':             '7.5',  # Passives (7-8)
    'Strongly Agree':    '10'}   # Promoters (9-10)

oksees_2021_recommend = all_OSDH_survey_df["I would recommend my agency as a good place to work."].replace(
    recommend_workplace_mapping)

# Fill gaps in the 0-10 eNPS column with the rescaled OKSEES 2021 answers.
all_OSDH_survey_df["eNPS Scale"] = all_OSDH_survey_df["eNPS Scale"].fillna(oksees_2021_recommend)

# Drop the original agreement column; it is rebuilt below as a favorability column.
all_OSDH_survey_df = all_OSDH_survey_df.drop(
    columns=["I would recommend my agency as a good place to work."])

# Convert to numeric; anything that is not a number becomes NaN.
all_OSDH_survey_df["eNPS Scale"] = pd.to_numeric(all_OSDH_survey_df["eNPS Scale"], errors='coerce')

# Create the "eNPS Group" column. Intervals are right-closed:
# (-inf, 6], (6, 8], (8, 10]; NaN and values above 10 stay unassigned.
all_OSDH_survey_df['eNPS Group'] = pd.cut(
    all_OSDH_survey_df["eNPS Scale"],
    bins=[-float('inf'), 6, 8, 10],
    labels=['Detractors (0-6)', 'Passives (7-8)', 'Promoters (9-10)'])

# Recreate the survey item column as a favorability recode of the scale:
# (-inf, 3] Unfavorable, (3, 6] Neutral, (6, 10] Favorable.
all_OSDH_survey_df["I would recommend my agency as a good place to work."] = pd.cut(
    all_OSDH_survey_df["eNPS Scale"],
    bins=[-float('inf'), 3, 6, 10],
    labels=['Unfavorable', 'Neutral', 'Favorable'])


#6: Rework Intent to Stay Column:

#7: Define function to set up data exports for specific dataframe:

In [6]:
def set_up_data_export(list, excel_sheet, desc):
    """Register a column subset of ``all_OSDH_survey_df`` as a sheet to export.

    Parameters
    ----------
    list : list of str
        Column labels to select from ``all_OSDH_survey_df``. (The name shadows
        the ``list`` builtin inside this function; it is kept so existing calls
        keep working.)
    excel_sheet : str
        Key under which the selection is stored in ``excel_dfs``; the export
        step uses it as the sheet name.
    desc : str
        Description recorded for the sheet in ``excel_sheets_list``.

    Raises
    ------
    KeyError
        If any requested column is missing from ``all_OSDH_survey_df``.
    """
    # Record the sheet's description; the records are collected into the
    # descriptions sheet when the workbook is exported.
    excel_sheets_list.append({'Excel Sheet': excel_sheet, 'Description': desc})

    # Store the selected columns as they are at call time.
    excel_dfs[excel_sheet] = all_OSDH_survey_df[list]

# Sheets registered at this point: (columns, sheet name, description).
# Order matters - it is the order in which the sheets end up in excel_dfs.
basic_exports = (
    # Meta data
    (["uniqueidentifier", "Co", "Finished", "SurveyName", "SurveyStartDate", "SurveyCloseDate"],
     "meta_data",
     "Contains meta data for respondents."),
    # eNPS data
    (["uniqueidentifier", "eNPS Scale", "eNPS Group"],
     "eNPS_data",
     "Contains eNPS response data."),
    # Burnout data
    (["uniqueidentifier", "Burnout Scale", "Burnout Group"],
     "burnout_data",
     "Contains burnout response data."),
)

for export_columns, export_sheet, export_description in basic_exports:
    set_up_data_export(export_columns, export_sheet, export_description)
              

#8: Remap survey item values to favorability scale before data export set up:

REMOVE THE CODE BELOW AFTER the OSDHEES 2025 data is added into the file, since the service column will then populate automatically when the files are read in.

In [7]:
# Add the service-quality item as a blank placeholder column, but only when no
# loaded workbook already supplied it. An unconditional assignment would overwrite
# real responses with None as soon as a workbook containing this item is read in.
service_quality_item = "This agency delivers high-quality services with a commitment to continuous improvement."

if service_quality_item not in all_OSDH_survey_df.columns:
    all_OSDH_survey_df[service_quality_item] = None

In [8]:
# All survey items listed in the index DataFrame, as a plain list.
survey_item_list = survey_item_index_df['Survey Item'].to_list()

# Agreement / satisfaction answers -> favorability category.
favorability_mapping = {
    **dict.fromkeys(['Strongly Agree', 'Agree', 'Very Satisfied', 'Satisfied'], 'Favorable'),
    'Neutral': 'Neutral',
    **dict.fromkeys(['Disagree', 'Strongly Disagree', 'Dissatisfied', 'Very Dissatisfied'], 'Unfavorable'),
}

# Recode each listed survey item that is present in the DataFrame; items that
# are listed in the index but absent from the data are skipped here.
for survey_item in survey_item_list:
    if survey_item not in all_OSDH_survey_df.columns:
        continue
    all_OSDH_survey_df[survey_item] = all_OSDH_survey_df[survey_item].replace(favorability_mapping)

# Register the survey item responses as a sheet.
# NOTE: unlike the loop above, this selects every entry of survey_item_list, so
# a listed item that is missing from the DataFrame raises a KeyError here.
set_up_data_export(
    ["uniqueidentifier", "SurveyName", *survey_item_list],
    "survey_item_data",
    "Contains survey item response data.",
)

#9: Generate Index File

In [9]:
# 9.1 Work out which survey items were asked on which survey.
# Reshape to long format: one row per (respondent row, survey item).
melted_df = pd.melt(
    all_OSDH_survey_df,
    id_vars=['SurveyName'],
    value_vars=survey_item_list,
    var_name='Survey Item',
    value_name='Response',
)

# An item counts as "asked" on a survey if it has at least one non-null response.
filtered_df = melted_df.loc[melted_df['Response'].notna()]

# Distinct (survey, item) pairs.
result = filtered_df.loc[:, ['SurveyName', 'Survey Item']].drop_duplicates()

# 9.2 Read in the index key and attach it to the (survey, item) pairs.
survey_item_index_key_df = pd.read_excel("Survey Item Indexes.xlsx", sheet_name="Index Key")

# Inner join: pairs whose item has no row in the index key are dropped.
comprehensive_survey_index_key = pd.merge(
    result,
    survey_item_index_key_df,
    how='inner',
    on='Survey Item',
)

# Register the index sheet and its description for export.
index_excel_sheet = {
    'Excel Sheet': "Survey Indexes",
    'Description': "Contains corresponding Indexes for Survey Items.",
}
excel_sheets_list.append(index_excel_sheet)
excel_dfs["Survey Indexes"] = comprehensive_survey_index_key


#10: Export and save to Excel file

In [10]:
# Build the descriptions sheet directly from the collected records. Assigning to
# the existing key keeps the sheet in first position, and rebuilding it from
# scratch (instead of concatenating onto the previous value) means re-running
# this cell does not duplicate the description rows.
excel_dfs["Excel Sheets Descriptions"] = pd.DataFrame(
    excel_sheets_list, columns=['Excel Sheet', 'Description'])

HEADER_PADDING = 10            # extra characters added to the header length for readability
MAX_EXCEL_COLUMN_WIDTH = 255   # Excel does not allow wider columns

# The context manager saves and closes the workbook even if writing a sheet fails.
with pd.ExcelWriter("OSDH Survey Data (Tableau).xlsx", engine='xlsxwriter') as writer:

    # Write every registered DataFrame to its own sheet.
    for sheet, df in excel_dfs.items():
        df.to_excel(writer, sheet_name=sheet, index=False)
        worksheet = writer.sheets[sheet]

        # Turn on filters for the header row and all data rows.
        # A frame without columns has no valid filter range.
        if df.shape[1] > 0:
            worksheet.autofilter(0, 0, df.shape[0], df.shape[1] - 1)

        # Size each column to its longest value or its padded header, whichever
        # is larger. Columns are addressed by position so duplicate or
        # non-string column labels cannot break the lookup.
        for col_idx, column in enumerate(df.columns):
            value_lengths = df.iloc[:, col_idx].astype(str).map(len)
            # max() of an empty Series is NaN, so fall back to 0 for empty sheets.
            longest_value = int(value_lengths.max()) if len(value_lengths) else 0
            column_length = max(longest_value, len(str(column)) + HEADER_PADDING)
            worksheet.set_column(col_idx, col_idx, min(column_length, MAX_EXCEL_COLUMN_WIDTH))