In [1]:
import pandas as pd
import glob
from datetime import datetime
import numpy as np
from scipy.stats import chisquare, fisher_exact, chi2_contingency

#import os
#os.environ["R_HOME"] = r"C:\Program Files\R\R-4.2.0"
#import rpy2.robjects.numpy2ri
#from rpy2.robjects.packages import importr

In [2]:
# Collect the raw survey exports. glob.glob() returns matches in an order that
# depends on the file system, while later cells index the per-survey result
# lists by position, so sort the paths to make that order deterministic.
# NOTE(review): lexicographic order is chronological only if the file names
# end in a zero-padded 'YYYY_MM' tag — confirm against the actual exports.
files = sorted(glob.glob('./Pulse_Survey_Data*'))

# Question key; it is joined on the 'Question Number' column further down.
df_def = pd.read_excel('Pulse Survey Question Key.xlsx', sheet_name='Sheet1')

In [3]:
# Columns retained from every raw export: completion progress, the NPS group
# derived from Q25, and the answers to Q1 through Q24.
request_columns = ['Progress', 'Q25*_NPS_GROUP', *(f'Q{num}*' for num in range(1, 25))]

# Burnout score bin edges and the label given to each bin.
bo_ranges = [0, 3, 7, 10]
bo_labels = ['Low', 'Mid', 'High']

# Answer values counted as favorable (both capitalisations of
# "Strongly agree" are listed).
favorable_responses = [
    "Agree", "Strongly agree", "Strongly Agree",
    "Satisfied", "Very satisfied",
    "Low",
    "Promoter",
]

# Answer values counted as neutral.
neutral_responses = ['Neutral', 'Mid', 'Passive']

# Answer values counted as unfavorable.
unfavorable_responses = [
    'Disagree', 'Strongly disagree', 'Strongly Disagree',
    'Very dissatisfied', 'Dissatisfied',
    'High',
    'Detractor',
]

# Accumulators filled once per survey file: favorability tables, burnout score
# tallies, and the number of completed responses.
dfs_to_concat, burnout_dfs_list, num_resp_on_survey = [], [], []

# Without any survey export the concat below would fail with pandas' opaque
# "No objects to concatenate"; stop early with an explicit message instead.
if not files:
    raise FileNotFoundError("No Pulse Survey data files were found to process.")

for filename in files:
    # The 7 characters before a 5-character extension are used as the survey
    # month tag (e.g. '2023_06' for a name ending in '2023_06.xlsx').
    # NOTE(review): assumes every file name ends in 'YYYY_MM.xlsx' — confirm.
    survey_tag = filename[-12:-5]

    df = pd.read_excel(filename)

    # Keep only the requested columns (filter() silently skips absent ones).
    df = df.filter(request_columns)

    # Completed responses: progress above 88 and the burnout question answered.
    # .copy() makes the subset an independent frame so the column assignments
    # below do not raise SettingWithCopyWarning.
    df = df[(df['Progress'] > 88) & (df['Q24*'].notna())].copy()

    # Clean the burnout column: strip every non-digit character from string
    # answers, then convert to numbers (anything unparseable becomes NaN).
    # NOTE(review): a decimal point would also be stripped ('7.5' -> '75');
    # presumably the answers are whole numbers with optional text — confirm.
    df['Q24*'] = pd.to_numeric(df['Q24*'].replace(r'\D', '', regex=True), errors='coerce')

    # Tally of respondents per raw burnout score for this survey month.
    burnout_count_df = df.groupby('Q24*').size().reset_index(name='Count_' + survey_tag)
    burnout_dfs_list.append(burnout_count_df)

    # Keep the numeric score under 'Burnout Level' and expose the NPS group as
    # 'Q25*', so that 'Q24*' and 'Q25*' can be scored like any other question.
    df = df.rename(columns={'Q24*': 'Burnout Level', 'Q25*_NPS_GROUP': 'Q25*'})

    # 'Q24*' now holds the burnout group (Low / Mid / High) of each respondent.
    df['Q24*'] = pd.cut(df['Burnout Level'], bins=bo_ranges, labels=bo_labels, include_lowest=True)

    # Number of completed responses in this survey.
    num_resp_on_survey.append(len(df))

    # Score Q1-Q24, plus Q25 when the NPS group column was present in the file.
    last_question = 25 if 'Q25*' in df.columns else 24

    # One record per question with the count and share of each response group.
    calculations = []

    for i in range(1, last_question + 1):
        column = f'Q{i}*'
        answers = df[column]

        # Shares are relative to the respondents who answered this question.
        total_count = int(answers.notna().sum())

        record = {'Question Number': column}
        for label, responses in (('Favorable', favorable_responses),
                                 ('Unfavorable', unfavorable_responses),
                                 ('Neutral', neutral_responses)):
            group_count = int(answers.isin(responses).sum())
            # Stored as a proportion (0-1); 0 when nobody answered the question.
            record[f'{label} Percentage'] = (group_count / total_count) if total_count > 0 else 0
            record[f'{label} Count'] = group_count

        calculations.append(record)

    results_df = pd.DataFrame(calculations)

    # Rank questions within this survey month (rank 1 = most favorable).
    results_df['Rank'] = results_df['Favorable Percentage'].rank(ascending=False)

    # Tag every row with the survey month parsed from the file name.
    results_df['Survey Month Date'] = survey_tag
    results_df['Survey Month Date'] = pd.to_datetime(results_df['Survey Month Date'], format='%Y_%m')

    # Attach the question metadata; questions missing from the key are dropped
    # by the inner join.
    results_df = pd.merge(results_df, df_def, on='Question Number', how='inner')

    dfs_to_concat.append(results_df)

# All survey months stacked into one long table.
compare_df = pd.concat(dfs_to_concat, ignore_index=True)

with pd.ExcelWriter('Favorability_%_2023.xlsx') as writer:
    compare_df.to_excel(writer, sheet_name='June_Dec_2023', index=False)

In [4]:
# Split the combined results into the two survey months being compared.
df_june = compare_df[compare_df['Survey Month Date'] == '2023-06-01']
df_december = compare_df[compare_df['Survey Month Date'] == '2023-12-01']

# One row per question, with June and December figures side by side.
diff_df = pd.merge(df_june, df_december, on='Question Number', suffixes=('_June', '_December'))

# Change in the favorable share from June to December.
diff_df['Favorable Percentage Difference'] = (
    diff_df['Favorable Percentage_December'] - diff_df['Favorable Percentage_June']
)

# Expected December counts if the June response mix had stayed the same.
# The June mix is scaled to each question's own December total of categorised
# answers rather than to a positional entry of num_resp_on_survey: this keeps
# expected and observed totals equal (scipy.stats.chisquare requires it) even
# when a question was skipped or holds an uncategorised answer, and it does not
# depend on the order in which the survey files were read.
response_groups = ['Favorable', 'Unfavorable', 'Neutral']
june_categorised = diff_df[[f'{group} Count_June' for group in response_groups]].sum(axis=1)
december_categorised = diff_df[[f'{group} Count_December' for group in response_groups]].sum(axis=1)

for group in response_groups:
    # 0/0 (no categorised June answers) yields NaN; treat it as a zero share.
    june_share = (diff_df[f'{group} Count_June'] / june_categorised).fillna(0)
    diff_df['Expected ' + group] = june_share * december_categorised

#diff_df
        
#diff_df

In [5]:
def chi_square_test(row, alpha=0.05, min_expected=5):
    """Chi-square goodness-of-fit test of December counts against expected counts.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'Neutral Count_December', 'Unfavorable Count_December',
        'Favorable Count_December' (observed) and 'Expected Neutral',
        'Expected Unfavorable', 'Expected Favorable' (expected).
    alpha : float, default 0.05
        Significance level the p-value is compared against.
    min_expected : float, default 5
        Smallest expected count for which the test is run.

    Returns
    -------
    pandas.Series
        Keys 'Significant', 'p_value' and 'Test Method'. When any expected
        count is below ``min_expected`` or any value is missing, no test is
        run and the result is 'NA' / None / None.
    """
    # A row taken from a mixed-dtype frame yields object arrays; cast to float
    # so the comparisons and scipy operate on plain numeric data.
    observed_values = row[['Neutral Count_December',
                           'Unfavorable Count_December',
                           'Favorable Count_December']].to_numpy(dtype=float)
    expected_values = row[['Expected Neutral',
                           'Expected Unfavorable',
                           'Expected Favorable']].to_numpy(dtype=float)

    # NaN compares False against any threshold, so missing values have to be
    # checked explicitly or they would slip through to chisquare().
    testable = (np.isfinite(observed_values).all()
                and np.isfinite(expected_values).all()
                and (expected_values >= min_expected).all())

    if not testable:
        # No fallback test is implemented (an R-based Fisher exact test was
        # only sketched in earlier commented-out code), so report "not tested".
        p_value = None
        test_method = None
        significant = 'NA'
    else:
        chi2_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
        test_method = 'Chi-Square Goodness of Fit'
        # Significant when the p-value falls below the chosen level.
        significant = p_value < alpha

    return pd.Series({'Significant': significant, 'p_value': p_value, 'Test Method': test_method})

# Run the goodness-of-fit test once per question (one row each), then attach
# the three result fields to diff_df as new columns.
question_test_results = diff_df.apply(chi_square_test, axis=1)
diff_df[['Significant', 'p_value', 'Test Method']] = question_test_results

In [6]:
# Test whether the response mix changed from June to December within each
# primary area of interest, by pooling the per-question counts of that area.
kpi_count_columns = [f'{group} Count_{month}'
                     for month in ('June', 'December')
                     for group in ('Neutral', 'Favorable', 'Unfavorable')]

KPIS = (
    diff_df
    .groupby('Primary Area of Intrest_December')[kpi_count_columns]
    .sum()
    .reset_index()
    # Drop the merge suffix from the grouping column for readability.
    .rename(columns={'Primary Area of Intrest_December': 'Primary Area of Intrest'})
)

# Total categorised answers per area in each survey month.
kpi_june_total = (KPIS['Neutral Count_June']
                  + KPIS['Favorable Count_June']
                  + KPIS['Unfavorable Count_June'])
kpi_december_total = (KPIS['Neutral Count_December']
                      + KPIS['Favorable Count_December']
                      + KPIS['Unfavorable Count_December'])

# Expected December counts: June's share of each group applied to the
# December total of the same area.
for kpi_group in ('Neutral', 'Favorable', 'Unfavorable'):
    KPIS['Expected ' + kpi_group] = (KPIS[f'{kpi_group} Count_June'] / kpi_june_total) * kpi_december_total

KPIS[['Significant', 'p_value', 'Test Method']] = KPIS.apply(chi_square_test, axis=1)

# Show the per-area results as the cell output.
KPIS

Unnamed: 0,Primary Area of Intrest,Neutral Count_June,Favorable Count_June,Unfavorable Count_June,Neutral Count_December,Favorable Count_December,Unfavorable Count_December,Expected Neutral,Expected Favorable,Expected Unfavorable,Significant,p_value,Test Method
0,Advancement Opportunities,125,140,124,138,106,134,121.465296,136.041131,120.493573,True,0.00552,Chi-Square Goodness of Fit
1,Agency Perceptions,413,1210,322,442,1081,367,401.321337,1175.784062,312.894602,True,2.6e-05,Chi-Square Goodness of Fit
2,Burnout and Work/Life Balance,250,434,94,252,392,112,242.930591,421.727506,91.341902,True,0.028636,Chi-Square Goodness of Fit
3,Communication,179,431,168,155,419,182,173.938303,418.812339,163.249357,False,0.121496,Chi-Square Goodness of Fit
4,Compensation,99,145,145,105,126,147,96.200514,140.899743,140.899743,False,0.266521,Chi-Square Goodness of Fit
5,Engagement,126,1005,36,120,960,54,122.437018,976.580977,34.982005,True,0.004822,Chi-Square Goodness of Fit
6,Fair Treatment,178,480,120,178,417,161,172.966581,466.426735,116.606684,True,1.4e-05,Chi-Square Goodness of Fit
7,Feeling Valued,193,451,134,182,439,135,187.542416,438.246787,130.210797,False,0.843143,Chi-Square Goodness of Fit
8,Overall Employee Satisfaction,195,878,94,166,866,102,189.485861,853.172237,91.341902,False,0.113753,Chi-Square Goodness of Fit
9,Teamwork,68,263,58,79,231,68,66.077121,255.562982,56.359897,True,0.026092,Chi-Square Goodness of Fit


In [7]:
# Reduce diff_df to the reporting columns (the raw June / December favorable
# shares are deliberately left out), re-attach the question metadata from the
# key, and rank the questions by their change in favorable share
# (rank 1 = largest increase).
diff_df = (
    diff_df
    .filter(['Question Number',
             'Favorable Percentage Difference',
             'Significant',
             'p_value',
             'Test Method'])
    .merge(df_def, on='Question Number', how='inner')
    .assign(**{
        'Favorable % Difference Rank':
            lambda frame: frame['Favorable Percentage Difference'].rank(ascending=False)
    })
)

# Export the per-question differences.
with pd.ExcelWriter('Favorability_%_Diff_2023.xlsx') as writer:
    diff_df.to_excel(writer, sheet_name='Differences', index=False)

In [24]:
# Line up the two per-score burnout tallies. An outer join keeps scores that
# occur in only one survey; with an inner join those respondents vanish from
# the table and from the totals computed below, which skews every expected
# count. Missing tallies mean "nobody chose this score", i.e. zero.
burnout_merge = pd.merge(burnout_dfs_list[0], burnout_dfs_list[1], on='Q24*', how='outer')
burnout_count_columns = ['Count_2023_06', 'Count_2023_12']
burnout_merge[burnout_count_columns] = burnout_merge[burnout_count_columns].fillna(0).astype(int)
burnout_merge = burnout_merge.sort_values('Q24*').reset_index(drop=True)

# Respondents per survey month across all burnout scores.
burnout_june_total = burnout_merge['Count_2023_06'].sum()
burnout_december_total = burnout_merge['Count_2023_12'].sum()

# Expected December respondents at this score if June's share had held.
burnout_merge['Expected_in_lvl'] = (burnout_merge['Count_2023_06'] / burnout_june_total
                                    * burnout_december_total)

# Observed December respondents at any other score.
burnout_merge['Count_2023_12_out_lvl'] = burnout_december_total - burnout_merge['Count_2023_12']

# Expected December respondents at any other score, from June's share.
burnout_merge['Expected_out_lvl'] = ((burnout_june_total - burnout_merge['Count_2023_06'])
                                     / burnout_june_total
                                     * burnout_december_total)

# Mark the original tallies as the "in level" counts.
# NOTE(review): a score that nobody chose in June has an expected in-level
# count of 0, for which a chi-square test is not meaningful — review such rows.
burnout_merge = burnout_merge.rename(columns={'Count_2023_06': 'Count_2023_06_in_lvl',
                                              'Count_2023_12': 'Count_2023_12_in_lvl'})


def bo_chi_square_test(row, alpha=0.05, min_expected=5):
    """Chi-square goodness-of-fit test for one burnout score (in vs. out of level).

    Parameters
    ----------
    row : pandas.Series
        Must contain 'Count_2023_12_in_lvl' and 'Count_2023_12_out_lvl'
        (observed) and 'Expected_in_lvl' and 'Expected_out_lvl' (expected).
    alpha : float, default 0.05
        Significance level the p-value is compared against.
    min_expected : float, default 5
        Smallest expected count for which the test is run.

    Returns
    -------
    pandas.Series
        Keys 'Significant', 'p_value' and 'Test Method'. When any expected
        count is below ``min_expected`` or any value is missing, no test is
        run and the result is 'NA' / None / None, the same convention as
        chi_square_test.
    """
    # Cast to float: a row taken from a mixed-dtype frame yields object arrays.
    observed_values = row[['Count_2023_12_in_lvl',
                           'Count_2023_12_out_lvl']].to_numpy(dtype=float)
    expected_values = row[['Expected_in_lvl',
                           'Expected_out_lvl']].to_numpy(dtype=float)

    # Small, zero or missing expected counts make the chi-square statistic
    # unreliable (or a division by zero), so report "not tested" instead.
    testable = (np.isfinite(observed_values).all()
                and np.isfinite(expected_values).all()
                and (expected_values >= min_expected).all())
    if not testable:
        return pd.Series({'Significant': 'NA', 'p_value': None, 'Test Method': None})

    chi2_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
    significant = p_value < alpha
    test_method = "Chi-Square Goodness-of-Fit"
    return pd.Series({'Significant': significant, 'p_value': p_value, 'Test Method': test_method})

# Test each burnout score separately (one row per score), then attach the
# three result fields to burnout_merge as new columns.
burnout_test_results = burnout_merge.apply(bo_chi_square_test, axis=1)
burnout_merge[['Significant', 'p_value', 'Test Method']] = burnout_test_results
#burnout_merge

Unnamed: 0,Q24*,Count_2023_06_in_lvl,Count_2023_12_in_lvl,Expected_in_lvl,Count_2023_12_out_lvl,Expected_out_lvl,Significant,p_value,Test Method
0,0,38,40,36.92545,338,341.07455,False,0.594278,Chi-Square
1,1,22,29,21.377892,349,356.622108,False,0.089658,Chi-Square
2,2,40,29,38.868895,349,339.131105,False,0.094681,Chi-Square
3,3,67,56,65.105398,322,312.894602,False,0.214853,Chi-Square
4,4,26,38,25.264781,340,352.735219,True,0.00872,Chi-Square
5,5,48,43,46.642674,335,331.357326,False,0.568899,Chi-Square
6,6,42,32,40.812339,346,337.187661,False,0.144149,Chi-Square
7,7,48,52,46.642674,326,331.357326,False,0.402128,Chi-Square
8,8,33,42,32.066838,336,345.933162,False,0.06671,Chi-Square
9,9,11,7,10.688946,371,367.311054,False,0.252364,Chi-Square
