# Define Functions and Run Initial Query

In [1]:
import pandas as pd
import pyodbc
import numpy as np
import plotly.express as px
from datetime import datetime
import statsmodels.api as sm

# Notebook-wide pandas display settings.
# Floats render with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format
# Show at most 50 columns and 10 rows before pandas truncates the output.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 10

def read_query_file(fileName):
    """Return the entire contents of a text file as a single string.

    Args:
        fileName: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        The file's text.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(fileName, 'r') as query_file:
        return query_file.read()

def run_query(query):
    """Run a SQL query over the 'edp-workbench-cshub' ODBC DSN.

    Args:
        query: SQL text to execute.

    Returns:
        A pandas DataFrame holding the query result.

    Raises:
        Whatever ``pyodbc.connect`` or ``pd.read_sql_query`` raise; the
        connection is closed before the exception propagates.
    """
    cnxn = pyodbc.connect('DSN=edp-workbench-cshub', autocommit=True)
    try:
        return pd.read_sql_query(query, cnxn)
    finally:
        # Always release the connection, including when the query fails.
        cnxn.close()

def timestamp():
    """Return the current local time as text, e.g. ``2023-09-05 10:20:53 AM``."""
    now = datetime.now()
    # 12-hour clock with an AM/PM marker.
    return now.strftime('%Y-%m-%d %I:%M:%S %p')

# Run the key-metrics query and derive the ratio metrics used for group selection.
print("Query Start "+timestamp())
# Raw string: the Windows-style path contains "\S" and "\K", which are not valid
# escape sequences in an ordinary string literal (newer Python versions warn).
# The resulting path text is unchanged.
df = run_query(read_query_file(r'SQL\SamplingTest\KeyMetrics_ForGroupSelection.SQL'))
print("Query End "+timestamp())
print("Add Metrics "+timestamp())
# Each metric below is a plain ratio of two query columns. A zero denominator
# gives inf/NaN, which the sampling cell filters out later.
df['SP100'] = df['Total_Accepted']/df['Total_Eligible']
df['CXP_Score']=100*df['CXP_Prob_Sum']/df['CXP_Prob_Count']
df['Sessions Per Hour'] = df['Sessions']/df['HoursWorked']
df['CRT']= df['Total_Resolution_Time']/df['Sessions']
df['Searches Per Session']= df['Helix_Searches']/df['Sessions']
df['%Sessions With Search']= df['Helix_Sessions']/df['Sessions']
# One indicator column per tenure group. dtype=int keeps them as 0/1 integers
# on every pandas version (pandas 2.x would otherwise produce booleans).
df=df.join(pd.get_dummies(df['TenureGroup'], dtype=int))
# Trailing space added so this log line matches the others.
print("Metrics Completed "+timestamp())
display(df)

Query Start 2023-09-05 10:20:53 AM


  df = pd.read_sql_query(query,cnxn)


Query End 2023-09-05 10:22:37 AM
Add Metrics 2023-09-05 10:22:37 AM
Metrics Completed2023-09-05 10:22:37 AM


Unnamed: 0,Employee,TenureGroup,Date,HoursWorked,Sessions,Total_Resolution_Time,Total_Accepted,Total_Eligible,Helix_Searches,Helix_Sessions,CXP_Prob_Sum,CXP_Prob_Count,SP100,CXP_Score,Sessions Per Hour,CRT,Searches Per Session,%Sessions With Search,0-30,121-180,180+,31-60,61-90,91-120
0,592052,0-30,2023-08-29,7.24,1.00,2451.00,0.00,1.00,25,7,0.53,1.00,0.00,53.18,0.14,2451.00,25.00,7.00,1,0,0,0,0,0
1,592052,0-30,2023-08-31,7.69,4.00,4511.00,,,9,5,2.10,4.00,,52.43,0.52,1127.75,2.25,1.25,1,0,0,0,0,0
2,592052,0-30,2023-09-03,9.10,16.00,14403.00,0.00,7.00,14,7,9.24,18.00,0.00,51.32,1.76,900.19,0.88,0.44,1,0,0,0,0,0
3,583433,61-90,2023-07-06,8.06,13.00,8454.00,1.00,12.00,10,2,10.22,20.00,0.08,51.12,1.61,650.31,0.77,0.15,0,0,0,0,1,0
4,583433,61-90,2023-07-27,7.95,17.00,20632.00,1.00,12.00,2,1,10.83,21.00,0.08,51.56,2.14,1213.65,0.12,0.06,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14719,573856,121-180,2023-08-25,7.86,10.00,12872.00,2.00,11.00,2,1,7.26,14.00,0.18,51.86,1.27,1287.20,0.20,0.10,0,1,0,0,0,0
14720,573856,121-180,2023-08-01,7.89,13.00,10719.00,2.00,7.00,3,1,6.66,13.00,0.29,51.25,1.65,824.54,0.23,0.08,0,1,0,0,0,0
14721,573856,121-180,2023-07-25,7.99,10.00,23758.00,4.00,9.00,1,1,5.26,10.00,0.44,52.59,1.25,2375.80,0.10,0.10,0,1,0,0,0,0
14722,573856,121-180,2023-08-24,4.76,4.00,4345.00,2.00,6.00,1,1,5.72,11.00,0.33,51.99,0.84,1086.25,0.25,0.25,0,1,0,0,0,0


# Build out the metrics from the Query

In [2]:

# One row per employee (the first row seen for each), keeping only the
# employee id and tenure group.
employees_df = df.drop_duplicates(subset='Employee')[['Employee', 'TenureGroup']].copy()
display(employees_df)
# Preview of the full metrics table.
display(df.head())

Unnamed: 0,Employee,TenureGroup
0,592052,0-30
3,583433,61-90
58,568454,180+
121,580952,91-120
184,548646,180+
...,...,...
14461,577246,121-180
14519,589874,0-30
14532,577968,121-180
14597,547653,180+


Unnamed: 0,Employee,TenureGroup,Date,HoursWorked,Sessions,Total_Resolution_Time,Total_Accepted,Total_Eligible,Helix_Searches,Helix_Sessions,CXP_Prob_Sum,CXP_Prob_Count,SP100,CXP_Score,Sessions Per Hour,CRT,Searches Per Session,%Sessions With Search,0-30,121-180,180+,31-60,61-90,91-120
0,592052,0-30,2023-08-29,7.24,1.0,2451.0,0.0,1.0,25,7,0.53,1.0,0.0,53.18,0.14,2451.0,25.0,7.0,1,0,0,0,0,0
1,592052,0-30,2023-08-31,7.69,4.0,4511.0,,,9,5,2.1,4.0,,52.43,0.52,1127.75,2.25,1.25,1,0,0,0,0,0
2,592052,0-30,2023-09-03,9.1,16.0,14403.0,0.0,7.0,14,7,9.24,18.0,0.0,51.32,1.76,900.19,0.88,0.44,1,0,0,0,0,0
3,583433,61-90,2023-07-06,8.06,13.0,8454.0,1.0,12.0,10,2,10.22,20.0,0.08,51.12,1.61,650.31,0.77,0.15,0,0,0,0,1,0
4,583433,61-90,2023-07-27,7.95,17.0,20632.0,1.0,12.0,2,1,10.83,21.0,0.08,51.56,2.14,1213.65,0.12,0.06,0,0,0,0,1,0


# Select Sample Size by Tenure Group
Using the current number of experts, find the number of samples needed from each tenure group for the sample to be representative of the general population at Orlando.

In [3]:
# Total number of employees to pick across all tenure groups.
SAMPLE_SIZE = 30

# Distinct-employee head count per tenure group; margins=True appends an
# 'All' row holding the overall distinct count.
sample_size_df = pd.pivot_table(
    df.copy(),
    index='TenureGroup',
    values='Employee',
    aggfunc=lambda employees: len(employees.unique()),
    margins=True,
).rename(columns={'Employee': 'Total Employees'})

# Each group's share of the population, then its rounded share of the sample.
sample_size_df['% of Employees'] = (
    sample_size_df['Total Employees'] / sample_size_df.loc['All', 'Total Employees']
)
sample_size_df['n Employees'] = sample_size_df['% of Employees'].mul(SAMPLE_SIZE).round()

# Drop the trailing 'All' margin row so only the tenure groups remain.
sample_size_df = sample_size_df.head(-1)
display(sample_size_df.sum())
display(sample_size_df)

del SAMPLE_SIZE

Total Employees   396.00
% of Employees      1.00
n Employees        30.00
dtype: float64

Unnamed: 0_level_0,Total Employees,% of Employees,n Employees
TenureGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-30,62,0.16,5.0
121-180,55,0.14,4.0
180+,157,0.4,12.0
31-60,21,0.05,2.0
61-90,56,0.14,4.0
91-120,45,0.11,3.0


# Test
1. Creates representative samples
2. Tests those samples to see if they have a meaningful effect on the metric
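    1. Runs an OLS regression of each metric on a flag marking the sampled employees. If, for every metric, the sample's standardized score is within the score limit and the 95% confidence interval for the sample effect spans zero (i.e. no statistically significant difference), the sample is saved as a good candidate.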

In [None]:
# Candidate-sample search.
# For every tenure group, draw SAMPLES random employee samples of the size set
# in sample_size_df and keep those for which every selection metric passes the
# score and confidence-interval checks below.
SCORE_LIMIT = .25   # largest |score| a metric may have for the sample to pass
SAMPLES = 1000      # number of random samples drawn per tenure group

SelectionMetrics = ['Sessions','Sessions Per Hour','CRT','Searches Per Session','%Sessions With Search','Total_Accepted','SP100']

# Accepted samples are collected in plain lists and turned into a DataFrame
# once at the end, instead of concatenating onto a growing frame in the loop.
result_columns = ['Members','Tenure Group']+SelectionMetrics
result_rows = []    # one dict per accepted sample
result_index = []   # run number (= random seed) of each accepted sample

# Build test dataframe dft and get groups setup
groups = sample_size_df.index.to_list()
df['Constant']=1

# Make a loop to run this a bunch
for empGroup in groups:
    print('Starting '+empGroup+" "+timestamp())
    # Rows of this tenure group only, restricted to the columns the test needs.
    dft = df.loc[df[empGroup]==1, ['Employee',empGroup,'Constant']+SelectionMetrics].copy()
    # Infinite values (either sign) are treated as missing and dropped together
    # with genuinely missing values.
    dft = dft.replace([np.inf, -np.inf], np.nan).dropna()

    # Unique employees that still have usable rows; samples are drawn from these.
    sample_df = dft[['Employee']].drop_duplicates()
    n_members = int(sample_size_df.loc[empGroup, 'n Employees'])

    # Mean/std of each metric over the group do not depend on the sample drawn,
    # so compute them once per group rather than once per run.
    metric_stats = {metric: (dft[metric].mean(), dft[metric].std()) for metric in SelectionMetrics}

    # Seeds 1..SAMPLES inclusive, so exactly SAMPLES samples are drawn
    # (range(1, SAMPLES) stopped one short).
    for run in range(1, SAMPLES + 1):
        sample_group = sample_df['Employee'].sample(
            n=n_members,
            replace=False,
            random_state=run,
        ).tolist()

        # Flag the employees in the group
        dft['TestSample'] = dft['Employee'].isin(sample_group).astype(int)
        # Cast to float so the design matrix has one numeric dtype even when
        # the indicator column is boolean.
        # NOTE(review): within dft both empGroup and 'Constant' are all ones, so
        # these two regressors are collinear -- confirm empGroup is wanted here.
        exog = dft[[empGroup,'TestSample','Constant']].astype(float)

        # Make sure the p vals are high enough to assume no statistical significance
        # Also added a lower and upper bound to make sure it could be positive or negative
        score_list = []
        for y in SelectionMetrics:
            y_mean, y_std = metric_stats[y]
            # `model` is the coefficient table of the fit; it is left in the
            # notebook namespace because a later cell displays it.
            model = sm.OLS(endog=dft[y], exog=exog).fit().summary2().tables[1]
            # NOTE(review): the TestSample coefficient is already a difference
            # between sampled and non-sampled rows; subtracting the metric mean
            # before scaling looks unusual -- confirm this is the intended score.
            score = (model['Coef.'].loc['TestSample']-y_mean)/y_std
            lower = model['[0.025'].loc['TestSample']
            upper = model['0.975]'].loc['TestSample']
            # 'CXP_Score' bypasses the checks; it is not currently listed in
            # SelectionMetrics, so that branch is inactive for now.
            if (abs(score) <= SCORE_LIMIT and lower<0 and upper>0) or y=='CXP_Score':
                score_list.append(score)

        # add sample group to results list if the results were not significant
        if len(score_list)==len(SelectionMetrics):
            result_rows.append(dict(zip(result_columns,[sample_group,empGroup]+score_list)))
            result_index.append(run)

# One row per accepted sample, indexed by its run number (run numbers repeat
# across tenure groups).
results_df = pd.DataFrame(result_rows, index=result_index, columns=result_columns)

del dft, SAMPLES, SCORE_LIMIT, run
del result_rows, result_index, result_columns

results_df
# results_df.to_excel('Unbiased Sample Selections.xlsx')

# Find and display subsets that do not have any duplicate members

In [None]:
# Aggregate score per candidate sample: the plain sum and the average of its
# per-metric scores.
# NOTE(review): scores are signed, so positive and negative values cancel in
# the sum -- confirm that is intended rather than summing absolute values.
results_df['TotalScore']=0
for metric in SelectionMetrics:
    results_df['TotalScore'] = results_df['TotalScore']+results_df[metric]
results_df['AvgScore']= results_df['TotalScore']/len(SelectionMetrics)
base_results = results_df.copy()
del metric

# Within each tenure group, display every pair of candidate samples that share
# no employee.
for empGroup in groups:
    # A separate name is used so the full results_df is not overwritten by the
    # per-group subset (the export below used to contain only the last group).
    group_results = base_results[base_results['Tenure Group']==empGroup]
    member_sets = [set(members) for members in group_results['Members']]
    for row1 in range(len(member_sets)):
        # Start after row1: each unordered pair is checked exactly once and a
        # sample is never compared with itself.
        for row2 in range(row1 + 1, len(member_sets)):
            if member_sets[row1].isdisjoint(member_sets[row2]):
                display(group_results.iloc[[row1, row2]])

# Export every candidate sample (all tenure groups) with its scores.
results_df.to_excel('Unbiased Sample Selections - By Tenure Group.xlsx')

In [None]:
# results_df.to_excel('Unbiased Sample Selections.xlsx')
# Show the coefficient table of the most recent OLS fit, which the test cell
# leaves in `model`.
model

# Changes
1. Drop Sessions Per Hour
2. Get SP100 and CXP from Brian Vickers
3. Check on Helix Search (Ravi)