# Define Functions and Run Initial Query

In [None]:
import pandas as pd
import pyodbc
import numpy as np
import plotly.express as px
from datetime import datetime
import statsmodels.api as sm

pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = 50
pd.options.display.max_rows = 20

def read_query_file(fileName):
    file = open(fileName,'r')
    string = file.read()
    file.close()
    return string

def run_query(query):
    cnxn = pyodbc.connect('DSN=edp-workbench-cshub', autocommit=True)
    df = pd.read_sql_query(query,cnxn)
    cnxn.close()
    return df
    
def timestamp():
    return datetime.now().strftime('%Y-%m-%d %I:%M:%S %p')

In [2]:
df = run_query(read_query_file('SQL\SamplingTest\KeyMetrics_ForGroupSelection.SQL'))
display(df.head())
df['SP100'] = df['Total_Accepted']/df['Total_Eligible']
df['CXP_Score']=df['CXP_Prob_Sum']/df['CXP_Prob_Count']
df['Sessions Per Hour'] = df['Sessions']/df['HoursWorked']
df['CRT']= df['Total_Resolution_Time']/df['Sessions']
df['Searches Per Session']= df['Helix_Searches']/df['Sessions']
df['%Sessions With Search']= df['Helix_Sessions']/df['Sessions']
df=df.join(pd.get_dummies(df['TenureGroup']))
display(df.head())

Unnamed: 0,Employee,TenureGroup,Date,HoursWorked,Sessions,Total_Resolution_Time,Total_Accepted,Total_Eligible,Helix_Searches,Helix_Sessions,CXP_Prob_Sum,CXP_Prob_Count
0,590774,31-60,2023-08-24,8.03,7.0,7268.0,0.0,8.0,8,4,3.05,6.0
1,590774,31-60,2023-08-20,8.19,12.0,10689.0,0.0,8.0,21,8,7.2,14.0
2,590774,31-60,2023-08-27,7.88,17.0,13621.0,4.0,14.0,10,5,9.8,19.0
3,590774,31-60,2023-08-21,8.11,17.0,11161.0,0.0,13.0,14,4,10.11,20.0
4,590774,31-60,2023-08-17,7.92,7.0,8646.0,0.0,4.0,17,10,4.15,8.0


Unnamed: 0,Employee,TenureGroup,Date,HoursWorked,Sessions,Total_Resolution_Time,Total_Accepted,Total_Eligible,Helix_Searches,Helix_Sessions,CXP_Prob_Sum,CXP_Prob_Count,SP100,CXP_Score,Sessions Per Hour,CRT,Searches Per Session,%Sessions With Search,0-30,121-180,180+,31-60,61-90,91-120
0,590774,31-60,2023-08-24,8.03,7.0,7268.0,0.0,8.0,8,4,3.05,6.0,0.0,0.51,0.87,1038.29,1.14,0.57,0,0,0,1,0,0
1,590774,31-60,2023-08-20,8.19,12.0,10689.0,0.0,8.0,21,8,7.2,14.0,0.0,0.51,1.47,890.75,1.75,0.67,0,0,0,1,0,0
2,590774,31-60,2023-08-27,7.88,17.0,13621.0,4.0,14.0,10,5,9.8,19.0,0.29,0.52,2.16,801.24,0.59,0.29,0,0,0,1,0,0
3,590774,31-60,2023-08-21,8.11,17.0,11161.0,0.0,13.0,14,4,10.11,20.0,0.0,0.51,2.1,656.53,0.82,0.24,0,0,0,1,0,0
4,590774,31-60,2023-08-17,7.92,7.0,8646.0,0.0,4.0,17,10,4.15,8.0,0.0,0.52,0.88,1235.14,2.43,1.43,0,0,0,1,0,0


# Select Sample Size by Tenure Group
Using the current number of experts find the number of samples needed in each group to be representative of the general population at Orlando

In [3]:
# df = df[~df['Employee'].isin([571644,267315])]
employees_df = df[['Employee','TenureGroup']].copy().drop_duplicates(subset='Employee')

SAMPLE_SIZE = 15
sample_size_df=pd.pivot_table(df.copy(),index='TenureGroup',values='Employee',aggfunc=lambda x: len(x.unique()),margins=True,)
sample_size_df['Total Employees']=sample_size_df['Employee']
del sample_size_df['Employee']
sample_size_df['% of Employees'] = sample_size_df['Total Employees']/sample_size_df['Total Employees'].loc['All']
sample_size_df['n Employees'] = (sample_size_df['% of Employees']* SAMPLE_SIZE).round()
display(sample_size_df)
sample_size_df=sample_size_df.iloc[:-1]
sample_size_df.loc['0-30']['n Employees']=2
display(sample_size_df.sum())
display(sample_size_df)

del SAMPLE_SIZE, employees_df

Unnamed: 0_level_0,Total Employees,% of Employees,n Employees
TenureGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-30,41,0.1,2.0
121-180,63,0.16,2.0
180+,156,0.4,6.0
31-60,42,0.11,2.0
61-90,54,0.14,2.0
91-120,35,0.09,1.0
All,391,1.0,15.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_size_df.loc['0-30']['n Employees']=2


Total Employees   391.00
% of Employees      1.00
n Employees        15.00
dtype: float64

Unnamed: 0_level_0,Total Employees,% of Employees,n Employees
TenureGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-30,41,0.1,2.0
121-180,63,0.16,2.0
180+,156,0.4,6.0
31-60,42,0.11,2.0
61-90,54,0.14,2.0
91-120,35,0.09,1.0


# Test
1. Creates representative samples
2. Tests those samples to see if they have a meaningful effect on the metric
    1. Runs an OLS on the sample to see if they have high p val for the metric. If they have a low value for all metrics they are saved as a good possible sample.

In [None]:
SCORE_CEIL = .175
SAMPLES = 1000
SelectionMetrics = ['Sessions','Sessions Per Hour','CRT','Searches Per Session','%Sessions With Search','Total_Accepted','SP100']
employees_df = df[['Employee','TenureGroup']].copy().drop_duplicates(subset='Employee')
results_df = pd.DataFrame(columns=['Members','Tenure Group']+SelectionMetrics)

# Build test dataframe dft and get groups setup
groups = sample_size_df.index.to_list()
# df = df.copy()
df['Constant']=1
df = df.replace({np.inf : np.nan})
# Drop some outliers
df = df[df['%Sessions With Search']<1]
df = df[df['Searches Per Session']<2.5]
# px.histogram(df['Searches Per Session']).show()

i=0
current_group = '61-90'
tenure_group_size = 2

group_len = str(len(groups))

for tenure_group in [current_group]:
    i = i+1
    print(timestamp()+" Running "+str(i)+" of "+group_len)
    # Make a loop to run this a bunch
    dft= df[df[tenure_group]==1].copy()
    dft=dft[['Employee','Constant']+SelectionMetrics+groups].copy()
    dft= dft.dropna()
    meanList = []
    stdList = []
    for metric in SelectionMetrics:
        meanList= meanList+[dft[metric].mean()]
        stdList= stdList+[dft[metric].std()]  
    meanDict = dict(zip(SelectionMetrics,meanList))
    stdDict = dict(zip(SelectionMetrics,stdList))
    del meanList, stdList
    for run in range(1,SAMPLES):
        # creates a list for the sampled employes
        # then adds the correct number of samples from each group to it
        sample_group = []
        sample_df = employees_df[employees_df['TenureGroup']==tenure_group]
        sample_group= sample_group+ sample_df['Employee'].sample(
            n=int(tenure_group_size)
            ,replace=False
            # ,random_state=run
            ).tolist()
        
        # Flag the employees in the group
        dft['TestSample']=(dft['Employee'].isin(sample_group)).astype(int)
        
        # See if its worth the regressions
        score_list = []
        score_count = 0
        test_means_df = dft[dft['TestSample']==1].copy()
        for y in SelectionMetrics:
            if SCORE_CEIL >= abs((test_means_df[y].mean()-meanDict[y])/stdDict[y]):
                score_count = score_count+1
            del y
        del test_means_df

        # Run the regresssions
        if score_count == len(SelectionMetrics):
            for y in SelectionMetrics:        
                model = sm.OLS(endog=dft[y].copy(),exog=dft.copy()[['TestSample','Constant']]).fit().summary2().tables[1]
                score = (model['Coef.'].loc['TestSample'])/stdDict[y]
                lower = model['[0.025'].loc['TestSample']
                upper = model['0.975]'].loc['TestSample']
                if abs(score) <= SCORE_CEIL and lower<0 and upper>0:
                    score_list.append(score)
                del score, y
            
        # add sample group to results list if the results were not significant
        if len(score_list)==len(SelectionMetrics):
            current_result = dict(zip(['Members','Tenure Group']+SelectionMetrics,[[sample_group]]+[tenure_group]+score_list))
            # display(current_result)
            results_df=pd.concat([results_df,pd.DataFrame(current_result,index=[run])])
        del sample_group, score_list, dft['TestSample'], sample_df

# del dft, SAMPLES, SCORE_CEIL, run, employees_df, group_len, groups, i, lower, upper, metric,model,meanDict

results_df

# Find and display subsets that do not have any duplicate members

In [None]:
results_df['TotalSCORE']=0
results_dfp = results_df.copy()
groups = sample_size_df.index.to_list()



i=0
# groups.remove('180+')
# groups.remove('0-30')
# groups.remove('91-120')

for tenure_group in [current_group]:
    results_df = results_dfp[results_dfp['Tenure Group']==tenure_group].copy()
    for metric in SelectionMetrics:
        results_df['TotalSCORE'] = results_df['TotalSCORE']+abs(results_df[metric])
    results_df['AvgSCORE']= results_df['TotalSCORE']/len(SelectionMetrics)
    results_df = results_df.sort_values('AvgSCORE')
    results_df = results_df.drop_duplicates(subset='Members')
    display(results_df.head(20))
    # del metric
    # for row1 in range(len(results_df['Members'])):
    #     for row2 in range(1,len(results_df['Members'])):
    #         duplicates = 0           
    #         for item in results_df['Members'].iloc[row1]:
    #             if item in results_df['Members'].iloc[row2]: 
    #                 duplicates=duplicates+1
    #                 # display(item)
    #         if duplicates == 0:
    #             summary=pd.concat([results_df.iloc[[row1]],results_df.iloc[[row2]]])
    #             display(summary)
results_df = results_dfp.copy()          
print(1)
# results_df.to_excel('Unbiased Sample Selections. Scored 180+.xlsx')