# Define Functions and Run Initial Query

In [None]:
import pandas as pd
import pyodbc
import numpy as np
import plotly.express as px
from datetime import datetime
import statsmodels.api as sm

# Notebook-wide pandas display settings: floats render with thousands
# separators and two decimals; at most 50 columns and 30 rows are shown.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 30)

def read_query_file(fileName):
    """Return the full text of the file at ``fileName``.

    The file is opened in text mode with the platform default encoding.
    A ``with`` block guarantees the handle is closed even if reading fails.
    """
    with open(fileName, 'r') as file:
        return file.read()

def run_query(query):
    """Run ``query`` against the 'edp-workbench-cshub' ODBC DSN.

    Opens a new autocommit connection for every call and returns the result
    of ``pd.read_sql_query`` as a DataFrame. The connection is closed in a
    ``finally`` clause so it is released even when the query raises.
    """
    cnxn = pyodbc.connect('DSN=edp-workbench-cshub', autocommit=True)
    try:
        return pd.read_sql_query(query, cnxn)
    finally:
        cnxn.close()
    
def timestamp():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS AM/PM' (12-hour clock)."""
    now = datetime.now()
    return f'{now:%Y-%m-%d %I:%M:%S %p}'

def add_conditionals(df, test_employees=None):
    """Return a copy of ``df`` with the test-group flag and derived metrics added.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns read below: 'Employee', 'TenureGroup',
        'Total_Accepted', 'Total_Eligible', 'CXP_Prob_Sum', 'CXP_Prob_Count',
        'Sessions', 'HoursWorked', 'Total_Resolution_Time', 'Helix_Searches'
        and 'Helix_Sessions'.
    test_employees : iterable of employee ids, optional
        Employees flagged with ``TestGroup == 1``. Defaults to the fixed
        list of test-group employee ids used by this analysis.

    Returns
    -------
    DataFrame
        The input is not modified. Added columns: 'TestGroup' (0/1),
        'SP100', 'CXP_Score', 'Sessions Per Hour', 'CRT',
        'Searches Per Session', '%Sessions With Search', plus one dummy
        column per distinct 'TenureGroup' value.

    Side effect: displays the tenure-group value counts of the test-group rows.
    """
    if test_employees is None:
        test_employees = (
            573573, 576911, 572909, 577073, 575731, 573585, 573192, 573276,
            572815, 572247, 579162, 577246, 573190, 364717, 576565, 575996,
            581139, 581145, 580911, 580895, 581015, 581275, 552121, 569375,
            547655, 426097, 554487,
        )

    df = df.copy()
    df['TestGroup'] = df['Employee'].isin(test_employees).astype(int)
    display("Test Group Tenure", df[df['TestGroup'] == 1]['TenureGroup'].value_counts())

    # Ratio metrics. A zero denominator yields inf/NaN here; nothing in this
    # function cleans those up.
    df['SP100'] = df['Total_Accepted'] / df['Total_Eligible']
    df['CXP_Score'] = df['CXP_Prob_Sum'] / df['CXP_Prob_Count']
    df['Sessions Per Hour'] = df['Sessions'] / df['HoursWorked']
    df['CRT'] = df['Total_Resolution_Time'] / df['Sessions']
    df['Searches Per Session'] = df['Helix_Searches'] / df['Sessions']
    df['%Sessions With Search'] = df['Helix_Sessions'] / df['Sessions']

    # One indicator column per tenure group, named after the group value.
    df = df.join(pd.get_dummies(df['TenureGroup']))
    return df

In [None]:
# Load the key-metrics query from disk and run it. The path is a raw string
# because it contains backslashes ('\S', '\K') that are invalid escape
# sequences in a normal string literal; the resulting path is unchanged.
dfp = run_query(read_query_file(r'SQL\SamplingTest\KeyMetrics_ForGroupSelection-LimitedCoachesForTestOnly.SQL'))

# Select Sample Size by Tenure Group
Using the current number of experts, find the number of samples needed in each tenure group for the sample to be representative of the general population at Orlando.

In [None]:
# Distinct-employee counts per tenure group for the TEST group (TestGroup == 1).
df = add_conditionals(dfp)
sample_size_df = pd.pivot_table(
    df.loc[df['TestGroup'] == 1].copy(),
    index='TenureGroup',
    values='Employee',
    aggfunc=lambda ids: len(ids.unique()),
    margins=True,  # appends the 'All' total row used as the denominator below
).rename(columns={'Employee': 'Total Employees'})
sample_size_df['% of Employees'] = (
    sample_size_df['Total Employees'] / sample_size_df.loc['All', 'Total Employees']
)
sample_size_df['n Employees'] = sample_size_df['Total Employees']
display(sample_size_df)

# Drop the trailing 'All' row so only the tenure groups remain.
sample_size_df = sample_size_df.iloc[:-1]
display(sample_size_df.sum())
display(sample_size_df)

In [None]:
# Distinct-employee counts per tenure group for everyone NOT in the test
# group (TestGroup == 0). The resulting sample_size_df index is what later
# cells read the tenure-group names from.
df = add_conditionals(dfp)
sample_size_df = pd.pivot_table(
    df.loc[df['TestGroup'] == 0].copy(),
    index='TenureGroup',
    values='Employee',
    aggfunc=lambda ids: len(ids.unique()),
    margins=True,  # appends the 'All' total row used as the denominator below
).rename(columns={'Employee': 'Total Employees'})
sample_size_df['% of Employees'] = (
    sample_size_df['Total Employees'] / sample_size_df.loc['All', 'Total Employees']
)
sample_size_df['n Employees'] = sample_size_df['Total Employees']
display(sample_size_df)

# Drop the trailing 'All' row so only the tenure groups remain.
sample_size_df = sample_size_df.iloc[:-1]
display(sample_size_df.sum())
display(sample_size_df)

In [None]:
# Row counts per (Employee, TenureGroup) pair within the test group.
# DataFrame/Series.to_excel() returns None, so wrapping it in display() only
# ever showed "None"; write the workbook first, then display the counts.
test_expert_counts = df[df['TestGroup']==1][['Employee','TenureGroup']].value_counts()
test_expert_counts.to_excel('TestExpertsv2.xlsx')
display(test_expert_counts)

# Test
1. Creates representative samples.
2. Tests whether each sample's metric means are within a set number of standard deviations of the test group's means. If so, adds the sample to a list for further examination.

In [22]:
# Monte-Carlo search for control groups that resemble the test group.
#
# For each of SAMPLES draws, pick `tenure_group_size` non-test employees at
# random, average each selection metric over their rows, and express the gap
# to the test-group mean in units of the test-group standard deviation.
# A draw is kept only if every metric's |score| is <= SCORE_CEIL.
SCORE_CEIL = .25
SAMPLES = 100000
SelectionMetrics = ['Sessions Per Hour','CRT','Searches Per Session','%Sessions With Search','Total_Accepted','SP100']
tenure_group_size = 27  # employees drawn per candidate sample

# One row per non-test employee: the pool that samples are drawn from.
employees_df = df[['Employee','TenureGroup','TestGroup']].copy().drop_duplicates(subset='Employee')
employees_df = employees_df[employees_df['TestGroup']!=1]

# Tenure-group names (these are also the dummy-column names on df).
groups = sample_size_df.index.to_list()
df['Constant']=1
# Division by zero in the ratio metrics produces inf; treat it as missing.
df = df.replace({np.inf : np.nan})
# Drop some outliers
# df = df[df['%Sessions With Search']<1]
# df = df[df['Searches Per Session']<2.5]
# px.histogram(df['Searches Per Session']).show()

# Reference mean / standard deviation of every metric over the test group.
test_rows = df[df['TestGroup']==1]
meanDict = {name: test_rows[name].mean() for name in SelectionMetrics}
stdDict = {name: test_rows[name].std() for name in SelectionMetrics}
del test_rows

# Non-test rows with complete metrics only.
dft = df[df['TestGroup']!=1]
dft = dft[['Employee','Constant']+SelectionMetrics+groups].copy()
dft = dft.dropna()

# NOTE(review): the pool below comes from employees_df, not from dft, so an
# employee whose rows were all removed by dropna() can still be drawn and a
# sample may then be scored on fewer than tenure_group_size employees.
# NOTE(review): draws ignore TenureGroup; confirm stratification is not needed.
sample_df = employees_df
candidate_ids = sample_df['Employee']

# Accepted draws are collected in plain lists and turned into a DataFrame
# once at the end; concatenating inside the loop copies the whole result
# frame on every hit and makes the run progressively slower.
accepted_runs = []
accepted_rows = []

# range(1, SAMPLES + 1) so exactly SAMPLES draws run and 100% is reported.
for run in range(1, SAMPLES + 1):
    percent_complete=(run*100/SAMPLES)
    if percent_complete % 10 == 0:
        print(str(percent_complete)+"% Complete "+timestamp())

    # Draw the candidate employees (without replacement, unseeded).
    sample_group = candidate_ids.sample(
        n=int(tenure_group_size)
        ,replace=False
        # ,random_state=run
        ).tolist()

    # Score the sample: standardized gap to the test group for every metric.
    sampled_rows = dft[dft['Employee'].isin(sample_group)]
    score_list = [
        (sampled_rows[name].mean()-meanDict[name])/stdDict[name]
        for name in SelectionMetrics
    ]

    # Keep the sample only if no metric differs by more than SCORE_CEIL.
    # A NaN score fails the comparison, so such samples are rejected.
    if all(SCORE_CEIL >= abs(score) for score in score_list):
        accepted_runs.append(run)
        accepted_rows.append({'Members': sample_group, **dict(zip(SelectionMetrics, score_list))})

# Index is the run number of each accepted draw; 'Tenure Group' is never
# populated and stays NaN.
results_df = pd.DataFrame(
    accepted_rows,
    index=accepted_runs,
    columns=['Members','Tenure Group']+SelectionMetrics,
)

del dft, SAMPLES, SCORE_CEIL, run, employees_df, groups, meanDict
del accepted_runs, accepted_rows, candidate_ids, sampled_rows, sample_group, score_list
results_df

10.0% Complete 2023-09-18 09:04:21 PM
20.0% Complete 2023-09-18 09:05:10 PM
30.0% Complete 2023-09-18 09:06:11 PM
40.0% Complete 2023-09-18 09:07:33 PM
50.0% Complete 2023-09-18 09:08:55 PM
60.0% Complete 2023-09-18 09:10:34 PM
70.0% Complete 2023-09-18 09:12:22 PM
80.0% Complete 2023-09-18 09:14:46 PM
90.0% Complete 2023-09-18 09:18:05 PM


Unnamed: 0,Members,Tenure Group,Sessions Per Hour,CRT,Searches Per Session,%Sessions With Search,Total_Accepted,SP100
1,"[548646, 552081, 552402, 531526, 572560, 49115...",,0.24,-0.18,0.03,-0.02,0.18,0.01
6,"[321930, 571756, 572235, 568456, 581252, 56901...",,0.18,-0.10,0.11,0.16,0.09,-0.00
8,"[581396, 552712, 581380, 581007, 568568, 49115...",,0.13,-0.07,0.18,0.12,0.03,0.02
12,"[552704, 541518, 550910, 543980, 580888, 29393...",,-0.04,0.16,0.14,0.13,0.06,-0.00
13,"[550902, 552081, 581637, 578299, 546831, 58140...",,0.21,-0.16,0.15,0.12,0.01,-0.08
...,...,...,...,...,...,...,...,...
99976,"[546831, 581648, 581880, 552704, 579630, 56800...",,-0.00,-0.12,0.10,0.11,0.04,0.00
99979,"[541500, 531526, 568675, 568651, 549940, 58088...",,0.15,-0.17,0.02,0.22,0.08,-0.01
99982,"[577236, 533514, 552081, 572200, 549940, 57223...",,0.13,-0.17,0.20,0.13,0.12,0.02
99991,"[546357, 568404, 321930, 574178, 548632, 53152...",,0.17,-0.20,-0.02,-0.01,-0.01,-0.09


# Find and display subsets that do not have any duplicate members

In [23]:
# Rank the accepted samples by how close they are to the test group.
results_df['TotalSCORE']=0
results_dfp = results_df.copy()  # backup of the unranked results

# Each 'Members' cell is a Python list, which is unhashable, so it cannot be
# used directly as a drop_duplicates() key. Dedupe on a sorted tuple of the
# member ids instead, so two draws with the same employees count as the same
# sample regardless of order; the first occurrence is kept.
member_keys = results_dfp['Members'].map(lambda members: tuple(sorted(members)))
results_df = results_dfp.loc[~member_keys.duplicated()].copy()

# TotalSCORE = sum of absolute scores across metrics; AvgSCORE = their mean.
results_df['TotalSCORE'] = results_df[SelectionMetrics].astype(float).abs().sum(axis=1)
results_df['AvgSCORE']= results_df['TotalSCORE']/len(SelectionMetrics)
results_df = results_df.sort_values('AvgSCORE')
display(results_df.head(20))

Unnamed: 0,Members,Tenure Group,Sessions Per Hour,CRT,Searches Per Session,%Sessions With Search,Total_Accepted,SP100,TotalSCORE,AvgSCORE
1177,"[569498, 554243, 293932, 536491, 549667, 56176...",,0.01,-0.02,0.0,0.01,0.01,0.02,0.08,0.01
3144,"[393198, 578299, 574178, 548658, 568454, 56176...",,0.02,0.01,0.02,0.01,-0.01,-0.02,0.09,0.02
55826,"[568127, 293932, 554243, 568438, 552712, 56865...",,-0.01,-0.01,-0.05,0.01,0.0,0.02,0.1,0.02
79190,"[491150, 572232, 543980, 568651, 568456, 57723...",,-0.0,0.04,0.01,-0.01,0.01,-0.03,0.1,0.02
18175,"[581252, 293932, 548632, 569009, 568982, 57417...",,0.05,-0.0,-0.01,-0.01,0.02,-0.02,0.11,0.02
56717,"[568545, 550902, 561761, 547381, 570227, 54151...",,-0.06,0.0,0.01,0.02,0.01,-0.0,0.11,0.02
22202,"[570227, 548658, 541226, 581252, 293932, 49115...",,0.01,-0.06,0.02,0.0,0.03,-0.0,0.11,0.02
56815,"[569498, 552704, 400024, 576601, 572200, 54802...",,0.0,-0.06,0.02,0.02,0.01,0.0,0.11,0.02
50303,"[572200, 546831, 552129, 547417, 548632, 54966...",,-0.04,0.01,0.0,0.02,-0.03,-0.01,0.12,0.02
81047,"[572235, 546616, 552704, 572200, 552712, 32193...",,0.03,-0.04,0.02,0.01,0.01,-0.01,0.12,0.02


In [26]:
# Sanity check: number of employees in the best-ranked sample. Uses the first
# row positionally rather than a run label, because run labels change on
# every execution of the unseeded sampling loop.
len(results_df['Members'].iloc[0])

27

In [25]:
# Look up the listed expert ids in the expert hierarchy table (rows whose
# effective date range covers the current date, filtered by line of
# business, business unit and location) and save their tenure groups to
# Experts.csv.
selected_experts_sql = """
               WITH
DateSelector As (
    SELECT *
    FROM
        ( VALUES (CAST('2023-07-01' AS DATE), CAST(CURRENT_DATE AS DATE)) )
        as t ("StartDate","EndDate")
)

    /*Finds currently active
     Employees in the orlando office and their tenure group */
    SELECT
        CAST(eh.expert_id AS BIGINT) as "Employee",
        eh.tenure_group as "TenureGroup", *
    FROM
        hive.care.l3_asurion_whole_home_expert_hierarchy eh
    WHERE 1=1
        AND eh.lob_name  = 'pss-verizon wireless'
        AND UPPER(eh.business_unit) = 'PSS'
        AND eh.location = 'flor'
        AND (SELECT MAX(EndDate) FROM DateSelector)
            BETWEEN eh.eff_start_dt and eh.eff_end_dt
        and CAST(eh.expert_id AS BIGINT)  in (538855,
569498,
 554243,
 293932,
 536491,
 549667,
 561761,
 574166,
 568127,
 567620,
 579630,
 581694,
 550446,
 552402,
 577236,
 393198,
 576601,
 578299,
 548646,
 581396,
 569153,
 548026,
 568568,
 567519,
 568651,
 570227,
 547381,
 321930
          ) """
selected_experts_df = run_query(selected_experts_sql)
# Only the two aliased columns are exported; the query's trailing `*` also
# returns every hierarchy column, which is discarded here.
selected_experts_df[['Employee','TenureGroup']].to_csv('Experts.csv')

  df = pd.read_sql_query(query,cnxn)
