In [None]:
# This notebook's pipeline is managed through the mrxlinx extension.
# The cells may look unordered here, but the extension displays
# the whole pipeline in the correct order.

In [None]:
import pandas as pd
import numpy as np
import polars as pl
from pickle import dump, load
import itertools
import winsound

import Custom_Tools
from sksurv.nonparametric import kaplan_meier_estimator
from target_definition_polars import target_definition_polars
from drop_turnover import drop_turnover
from preprocess_turnover import preprocess_turnover
from scale_dataset import scale_dataset
from final_transformation import final_transformation


from sklearn.model_selection import train_test_split, StratifiedKFold, ShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif


from imblearn.over_sampling import ADASYN
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

import optuna

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import shap




In [None]:
# Base folder for the input spreadsheets and the parquet copies. File names are
# appended to it with plain string concatenation, so the trailing slash is required.
path = 'C:/Users/Gabriel.Gomes/Downloads/Turnover/23-01-16/'

In [None]:
# Headcount spreadsheet: every column is cast to text before being written as parquet.
df_pd = pd.read_excel(path + 'Análise_Headcount.xlsx')
df_pd.astype(str).to_parquet(path + 'Análise_Headcount.parquet', index=False)

# Movements spreadsheet: keep only the date part of the change timestamp and
# only the merit ('MÉRITO') and promotion ('PROMOÇÃO') rows, then write as text.
df_mov_pd = pd.read_excel(path + 'Movimentações.xlsx')
df_mov_pd['dtmudanca'] = df_mov_pd['dtmudanca'].dt.date
df_mov_pd = df_mov_pd.loc[df_mov_pd['descricao'].isin(['MÉRITO', 'PROMOÇÃO'])]
df_mov_pd.astype(str).to_parquet(path + 'Movimentações.parquet', index=False)

# Audible signal that the conversion has finished (Windows only).
duration = 1000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)

In [None]:
# Read the parquet copies of the headcount and movements tables with polars.
df = pl.read_parquet(path + 'Análise_Headcount.parquet')
df_mov = pl.read_parquet(path + 'Movimentações.parquet')

# The bare string below is a free-text to-do note. English translation:
#   - Add role type (manager / non-manager)
#   - Salary, merit raises and promotions
#   - Pulses
'''
Adicionar tipo de função (Gestor, não gestor)
Salário, méritos e promoções
Pulses


'''

In [None]:
df.columns

In [None]:
# Needs to be fixed, target_definition is excluding more than 1 year dismissals
# NOTE(review): the import cell only brings in `target_definition_polars`;
# `target_definition` has to be provided elsewhere or this call raises NameError — confirm.

# Target/survival frame for rows whose 'tipo_colaborador' is 'CLT'
# (presumably a pandas DataFrame, given the boolean indexing and .loc below — TODO confirm).
df_surv = target_definition(df, 'tipo_colaborador', 'CLT')
# Keep only tenures strictly between 0 and 62.
df_surv = df_surv[(df_surv.tempo_de_casa > 0) & (df_surv.tempo_de_casa < 62)]
# Rows with role type 'Docente' get their branch type overwritten with 'DOCENTE'.
df_surv.loc[df_surv.tipo_função == 'Docente','tipo_filial'] = 'DOCENTE'

In [None]:
df_surv.cod

In [None]:
# Tenure histogram coloured by the target, faceted by race (columns)
# and education level (rows).
fig = px.histogram(
    df_surv,
    x="tempo_de_casa",
    color="desligado",
    facet_col="corraca_descricao",
    facet_row="escolaridade_classificação",
    height=1500,
    width=1000,
)

# Facet titles come as "column=value"; keep only the value part.
fig.for_each_annotation(lambda note: note.update(text=note.text.split("=")[-1]))

# Tilt every facet title so long labels do not overlap.
for annotation in fig['layout']['annotations']:
    annotation['textangle'] = 25

fig.show()

In [None]:
g = sns.FacetGrid(data=df_surv, row="escolaridade_classificação", col="corraca_descricao", hue = 'desligado')


g.map(sns.kdeplot, "tempo_de_casa", warn_singular=False);






In [None]:
# Kaplan-Meier survival curves: one figure per categorical column, one step
# curve per distinct value of that column.
data_x = df_surv.copy()

# Event indicator: True where 'desligado' equals 1.
data_y = df_surv.desligado == 1

# Observed time for each row (tenure column).
surv_days = data_x.tempo_de_casa

analyze = ['tipo_filial', 'VP_descricao', 'genero', 'corraca_descricao']

for analisar in analyze:

    plt.figure(figsize=(12, 12), dpi=80)

    for value in data_x[analisar].unique():
        mask = data_x[analisar] == value
        time_cell, survival_prob_cell = kaplan_meier_estimator(data_y[mask],
                                                               surv_days[mask])
        plt.step(time_cell, survival_prob_cell, where="post",
                 label="%s (n = %d)" % (value, mask.sum()))

    # Raw string: "\h" is not a valid escape sequence, so the non-raw literal
    # triggered an invalid-escape warning; the rendered label is unchanged.
    plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
    plt.xlabel("Months")
    plt.legend(loc="best")


In [None]:
df_alvo = target_definition_polars(df, df_mov, 'tipo_colaborador', 'CLT')

In [None]:
# Reserve 10% of the rows, stratified on the target, as the final holdout set.
X, X_holdout, y, y_holdout = train_test_split(
    df_alvo.drop(columns=['desligado']),
    df_alvo['desligado'],
    test_size=0.1,
    random_state=42,
    stratify=df_alvo['desligado'],
)

In [None]:
print(y.value_counts()/y.shape[0])
print(y_holdout.value_counts()/y_holdout.shape[0])

In [None]:
# Split the non-holdout rows 70/30, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# The treatment frame is the training split itself (same object, not a copy).
df_trat = X_train

# Target rows selected by the treatment frame's index labels.
y_trat = y_train.loc[df_trat.index]



In [None]:
print(y_train.value_counts()/y_train.shape[0])
print(y_test.value_counts()/y_test.shape[0])

In [None]:
print(X_train.idade.mean())

In [None]:
df_trat_drop = drop_turnover(df_trat)

df_preprocess = preprocess_turnover(df_trat_drop)

In [None]:
teste = df_trat_drop.copy()

for i in teste:
    print(f'{i} - {teste[i].dtype}')

In [None]:
print(df_preprocess.columns)
print(df_preprocess.shape)
print(y_trat.shape)
#print(pd.cut(df_trat_drop.salario, [0,5000,10000,15000,20000]).value_counts()/df_trat_drop.shape[0])
print(df_preprocess.isnull().sum())

duration = 1000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)


In [None]:
df_preprocess.columns

In [None]:
# Fit a [0, 1] min-max scaler on the selected feature columns and persist it.
Scaler = MinMaxScaler((0, 1))
Scaler.fit(df_preprocess[feature_final])
# Context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by open().
with open('Scaler.pkl', 'wb') as scaler_file:
    dump(Scaler, scaler_file)

In [None]:
df_eng.loc[df_eng.filial_descricao.str.contains('IPEMED'), 'filial_descricao'] = 'IPEMED'

df_eng.filial_descricao.unique()

In [None]:
df_eng.filial_descricao.unique()

In [None]:
from sklearn.feature_extraction import FeatureHasher

def useful_metrics(X_df, y_df, model, threshold=0.5):
    """Score a fitted probabilistic classifier at a given decision threshold.

    Parameters
    ----------
    X_df : feature matrix accepted by ``model.predict_proba``.
    y_df : true labels aligned with ``X_df``.
    model : fitted estimator exposing ``predict_proba``.
    threshold : float, default 0.5
        A row is predicted positive when the probability in the last column
        of ``predict_proba`` is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns ``f1_score``, ``recall_score``,
        ``precision_score`` and ``brier_score_loss``, each rounded to two
        decimals. The first three use the thresholded labels; the Brier score
        uses the raw probabilities.
    """
    # Probabilities are computed once and reused for both the thresholded
    # metrics and the Brier score.
    positive_proba = model.predict_proba(X_df)[:, -1]
    results = positive_proba > threshold

    evaluation = {}

    # Label-based metrics; zero_division=0 yields 0 when there is nothing to divide by.
    for metric in (metrics.f1_score, metrics.recall_score, metrics.precision_score):
        evaluation[metric.__name__] = round(metric(y_df, results, zero_division=0), 2)

    # Probability-based metric.
    evaluation[metrics.brier_score_loss.__name__] = round(
        metrics.brier_score_loss(y_df, positive_proba), 2)

    return pd.DataFrame(evaluation, index=[0])





def feature_hashing_optimum(col_name, final_number_of_columns = 0):
    """Search for, or build, a feature hasher for one categorical column.

    Reads the notebook-level ``df_trat``, ``X_test``, ``y_trat`` and ``y_test``.

    Parameters
    ----------
    col_name : str
        Column to hash.
    final_number_of_columns : int, default 0
        ``0`` runs the search: for every width from 3 to 19 the column is
        hashed, an XGBoost classifier is fitted on the hashed training data
        and scored on the hashed test data with ``useful_metrics``.
        Any other value skips the search and returns a ``FeatureHasher`` of
        that width.

    Returns
    -------
    pandas.DataFrame or FeatureHasher
        In search mode, one row of metrics per width with the width stored in
        the ``cols`` column; otherwise the hasher.
    """
    from tqdm.notebook import tqdm

    def _token_lists(values):
        # FeatureHasher(input_type='string') expects every sample to be an
        # iterable of strings. A bare string would be hashed character by
        # character (and is rejected by recent scikit-learn), so each category
        # is wrapped in a one-element list.
        return [[str(value)] for value in values]

    if final_number_of_columns != 0:
        hashf = FeatureHasher(n_features=final_number_of_columns, input_type='string')
        hashf.fit(_token_lists(df_trat[col_name]))
        return hashf

    # The tokenised samples do not depend on the width, so build them once.
    train_tokens = _token_lists(df_trat[col_name])
    test_tokens = _token_lists(X_test[col_name])

    # Collect the per-width score frames and concatenate once at the end.
    score_frames = []

    for n_features in tqdm(range(3, 20)):

        hashf = FeatureHasher(n_features=n_features, input_type='string')

        cols = ['Test_' + str(i) for i in range(n_features)]

        train_hash = pd.DataFrame(hashf.fit_transform(train_tokens).toarray(), columns=cols)
        test_hash = pd.DataFrame(hashf.transform(test_tokens).toarray(), columns=cols)

        model = XGBClassifier(use_label_encoder=False, verbosity=0)
        model.fit(train_hash, y_trat)

        score = useful_metrics(test_hash, y_test, model)
        score['cols'] = n_features

        score_frames.append(score)

    return pd.concat(score_frames)



In [None]:
final_score = feature_hashing_optimum('codsecao')
final_score
scores_trat = final_score.copy() 
scores_trat.index = scores_trat.cols
scores_trat[['f1_score', 'recall_score', 'precision_score']].plot(figsize=(10, 6))

In [None]:
df_trat.columns

In [None]:
scores_trat['score'] = scores_trat.cols*3 - scores_trat.f1_score*200

scores_trat.sort_values('score')

In [None]:
a = feature_hashing_optimum('codsecao', 11)

In [None]:
a.fit_transform(df_trat['codsecao']).toarray()

In [None]:
# Build the 11-column hasher for 'codsecao' first, then persist it. The context
# manager closes the pickle file instead of leaking the handle from open().
codsecao_hasher = feature_hashing_optimum('codsecao', 11)
with open('codsecao_hasher.pkl', 'wb') as hasher_file:
    dump(codsecao_hasher, hasher_file)

In [None]:
tt.columns

In [None]:
def feature_hashing(df, col_name, hasher):
    """Append the hashed representation of one column to a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    col_name : str
        Column whose values are hashed.
    hasher : object exposing ``get_params()['n_features']`` and ``transform``
        (e.g. a ``FeatureHasher`` built with ``input_type='string'``).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``<col_name>_0`` ... ``<col_name>_<n-1>``,
        aligned on ``df``'s index.
    """
    n_features = hasher.get_params()['n_features']
    cols = [col_name + '_' + str(i) for i in range(n_features)]

    # Each sample must be an iterable of strings. A bare string would be
    # hashed character by character (and is rejected by recent scikit-learn),
    # so every value is wrapped in a one-element list.
    samples = [[str(value)] for value in df[col_name]]

    hashed = pd.DataFrame(hasher.transform(samples).toarray(),
                          columns=cols, index=df.index)

    return pd.concat([df, hashed], axis=1)

In [None]:
pd.DataFrame()

In [None]:

a = feature_hashing(df_trat, 'codsecao', hasher1)
a

In [None]:
# Reload the persisted scaler and feature list. Context managers close the
# files instead of leaking the handles returned by open().
# NOTE(review): pickle.load can execute arbitrary code; only load files produced
# by this notebook.
with open('Scaler.pkl', 'rb') as scaler_file:
    Scaler = load(scaler_file)
with open('feature_selection.pkl', 'rb') as features_file:
    feature_final = load(features_file)
#feature_final = df_preprocess.columns.tolist()
# Overwritten on the next line; the indexing still fails fast if a selected
# column is missing from df_preprocess.
scaled_df = df_preprocess[feature_final]
scaled_df = scale_dataset(df_preprocess, feature_final, Scaler)

In [None]:
df_preprocess.columns

In [None]:
df_preprocess_analysis = scaled_df.copy()
df_preprocess_analysis['desligado'] = y_train

aval = Custom_Tools.preprocess_afya(df_preprocess_analysis)

cols = aval.col_missing()

card = aval.cardinality()

rws = aval.row_missing()

In [None]:
print(rws[rws > 0])

In [None]:
print(cols[cols > 0])

In [None]:
cols = df_preprocess_analysis.columns.tolist()

In [None]:
cols.remove('desligado')

In [None]:
aval.distribution(cols, 'desligado')

# Feature Selection

In [None]:
y_trat.shape

In [None]:
df_preprocess.iloc[:,11]

In [None]:
feature_check = [x for x in df_preprocess.columns]
algorithms = [chi2, f_classif, mutual_info_classif]
selection = Custom_Tools.feature_selection(df_preprocess[feature_check], y_trat, algorithms, 500)


In [None]:
features = selection.columns.tolist()
features.remove('Total_points')
Kbest_eval = selection.groupby('Columns').sum()['Total_points'].sort_values(ascending=False) # noqa E501
Kbest_eval

In [None]:
# Features whose accumulated score is above 30% of the top score.
# Kbest_eval is sorted in descending order and indexed by the 'Columns' group
# key, so the top score is taken positionally with .iloc[0]; plain [0] relied
# on the positional fallback that pandas has deprecated for label indexes.
Kbest_eval[(Kbest_eval / Kbest_eval.iloc[0]) > 0.3].index.tolist()

In [None]:
# tempo_de_casa is redundant with ord_faixa_tempo_casa
feature_final = ['tipo_filial_DIGITAL', 'filial_descricao_AFYA',
       'genero',
       'engineer_merit_indexes',
       'codsecao_gerencia_contagion',
       'salario', 'funcao_descricao_clean_contagion',
       'tempo_de_casa', 'engineer_promotion_indexes']

#feature_final = df_preprocess.columns

# Persist the selected feature list. The context manager closes the file
# instead of leaking the handle returned by open().
with open('feature_selection.pkl', 'wb') as features_file:
    dump(feature_final, features_file)

In [None]:
# Reload the persisted scaler and feature list. Context managers close the
# files instead of leaking the handles returned by open().
# NOTE(review): pickle.load can execute arbitrary code; only load files produced
# by this notebook.
with open('Scaler.pkl', 'rb') as scaler_file:
    Scaler = load(scaler_file)
with open('feature_selection.pkl', 'rb') as features_file:
    feature_final = load(features_file)
#feature_final = df_preprocess.columns.tolist()
# Overwritten on the next line; the indexing still fails fast if a selected
# column is missing from df_preprocess.
scaled_df = df_preprocess[feature_final].copy()
# NOTE(review): None is passed here instead of the loaded Scaler — presumably
# this disables scaling inside scale_dataset; confirm against that module.
scaled_df = scale_dataset(df_preprocess, feature_final, None)


# Comment if feature selection was used
# dump(feature_final, open('feature_selection.pkl', 'wb'))

# Final training matrix restricted to the selected features.
X_train_final = scaled_df[feature_final].copy()

In [None]:
feature_final

In [None]:
# Candidate estimators compared in the overfitting study below.
lr = LogisticRegression(max_iter=10000)
lgbm = LGBMClassifier()
gnb = GaussianNB()

models = {
    'Logistic Regression': lr,
    'LGBM': lgbm,
    'Gaussian NB': gnb,
    'Random Forest | Default': RandomForestClassifier(),
}

# Depth-limited random forests, deepest first (max_depth 5, then 4).
models.update({
    'Random Forest - Max_depth:' + str(depth): RandomForestClassifier(max_depth=depth)
    for depth in (5, 4)
})


In [None]:
models_1 = {'Random Forest - Default': RandomForestClassifier()}

In [None]:
test = Custom_Tools.overfit(X_train_final, y_trat, models_1, None, None)
test_results = test.overfit(metrics.f1_score)
test_results

In [None]:
fig = px.line(test_results, x="Sample", y="f1_score", color="Dataset", facet_col="Model", facet_col_wrap=4, height=500, range_y=[0,1]) # noqa E501
fig.show()

In [None]:
All_models_f1 = Custom_Tools.overfit(X_train_final, y_trat, models, None, None)
All_models_results_f1 = All_models_f1.overfit(metrics.f1_score)
All_models_f1 = Custom_Tools.overfit(X_train_final, y_trat, models, None, ADASYN(sampling_strategy= 1.0)) # noqa E501
All_models_results_f1 = pd.concat([All_models_results_f1, All_models_f1.overfit(metrics.f1_score)]) # noqa E501

In [None]:
pivot_f1 = All_models_results_f1.pivot(index=['Sample', 'Model'], columns='Dataset', values='f1_score') # noqa E501
pivot_f1['Overfit'] = abs(pivot_f1['Test']-pivot_f1['Train'])
pivot_f1 = pivot_f1.groupby(['Model'])[['Overfit', 'Test']].mean().sort_values(['Overfit']) # noqa E501
pivot_f1[pivot_f1.Test > 0]

In [None]:
All_models_recall = Custom_Tools.overfit(X_train_final, y_trat, models, None, None) # noqa E501
All_models_results_recall = All_models_recall.overfit(metrics.recall_score)
All_models_recall = Custom_Tools.overfit(X_train_final, y_trat, models, None, ADASYN(sampling_strategy= 1.0)) # noqa E501
All_models_results_recall = pd.concat([All_models_results_recall, All_models_recall.overfit(metrics.recall_score)]) # noqa E501

In [None]:
All_models_recall = Custom_Tools.overfit(X_train_final, y_trat, models, None, None) # noqa E501
All_models_results_recall = All_models_recall.overfit(metrics.recall_score)
All_models_recall = Custom_Tools.overfit(X_train_final, y_trat, models, None, ADASYN(sampling_strategy= 1.0)) # noqa E501
All_models_results_recall = pd.concat([All_models_results_recall, All_models_recall.overfit(metrics.recall_score)]) # noqa E501

In [None]:
# One row per (Sample, Model) with the Train and Test recall side by side.
pivot_recall = All_models_results_recall.pivot(
    index=['Sample', 'Model'], columns='Dataset', values='recall_score')
# Overfit = absolute gap between test and train recall.
pivot_recall['Overfit'] = abs(pivot_recall['Test'] - pivot_recall['Train'])
# Column selection uses a list: selecting several columns with a bare tuple
# (['Overfit', 'Test'] without the inner brackets) is no longer accepted by
# pandas, and the f1/precision pivots already use the list form.
pivot_recall = pivot_recall.groupby('Model')[['Overfit', 'Test']].mean().sort_values('Overfit')
pivot_recall[pivot_recall.Test > 0]

In [None]:
# Precision learning curves without oversampling. The scorer must be
# precision_score (it was f1_score), because the pivot that consumes this
# frame selects the 'precision_score' column.
All_models_precision = Custom_Tools.overfit(X_train_final, y_trat, models, None, None)
All_models_results_precision = All_models_precision.overfit(metrics.precision_score)
# Same study with ADASYN oversampling to a 1.0 sampling strategy, appended
# to the results above.
All_models_precision = Custom_Tools.overfit(
    X_train_final, y_trat, models, None, ADASYN(sampling_strategy=1.0))
All_models_results_precision = pd.concat(
    [All_models_results_precision,
     All_models_precision.overfit(metrics.precision_score)])


In [None]:
pivot_precision = All_models_results_precision.pivot(index=['Sample', 'Model'], columns='Dataset', values='precision_score') # noqa E501
pivot_precision['Overfit'] = abs(pivot_precision['Test']-pivot_precision['Train'])
pivot_precision = pivot_precision.groupby('Model')[['Overfit', 'Test']].mean().sort_values('Overfit') # noqa E501
pivot_precision[pivot_precision.Test > 0]

In [None]:
fig = px.line(All_models_results_f1, x="Sample", y="f1_score",color="Dataset",facet_col="Model",facet_col_wrap=4, height=2500,range_y=[0,1]) # noqa E501
fig.show()

In [None]:
X_test_final = final_transformation(X_test, feature_final, 0, None)

In [None]:
X_test_final.columns

In [None]:
X_train_final, y_train_final = X_train_final, y_trat

In [None]:
print(X_train_final.shape)
print(y_train_final.shape)
print(X_test_final.shape)
print(y_test.shape)

In [None]:
def useful_metrics(X_df, y_df, model, threshold=0.5):
    """Score a fitted probabilistic classifier at a given decision threshold.

    Parameters
    ----------
    X_df : feature matrix accepted by ``model.predict_proba``.
    y_df : true labels aligned with ``X_df``.
    model : fitted estimator exposing ``predict_proba``.
    threshold : float, default 0.5
        A row is predicted positive when the probability in the last column
        of ``predict_proba`` is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns ``f1_score``, ``recall_score``,
        ``precision_score`` and ``brier_score_loss``, each rounded to two
        decimals. The first three use the thresholded labels; the Brier score
        uses the raw probabilities.
    """
    # Probabilities are computed once and reused for both the thresholded
    # metrics and the Brier score.
    positive_proba = model.predict_proba(X_df)[:, -1]
    results = positive_proba > threshold

    evaluation = {}

    # Label-based metrics; zero_division=0 yields 0 when there is nothing to divide by.
    for metric in (metrics.f1_score, metrics.recall_score, metrics.precision_score):
        evaluation[metric.__name__] = round(metric(y_df, results, zero_division=0), 2)

    # Probability-based metric.
    evaluation[metrics.brier_score_loss.__name__] = round(
        metrics.brier_score_loss(y_df, positive_proba), 2)

    return pd.DataFrame(evaluation, index=[0])

In [None]:
gnb = GaussianNB()

cat = CatBoostClassifier(silent=True)

xgb = XGBClassifier(use_label_encoder=False, verbosity=0)

lr = LogisticRegression()

models = [gnb, cat, xgb, lr]

for model in models:

    print(model.__class__.__name__)

    model.fit(X_train_final, y_train_final)

    print(useful_metrics(X_test_final[X_train_final.columns.tolist()],
                         y_test, model))


In [None]:
xgb.a

In [None]:
X_optuna.shape

In [None]:
X_optuna = pd.concat([X_train_final, X_test_final])
y_optuna = pd.concat([y_train_final, y_test])

In [None]:

# Candidate var_smoothing values: the powers of ten from 10**-9 up to 10**0.
sequence = [10**exponent for exponent in range(-9, 1)]

def objective_gnb(trial):
    """Optuna multi-objective function for Gaussian Naive Bayes.

    Samples ``var_smoothing`` from the notebook-level ``sequence`` and a
    decision ``threshold`` in [0.2, 1.0] (step 0.01), then scores the model
    with ``useful_metrics`` on 5 stratified folds of ``X_optuna``/``y_optuna``.

    Returns
    -------
    tuple of float
        Fold means of (f1, precision, recall, brier_score_loss), in that order.
    """
    skf = StratifiedKFold(n_splits=5)

    model_grid = {
        'var_smoothing': trial.suggest_categorical('var_smoothing', sequence)
    }

    param_grid = {
        'threshold': trial.suggest_float('threshold', 0.2, 1.0, step=0.01)
    }

    model = GaussianNB(**model_grid)

    # Per-fold score frames; concatenated once after the loop instead of
    # growing a DataFrame from an empty frame on every fold.
    fold_scores = []

    for train_index, test_index in skf.split(X_optuna, y_optuna):
        # Fold-local names so the notebook-level X_train/X_test are not shadowed.
        X_fold_train, y_fold_train = X_optuna.iloc[train_index], y_optuna.iloc[train_index]
        X_fold_test, y_fold_test = X_optuna.iloc[test_index], y_optuna.iloc[test_index]

        model.fit(X_fold_train, y_fold_train)

        fold_scores.append(useful_metrics(X_fold_test, y_fold_test, model, **param_grid))

    F_evaluation = pd.concat(fold_scores)

    return (F_evaluation.f1_score.mean(),
            F_evaluation.precision_score.mean(),
            F_evaluation.recall_score.mean(),
            F_evaluation.brier_score_loss.mean())

In [None]:
def objective_lr(trial):
    """Optuna multi-objective function for logistic regression with ADASYN.

    Samples the ADASYN ``sampling_strategy`` in [0.5, 1.0] and a decision
    ``threshold`` in [0.2, 1.0] (both step 0.01). On each of 5 stratified
    folds of ``X_optuna``/``y_optuna`` only the training part is oversampled;
    the model is then scored on the untouched test part with ``useful_metrics``.

    Returns
    -------
    tuple of float
        Fold means of (f1, precision, recall, brier_score_loss), in that order.
    """
    skf = StratifiedKFold(n_splits=5)

    model_grid = {
        'max_iter': 10000
    }

    oversample_grid = {
        'sampling_strategy': trial.suggest_float('sampling_strategy', 0.5, 1.0, step=0.01)
    }

    param_grid = {
        'threshold': trial.suggest_float('threshold', 0.2, 1.0, step=0.01)
    }

    model = LogisticRegression(**model_grid)

    over_under = ADASYN(**oversample_grid)

    # Per-fold score frames; concatenated once after the loop instead of
    # growing a DataFrame from an empty frame on every fold.
    fold_scores = []

    for train_index, test_index in skf.split(X_optuna, y_optuna):
        # Fold-local names so the notebook-level X_train/X_test are not shadowed.
        X_fold_train, y_fold_train = X_optuna.iloc[train_index], y_optuna.iloc[train_index]
        X_fold_test, y_fold_test = X_optuna.iloc[test_index], y_optuna.iloc[test_index]

        # Oversample the training fold only, so the test fold keeps the
        # original class balance.
        X_fold_train, y_fold_train = over_under.fit_resample(X_fold_train, y_fold_train)

        model.fit(X_fold_train, y_fold_train)

        fold_scores.append(useful_metrics(X_fold_test, y_fold_test, model, **param_grid))

    F_evaluation = pd.concat(fold_scores)

    return (F_evaluation.f1_score.mean(),
            F_evaluation.precision_score.mean(),
            F_evaluation.recall_score.mean(),
            F_evaluation.brier_score_loss.mean())

In [None]:
study = optuna.create_study(directions=['maximize','maximize', 'maximize', 'minimize'])
study.optimize(objective_lr, n_trials=100)

In [None]:
# Names for the study's objective values, in the order the objective returns them.
metrics_name = ['F1','Precision','Recall','Brier']

# One record per Pareto-optimal trial. A fresh dict is built for every trial
# and the frame is created once, instead of concatenating onto an empty
# DataFrame inside the loop.
records = []
for trial in study.best_trials:
    try_1 = dict(zip(metrics_name, trial.values))
    try_1['Params'] = str(trial.params)
    records.append(try_1)

tries = pd.DataFrame(records)

# Drop trials with zero precision and renumber the rows 0..n-1.
tries = tries[tries.Precision > 0].reset_index(drop=True)

tries.sort_values(['F1','Brier','Precision'], ascending= [False, True, False]).head(20)

In [None]:
def objective_xgb(trial):
    """Optuna multi-objective function for XGBoost at a fixed 0.5 threshold.

    Samples ``max_depth`` (3-7), ``min_child_weight`` (1-7), ``gamma`` (0-7)
    and ``max_delta_step`` (0-10), then scores the model with
    ``useful_metrics`` on 5 stratified folds of ``X_optuna``/``y_optuna``.

    Returns
    -------
    tuple of float
        (std of f1, mean f1, mean precision, mean recall, mean
        brier_score_loss) across the folds, in that order.
    """
    skf = StratifiedKFold(n_splits=5)

    model_grid = {
        # Prevent Overfit
        'max_depth': trial.suggest_int('max_depth', 3, 7, step=1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7, step=1),
        'gamma': trial.suggest_int('gamma', 0, 7, step=1),
        # Imbalanced dataset
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10, step=1)
    }

    # Threshold tuning is disabled; to re-enable it use:
    # param_grid = {
    #     'threshold': trial.suggest_float('threshold', 0.2, 1.0, step=0.01)
    # }
    param_grid = {'threshold': 0.5}

    model = XGBClassifier(use_label_encoder=False, verbosity=0, **model_grid)

    # Per-fold score frames; concatenated once after the loop instead of
    # growing a DataFrame from an empty frame on every fold.
    fold_scores = []

    for train_index, test_index in skf.split(X_optuna, y_optuna):
        # Fold-local names so the notebook-level X_train/X_test are not shadowed.
        X_fold_train, y_fold_train = X_optuna.iloc[train_index], y_optuna.iloc[train_index]
        X_fold_test, y_fold_test = X_optuna.iloc[test_index], y_optuna.iloc[test_index]

        model.fit(X_fold_train, y_fold_train)

        fold_scores.append(useful_metrics(X_fold_test, y_fold_test, model, **param_grid))

    F_evaluation = pd.concat(fold_scores)

    return (F_evaluation.f1_score.std(),
            F_evaluation.f1_score.mean(),
            F_evaluation.precision_score.mean(),
            F_evaluation.recall_score.mean(),
            F_evaluation.brier_score_loss.mean())

In [None]:
study = optuna.create_study(directions=['minimize', 'maximize','maximize', 'maximize', 'minimize'])
study.optimize(objective_xgb, n_trials=100)

duration = 2000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)


In [None]:
# Names for the study's objective values, in the order objective_xgb returns them.
metrics_name = ['std_F1','F1', 'Precision', 'Recall', 'Brier']

# One record per Pareto-optimal trial: objective values, then each sampled
# hyper-parameter as its own column, then the parameters as a string. A fresh
# dict is built for every trial and the frame is created once, instead of
# concatenating onto an empty DataFrame inside the loop.
records = []
for trial in study.best_trials:
    try_1 = dict(zip(metrics_name, trial.values))
    try_1.update(trial.params)
    try_1['Params'] = str(trial.params)
    records.append(try_1)

tries = pd.DataFrame(records)

# Drop trials with zero precision and renumber the rows 0..n-1.
tries = tries[tries.Precision > 0].reset_index(drop=True)

# Sort directions follow the study directions: std_F1 and Brier are minimised
# (ascending), Precision and F1 are maximised (descending). The previous list
# had the F1 and Brier directions swapped.
best_params = tries.sort_values(['std_F1', 'Precision','F1','Brier'], ascending= [True, False, False, True]).head(100)

best_params[(best_params.Recall > 0.200) & (best_params.Precision > 0.900)].drop_duplicates().sort_values('F1', ascending=False)

In [None]:
best_params.loc[best_params.index == 4,"Params"].to_dict()

In [None]:
study = optuna.create_study(directions=['maximize','maximize', 'maximize', 'minimize'])
study.optimize(objective_gnb, n_trials=200)

In [None]:
# Names for the study's objective values, in the order the objective returns them.
metrics_name = ['F1','Precision','Recall','Brier']

# One record per Pareto-optimal trial. A fresh dict is built for every trial
# and the frame is created once, instead of concatenating onto an empty
# DataFrame inside the loop.
records = []
for trial in study.best_trials:
    try_1 = dict(zip(metrics_name, trial.values))
    try_1['Params'] = str(trial.params)
    records.append(try_1)

tries = pd.DataFrame(records)

# Drop trials with zero precision and renumber the rows 0..n-1.
tries = tries[tries.Precision > 0].reset_index(drop=True)

tries

In [None]:
tries.sort_values(['F1','Brier','Precision'], ascending= [False, True, False]).head(20)

In [None]:
params = {'max_depth': 4, 'min_child_weight': 4, 'gamma': 1, 'max_delta_step': 9}


threshold = 0.5

xgb = XGBClassifier(use_label_encoder=False, verbosity=0, random_state=42, **params)

#xgb = XGBClassifier(use_label_encoder=False, verbosity=0, random_state=42)

In [None]:
X_test_final.columns

In [None]:


xgb.fit(X_train_final, y_train_final)

explainer = shap.Explainer(xgb.predict, X_train_final)
shap_values = explainer(X_test_final)

In [None]:
shap.summary_plot(shap_values, plot_type='violin')

In [None]:
X_test_analysis = X_test_final.copy()

X_test_analysis['pred_proba'] = xgb.predict_proba(X_test_final)[:, -1]

X_test_analysis['true'] = y_test

X_test_analysis = X_test_analysis.reset_index()

test = pd.DataFrame({'true': y_test, 'pred':xgb.predict_proba(X_test_final)[:, -1]})

limit = 40
bins = 50

test[test.true==1].pred.plot.hist(bins=bins, ylim=(0,limit), fc=(0, 0, 1, 0.5), legend='Legend', label='Voluntary Attrition', yticks=np.arange(0,limit,2))
test[test.true==0].pred.plot.hist(bins=bins, ylim=(0,limit), fc=(0, 0.4, 0.3, 0.5), legend='Legend', label='Active')

In [None]:
# One row of SHAP values per explained test sample (rows numbered 0..n-1),
# with one column per feature. The frame is built in a single call instead of
# concatenating one-row frames onto an empty DataFrame.
shap_analyze = pd.DataFrame(
    [list(row.values) for row in shap_values],
    columns=X_test_final.columns,
)

shap_analyze

In [None]:
def plot_analysis(title='test', dictionary_analysis=None):
    """Draw one box plot per entry of a mapping on a single 10x6 figure.

    Parameters
    ----------
    title : str, default 'test'
        Figure title.
    dictionary_analysis : mapping, optional
        Maps a label to the sequence of values to plot; the keys become the
        x tick labels (rotated 90 degrees). ``None`` is treated as an empty
        mapping, which avoids a shared mutable default argument.
    """
    if dictionary_analysis is None:
        dictionary_analysis = {}

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title(title)

    ax.boxplot(dictionary_analysis.values())
    ax.set_xticklabels(dictionary_analysis.keys(), rotation=90)

# Four box-plot figures: for each true label (0, then 1), the SHAP values of
# the rows predicted at or above 0.5, then of the rows predicted below 0.5.
for target_pred in range(0, 2):

    has_target = X_test_analysis.true == target_pred

    probability_groups = (
        ('prob >= 0.5', X_test_analysis.pred_proba >= 0.5),
        ('prob < 0.5', X_test_analysis.pred_proba < 0.5),
    )

    for group_label, in_group in probability_groups:

        # X_test_analysis was re-indexed 0..n-1, so its labels are used as
        # row positions into shap_analyze.
        index_analysis = X_test_analysis[has_target & in_group].index

        shap_analyze_chosen = shap_analyze.iloc[index_analysis, :]

        shap_analyze_chosen_dict = {
            column: shap_analyze_chosen[column] for column in shap_analyze_chosen
        }

        plot_analysis(f'True == {target_pred} and {group_label}', shap_analyze_chosen_dict)

In [None]:
# Attach the true label to every row of SHAP values.
X_test_analysis_trat = X_test_analysis[['true']]

shap_final = pd.concat([shap_analyze, X_test_analysis_trat], axis=1)

# Median SHAP value per (true label, feature).
shap_final_trat = pd.melt(shap_final, id_vars=['true']).groupby(['true', 'variable']).median().reset_index()

index_true = shap_final_trat['true'] == 0

# Flip the sign of the medians of the true == 0 rows. Only the 'value' column
# is negated; the previous version multiplied the whole frame (including the
# string 'variable' column) and relied on pandas picking the matching column.
shap_final_trat.loc[index_true, 'value'] = shap_final_trat.loc[index_true, 'value'] * -1

# Rescale by 10.
shap_final_trat.loc[:, 'value'] = shap_final_trat.loc[:, 'value'] * 10

shap_final_trat = shap_final_trat.sort_values('value', ascending=False)

# One row per feature, one column per true label (0 and 1).
shap_final_trat = shap_final_trat.pivot(index='variable', columns='true')['value']

# Sum of the sign-flipped class-0 median and the class-1 median, rounded to 3 decimals.
shap_final_trat['dif'] = np.round(shap_final_trat[0] + shap_final_trat[1], 3)

# Keep the features whose rounded difference is not zero, smallest first.
feature_shap_selection = shap_final_trat[shap_final_trat.dif != 0].sort_values('dif')

feature_shap_selection

In [None]:
a = ['tipo_filial_DIGITAL', 'filial_descricao_AFYA',
       'genero',
       'engineer_merit_indexes',
       'codsecao_gerencia_contagion',
       'salario', 'funcao_descricao_clean_contagion',
       'tempo_de_casa', 'engineer_promotion_indexes']

In [None]:
feature_shap_selection[feature_shap_selection.index.isin(a)]

In [None]:
['tipo_filial_DIGITAL', 'VP_descricao_VICEPRESIDENCIADESERVICOSDIGITAIS',
       'filial_descricao_AFYA', 'VP_descricao_VICEPRESIDENCIADEMERCADO',
       'genero', 'corraca_descricao_Branca',
       'VP_descricao_VICEPRESIDENCIADEINOVACAOESERVDIGITAIS',
       'funcao_descricao_clean_proportion_contagion', 'engineer_merit_indexes',
       'codsecao_gerencia_proportion_contagion', 'codsecao_gerencia_contagion',
       'salario', 'funcao_descricao_clean_contagion', 'ord_faixa_tempo_casa',
       'tempo_de_casa', 'engineer_promotion_indexes']

In [None]:
from yellowbrick.classifier import DiscriminationThreshold


# Instantiate the classification model and visualizer
model = XGBClassifier(use_label_encoder=False, verbosity=0, **params)
visualizer = DiscriminationThreshold(model)

visualizer.fit(X_train_final, y_train_final)        # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure

In [None]:
from yellowbrick.model_selection import validation_curve

viz = validation_curve(
    xgb, X_train_final, y_train_final,
    param_name= 'gamma',
    param_range= np.arange(1,7),
    cv=10, scoring='f1'
)

In [None]:
# Scan the cross-validated precision curve from the DiscriminationThreshold
# visualizer and keep the position of the best score seen so far; stop once
# 10 consecutive positions fail to improve on it.
chosen_index = 0
last_best = 0
# Counts consecutive NON-improving positions. Initialised up front so the loop
# cannot hit a NameError when the first score is not greater than 0.
consecutive_better = 0

for index, i in enumerate(visualizer.cv_scores_['precision'].tolist()):

    if i > last_best:
        last_best = i
        chosen_index = index
        consecutive_better = 0
    else:
        consecutive_better += 1

    if consecutive_better > 9:
        break

# '\n' (newline) instead of the literal '/n' that was printed before.
print(str(last_best) + '\n' + str(chosen_index))

print(str(visualizer.thresholds_[chosen_index]))

In [None]:
# Repeated random-split evaluation: refit the model on 100 shuffle splits
# (30% test) and record, per split, the threshold metrics and, per test row,
# the predicted probability, so per-row statistics can be aggregated below.
X_dataset, y_dataset = X_optuna, y_optuna

# Same object as `xgb` (not a copy): every fit below refits `xgb` itself.
model = xgb

evaluation = pd.DataFrame()

assertiveness = pd.DataFrame()

# NOTE(review): no random_state is set, so the splits differ between runs.
shf = ShuffleSplit(n_splits=100,test_size=0.3)

count = 0

# NOTE(review): this loop runs at cell level, so it rebinds the notebook-wide
# X_train, y_train, X_test and y_test to the last shuffle split.
for train_index, test_index in shf.split(X_dataset,y_dataset):

        X_train, y_train  = X_dataset.iloc[train_index].copy(), y_dataset.iloc[train_index].copy()
        X_test, y_test = X_dataset.iloc[test_index].copy(), y_dataset.iloc[test_index].copy()
        
        #Exclude if no oversample
        #X_train, y_train = over_under.fit_resample(X_train, y_train)

        model.fit(X_train,y_train)

        # Probability of the last class, rounded to 2 decimals, per test row.
        probabilities = [round(x,2) for x in model.predict_proba(X_test)[:,-1]]
        
        evaluation = pd.concat([evaluation, useful_metrics(X_test, y_test, model, threshold)],ignore_index=True)
        
        # Keep the original row label in an 'index' column so rows can be
        # grouped across splits.
        assertiveness_rec = pd.DataFrame({'index':X_test.index, 'probabilities': probabilities, 'y_true': y_test})
        
        assertiveness = pd.concat([assertiveness, assertiveness_rec])
        

        count+=1
        
        # Progress counter rewritten in place on one output line.
        print("Shuffle " + str(count), end = "\r")
        
# Whether the thresholded prediction matched the true label.
assertiveness['correct'] = assertiveness.y_true == (assertiveness.probabilities > threshold)
# NOTE(review): this is the absolute error |y - p|; the Brier score is the
# squared error — confirm which one is intended.
assertiveness['brier'] = abs(assertiveness.y_true - assertiveness.probabilities)
# 'precision' holds `correct` only for rows whose true label is 0 and
# 'recall' only for rows whose true label is 1; all other rows are NaN.
# NOTE(review): the y_true == 0 column measures correctness on negatives,
# which is not precision in the usual sense — confirm the naming.
assertiveness['precision'] = [y if x ==0 else np.nan for x,y in zip(assertiveness.y_true,assertiveness.correct)]
assertiveness['recall'] = [y if x ==1 else np.nan for x,y in zip(assertiveness.y_true,assertiveness.correct)]
        
# Per original row: aggregate over every split in which it was a test row.
grouped_stats = assertiveness.groupby('index').agg(
        avg_correct_percent =('correct', 'mean'),
        avg_precision = ('precision', np.nanmean),
        avg_recall = ('recall', np.nanmean),
        std = ('brier', np.std),
        brier = ('brier', np.mean),
        count = ('brier', 'count')
        )
        
# Join the per-row statistics back onto the features (inner join on the index).
X_dataset_stats = pd.merge(X_dataset,grouped_stats, left_index=True, right_index=True)

# Add one-hot columns of the target, prefixed with 'target'.
X_dataset_stats = pd.merge(X_dataset_stats,pd.get_dummies(y_dataset, prefix='target'), left_index=True, right_index=True)


In [None]:
duration = 1000  # milliseconds
freq = 440# Hz
winsound.Beep(freq, duration)
ax = sns.boxplot(x='variable', y='value', data=pd.melt(evaluation))
ax.set_xticklabels(ax.get_xticklabels(),rotation=30);

In [None]:
X_holdout_final = final_transformation(X_holdout,feature_final, 0, 'N')

In [None]:
xgb.fit(X_optuna, y_optuna)

metrics_final = useful_metrics(X_holdout_final, y_holdout, xgb, threshold)

confusion_matrix = metrics.confusion_matrix(
    y_holdout, xgb.predict_proba(X_holdout_final)[:, -1] > threshold)

tn, fp, fn, tp = confusion_matrix.ravel()
specificity = tn / (tn+fp)

In [None]:
test = pd.DataFrame({'true': y_holdout, 'pred':xgb.predict_proba(X_holdout_final)[:, -1]})

limit = 40
bins = 50

test[test.true==1].pred.plot.hist(bins=bins, ylim=(0,limit), fc=(0, 0, 1, 0.5), legend='Legend', label='Voluntary Attrition', yticks=np.arange(0,limit,2))
test[test.true==0].pred.plot.hist(bins=bins, ylim=(0,limit), fc=(0, 0.4, 0.3, 0.5), legend='Legend', label='Active')

In [None]:
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)

cm_display.plot()
plt.grid(False)
plt.show()

In [None]:
print(f"O teste final do modelo conseguiu capturar {round(metrics_final['recall_score'][0]*100,2)}% ({tp} pessoa(s)) que iriam sair.\n")

print(f'O custo desse tipo de predição foi de {round(100-specificity*100,1)}%.\n Resumindo, o modelo previu que {fp} pessoa(s) da base iria sair, mas não saiu...')

print('\nExcelente notícia!!!!')



duration = 1000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)


In [None]:
# Persist the final model and its decision threshold. Context managers close
# the pickle files instead of leaking the handles returned by open().
with open('turnover_general_model.pkl', 'wb') as model_file:
    dump(xgb, model_file)

with open('threshold.pkl', 'wb') as threshold_file:
    dump(threshold, threshold_file)