# Modelo de Machine Learning para el etiquetado de empleados dentro de Liverpool

# Librerías

In [None]:
''' 
    * Autor: Octavio Augusto Alemán Esparza
    * fecha: 12.11.2023
    * Titulo: model.ipynb
    * Descripción: Funciones para el modelado y etiquetado de empleados renunciantes dentro de Liverpool
'''

import pandas as pd #libreria standard para el manejo de datos
import numpy as np  #libreria standard para operaciones matemáticas
import matplotlib.pyplot as plt #libreria base para visualización de datos
import seaborn as sns #libreria avanzada para visualización de datos
from itertools import combinations

from tqdm import tqdm

import statistics
import warnings
import random
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

#plotting some predictions
from sklearn import tree
from mlxtend.plotting import plot_decision_regions
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

warnings.filterwarnings('ignore')
%matplotlib inline

# Carga de datos

In [None]:
# Directory that holds the input files
file_path = './data/'

# Name of the CSV file that is loaded in the next cell
file_name = 'data_for_model.csv'

Visualización de archivo

In [None]:
# Load the CSV into a DataFrame
df = pd.read_csv(file_path + file_name)

# Promote the first column to the index and label it 'index'
df.set_index(df.columns[0], inplace=True)
df.index.name = 'index'

# Preview the first 5 rows
df.head(5)

# Ajuste de datos

In [None]:
# Convert categorical columns to numeric codes
def dataToNumeric(df):
    """Label-encode every non-numeric column of ``df`` in place.

    Args:
        df: DataFrame to encode. Its non-numeric columns are overwritten
            with the integer codes produced by ``LabelEncoder``.

    Returns:
        The same DataFrame object that was passed in.
    """
    encoder = LabelEncoder()

    # Column names are resolved once, before any column is overwritten
    categorical_names = df.select_dtypes(exclude=['number']).columns

    for column_name in categorical_names:
        # fit_transform re-fits the encoder, so each column gets its own codes
        df[column_name] = encoder.fit_transform(df[column_name])

    return df

# Encode the non-numeric columns of the loaded DataFrame
df = dataToNumeric(df)

# Preview the first 5 rows of the result
df.head(5)

Sondeo de datos

In [None]:
# Descriptive statistics of the DataFrame
df.describe()

# Análisis de Atributos

In [None]:
def showCorr(df):
    """Render a Plotly heatmap of the pairwise correlations between the columns of ``df``.

    Column labels are cut to their first 10 characters before the
    correlation matrix is computed. The input DataFrame is not modified.

    Args:
        df: DataFrame whose columns are correlated with ``DataFrame.corr()``.

    Returns:
        Whatever ``fig.show()`` returns.
    """
    # Work on a relabelled copy so the caller's column names stay intact
    short_labels = [label[:10] for label in df.columns]
    shortened = df.copy().set_axis(short_labels, axis=1)

    heatmap = px.imshow(
        shortened.corr(),
        color_continuous_scale='PuRd',  # pink/purple palette
        title='Mapa de Calor de Correlación',
    )
    heatmap.update_layout(title_x=0.5)

    return heatmap.show()

# Display the correlation heatmap of the DataFrame
showCorr(df)

Desviación estándar - Z-Score

In [None]:
# Discard every row that contains a missing value
df = df.dropna()
def zScore(data=None):
    """Return the absolute z-score of every feature column.

    The last column is treated as the class label and is left out.

    Args:
        data: DataFrame to score. Defaults to the notebook-level ``df`` so
            existing ``zScore()`` calls keep working.

    Returns:
        ``np.abs(stats.zscore(...))`` computed over all columns except the
        last one.
    """
    frame = df if data is None else data

    # Every column but the last one is a feature
    feature_cols = list(frame.columns)[:-1]

    return np.abs(stats.zscore(frame[feature_cols]))

# Display the absolute z-scores of the feature columns
zScore()

Distribución de activos y renuncias

In [None]:
def populationDistribution():
    """Plot, per feature, the cumulative distribution of active vs. resigned employees.

    Reads the notebook-level ``df``. Rows with ``clase == 0`` form the active
    population and rows with ``clase == 1`` the resignations. Every column
    except the last one gets its own subplot (3 per row) showing the sorted
    feature values on the x axis against the percentage of the population on
    the y axis.

    Returns:
        Whatever ``fig.show()`` returns.
    """
    n_grid_cols = 3

    # Active population and resignations
    act = df[df['clase'] == 0]
    res = df[df['clase'] == 1]

    num_act = len(act)
    num_res = len(res)

    # Rank of each sorted sample expressed as a percentage of its population
    act_range = np.arange(num_act) / num_act * 100
    res_range = np.arange(num_res) / num_res * 100

    # Every column but the last one is plotted
    features = list(df.columns)[:-1]

    rows = int(np.ceil(len(features) / n_grid_cols))
    fig = sp.make_subplots(rows=rows, cols=n_grid_cols)

    # (legend label, rows, y values, line colour) for each population
    populations = (
        ("Activos", act, act_range, '#CD137A'),
        ("Renuncias", res, res_range, '#EEAABF'),
    )

    for i, column in enumerate(features):
        row_num = i // n_grid_cols + 1
        col_num = i % n_grid_cols + 1

        # One legend entry per population is enough. It is attached to the
        # first subplot so it exists no matter how many features there are
        # (it used to be tied to the subplot at row 3, column 3).
        legend = i == 0

        for label, population, y_range, color in populations:
            fig.add_trace(go.Scatter(x=population[column].sort_values(),
                                     y=y_range,
                                     name=label,
                                     mode='lines',
                                     marker=dict(color=color),
                                     legendgroup=label,
                                     legendgrouptitle_text=label,
                                     showlegend=legend), row=row_num, col=col_num)

        fig.update_xaxes(title_text=column,
                         row=row_num,
                         col=col_num,
                         showline=True,
                         showgrid=False,
                         showticklabels=True)

        fig.update_yaxes(row=row_num,
                         col=col_num,
                         showline=True,
                         showgrid=False)

    fig.update_layout(
        showlegend=True,
        height=300 * rows,
        width=1000,
        title='Distribución de activos y renuncias',
        title_x=0.5,
    )

    return fig.show()

# Display the cumulative distribution plots
populationDistribution()


Distribución de activos y renuncias - Histograma

In [None]:
def populationDistributionHist():
    """Plot, per feature, overlaid percentage histograms of active vs. resigned employees.

    Reads the notebook-level ``df``. Rows with ``clase == 0`` form the active
    population and rows with ``clase == 1`` the resignations. Every column
    except the last one gets its own subplot (3 per row).

    Returns:
        Whatever ``fig.show()`` returns.
    """
    n_grid_cols = 3

    # Active population and resignations
    act = df[df['clase'] == 0]
    res = df[df['clase'] == 1]

    # Every column but the last one is plotted
    features = list(df.columns)[:-1]

    rows = int(np.ceil(len(features) / n_grid_cols))
    fig = sp.make_subplots(rows=rows, cols=n_grid_cols)

    # (legend label, rows, bar colour) for each population
    populations = (
        ("Activos", act, 'teal'),
        ("Renuncias", res, 'orange'),
    )

    for i, column in enumerate(features):
        row_num = i // n_grid_cols + 1
        col_num = i % n_grid_cols + 1

        # One legend entry per population is enough. It is attached to the
        # first subplot so it exists no matter how many features there are
        # (it used to be tied to the subplot at row 3, column 3).
        legend = i == 0

        for label, population, color in populations:
            fig.add_trace(go.Histogram(x=population[column],
                                       name=label,
                                       marker_color=color,
                                       opacity=0.5,
                                       legendgroup=label,
                                       nbinsx=9,
                                       showlegend=legend,
                                       histnorm='percent'), row=row_num, col=col_num)

        fig.update_xaxes(title_text=column,
                         row=row_num,
                         col=col_num,
                         showline=True,
                         showgrid=False,
                         showticklabels=True)

        fig.update_yaxes(row=row_num,
                         col=col_num,
                         showline=True,
                         showgrid=False)

    fig.update_layout(
        showlegend=True,
        height=300 * rows,
        width=1000,
        barmode='overlay',  # draw both histograms on top of each other
        title='Distribución de activos y renuncias',
        title_x=0.5,
    )

    return fig.show()

# Display the overlaid histograms
populationDistributionHist()

# Submuestreo aleatorio

In [None]:
def classProportion():
    """Show a bar chart with the number of rows per value of ``clase``.

    Reads the notebook-level ``df``.

    Returns:
        Whatever ``fig.show()`` returns.
    """
    # Row count for each distinct class value
    class_counts = df['clase'].value_counts()

    bars = go.Bar(
        x=class_counts.index,
        y=class_counts.values,
        marker_color=['#CD137A', '#EEAABF'],
    )

    fig = go.Figure(data=[bars])
    fig.update_layout(
        title='Distribución de Clases',
        title_x=0.5,
        xaxis_title='Clase',
        yaxis_title='Total',
    )

    return fig.show()

# Display the class balance
classProportion()

Aleatorizar y particionar la muestra

In [None]:
# Shuffle all rows with a fixed seed
df = df.sample(frac = 1, random_state=57)

def removeClass(df, class_to_drop, n):
    """Randomly undersample one class.

    Removes up to ``n`` randomly chosen rows whose ``clase`` equals
    ``class_to_drop``. The selection uses a fixed seed (27) and the input
    DataFrame is not modified.

    Rows are removed by position rather than by index label, so a
    non-unique index can never cause rows of another class, or more than
    ``n`` rows, to be dropped.

    Args:
        df: DataFrame with a ``clase`` column.
        class_to_drop: Value of ``clase`` to undersample.
        n: Number of rows to remove. If the class has fewer rows, all of
            them are removed.

    Returns:
        A new DataFrame with the remaining rows in their original order.
    """
    # Relabel rows 0..len-1 so that index labels are row positions
    positional = df.reset_index(drop=True)
    candidates = positional[positional['clase'] == class_to_drop]

    # Shuffle the candidates and take the first n as the rows to discard
    doomed = candidates.sample(frac=1, random_state=27).head(n).index

    keep = np.ones(len(df), dtype=bool)
    keep[np.asarray(doomed, dtype=np.intp)] = False

    return df[keep].copy()

# Number of rows to remove at random
N = 95000

# Class to undersample (1 = resignations)
drop_class = 1

# Remove the rows; use drop_class instead of repeating the literal so that
# changing the setting above actually takes effect
df = removeClass(df, drop_class, N)

# Preview a random sample of the resulting DataFrame
df.sample(5)

Nueva Distribución de clases

In [None]:
# Display the class balance again, now on the current DataFrame
classProportion()

# Filtrado de datos

In [None]:
# Remove these four columns from the DataFrame in place
df.drop(['genero', 'sindicato', 'edad ingreso', 'edad salida'], axis = 1, inplace = True)

# Preview the DataFrame
df.head(5)

# Transformación de los datos

In [None]:
# Standardisation: every column except the last one is a feature,
# the last one is the class label
n_colsX = df.shape[1] - 1

X = df.iloc[:, 0:n_colsX]
Y = df.iloc[:, n_colsX]

# NOTE(review): the scaler is fitted on the whole dataset before the
# train/test split, so test rows influence the scaling statistics.
# Consider fitting it on the training split only.
rescaledX = StandardScaler().fit_transform(X)

# Keep the original row index so the scaled features stay label-aligned
# with Y (a fresh RangeIndex would only match Y by position)
newX = pd.DataFrame(data=rescaledX, columns=X.columns, index=X.index)

# Preview the scaled features
newX.head(5)

In [None]:
# Columna de Clase
# Preview the class column
Y.head(5)

# Clasificación

División de datos en entrenamiento y prueba

In [None]:
# Split into training and test sets: test_size is the held-out fraction,
# rnd_state the seed that makes the split reproducible
test_size = 0.2
rnd_state = 66
X_train, X_test, Y_train, Y_test = train_test_split(newX, 
                                                    Y, 
                                                    random_state=rnd_state, 
                                                    test_size=test_size)

In [None]:
# Datos de entrenamiento
# Preview the training features
X_train.head(5)

In [None]:
# Datos de prueba
# Preview the test features
X_test.head(5)

# Entrenamiento de los modelos

Definición de modelos

In [None]:
# List of (display name, estimator) pairs evaluated by the cells below
models = []

# Model definitions
# NOTE(review): none of the estimators sets random_state, so the tree, forest
# and MLP results vary between runs.

# Naive Bayes
models.append(('Naive Bayes', GaussianNB()))

# K-nearest neighbours with 5, 7 and 9 neighbours
models.append(('KNN, K = 5', KNeighborsClassifier(n_neighbors=5)))
models.append(('KNN, K = 7', KNeighborsClassifier(n_neighbors=7)))
models.append(('KNN, K = 9', KNeighborsClassifier(n_neighbors=9)))

# Decision trees and random forest
# NOTE(review): 'Decision Tree with entropy' is configured exactly like
# 'CART with entropy' — looks like a duplicate entry.
models.append(('CART with gini', DecisionTreeClassifier(criterion='gini')))
models.append(('CART with entropy', DecisionTreeClassifier(criterion='entropy')))
models.append(('CART with entropy and max_depth: 3', DecisionTreeClassifier(criterion='entropy', max_depth=3)))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('Decision Tree with entropy', DecisionTreeClassifier(criterion='entropy')))

# Multi-layer perceptron
models.append(('MLP Adam Identity, 5x2', MLPClassifier(solver='adam', hidden_layer_sizes= (5,2), max_iter=5000, activation = 'identity')))

# Support vector machines; probability=True enables predict_proba, which the
# evaluation cell calls on every model
models.append(('SVM RBF 3', SVC(kernel='rbf', C=0.5, gamma='auto', probability=True)))
models.append(('SVM RBF 4', SVC(kernel='rbf', C=0.5, gamma=0.07, probability=True)))
models.append(('SVM Linear 2', SVC(kernel='linear', C=0.5, probability=True)))
models.append(('SVM Poly 2', SVC(kernel='poly', degree=1, C=0.05,  probability=True)))
models.append(('SVM Poly 4', SVC(kernel='poly', degree=3, C=0.9,  probability=True)))


Entrenamiento

In [None]:
# Cross-validate every model on the training split
results=[]
names=[]

# Build the folds once, with a fixed seed: every model is then scored on the
# same 10 shuffled splits and reruns give the same partitions
kfold = KFold(n_splits=10, shuffle=True, random_state=rnd_state)

# Progress bar
total_iterations = len(models)
with tqdm(total=total_iterations, desc="Cargando") as pbar:
    for name, model in models:
        # One accuracy value per fold
        cv_result = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
        names.append(name)
        results.append(cv_result)

        # Advance the progress bar
        pbar.update(1)

print("Proceso completado")

Resultados

In [None]:
# Print the mean of the per-fold scores of each model, as a percentage
for i in range(len(names)):
  # NOTE(review): `total` is computed on every iteration but never used in this cell
  total=int((1-test_size)*100)
  print(f'{names[i]} entrenado con una precisión del: {round((results[i].mean() * 100), 2)}%\n')

Boxplot con los resultados por modelo

In [None]:
def boxplotModelResults():
    """Draw one box per model from the notebook-level ``results`` and annotate each median.

    Reads the notebook-level ``results`` (one array of scores per model) and
    ``names`` (the matching tick labels).

    Returns:
        Whatever ``plt.show()`` returns.
    """
    # Pink gradient used to colour the boxes
    palette = ['#CD137A', '#D12A84', '#D5408E', '#D95797',
               '#DD6DA1', '#E184AB', '#E59BB5', '#E9B1BF',
               '#EDC8C8', '#F1DED2', '#F5F5DC']

    # Median score of every model, rounded for display
    medians = [round(statistics.median(scores), 3) for scores in results]

    fig, ax = plt.subplots(figsize=(25, 10))
    sns.boxplot(data=results, ax=ax, palette=palette)
    ax.set_xticklabels(names)

    # Lift the text slightly above the median line
    label_offset = statistics.median(medians) * 0.01
    for tick in ax.get_xticks():
        median_value = medians[tick]
        ax.text(tick,
                median_value + label_offset,
                median_value,
                horizontalalignment='center',
                color='black',
                weight='semibold')

    plt.title('Precisión para modelos en fase entrenamiento', fontsize=20, fontweight='bold')
    plt.xticks(rotation=45)
    return plt.show()

# Display the box plot of the per-model scores
boxplotModelResults()

Prueba de modelos - Matrices de Confusión

In [None]:
# Fit every model on the training split, evaluate it on the test split and
# collect the ROC inputs that the plotting cells below read.
label = ["Activos", "Renuncias"]
FPR = []  # false positive rates, one array per model
TPR = []  # true positive rates, one array per model
TRESH = []  # ROC thresholds, one array per model
Y_scores = []  # predicted probability of class 1 for every test row, one array per model

for name, model in models:
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    print(f'Evaluando {name} con los datos de prueba')
    print('---------------------------------------')

    # Confusion matrix drawn as an annotated heatmap
    confusion = confusion_matrix(Y_test, Y_predict)
    sns.heatmap(confusion, annot=True, xticklabels=label, yticklabels=label, fmt='', cmap='Blues')  # NOTE(review): the original comment mentioned switching to pink tones, but the colormap is 'Blues'
    plt.show()

    # Individual cells of the confusion matrix
    TP = confusion[1, 1]  # True Positives (TP)
    TN = confusion[0, 0]  # True Negatives (TN)
    FP = confusion[0, 1]  # False Positives (FP) a "Type I error"
    FN = confusion[1, 0]  # False Negatives (FN) a "Type II error"

    # Classification Accuracy: Overall, how often is the classifier correct?
    acc = (TP + TN) / (TP + TN + FP + FN)
    print(f'ACC: {acc:.2f}')

    # Classification Error: Overall, how often is the classifier incorrect?
    error_rate = (FP + FN) / (TP + TN + FP + FN)
    print(f'Misclassification Rate: {error_rate:.2f}')

    # Sensitivity (recall): When the actual value is positive, how often is the prediction correct?
    recall = TP / (TP + FN)
    print(f'Recall: {recall:.2f}')

    # Precision: When a positive value is predicted, how often is the prediction correct?
    precision = TP / (TP + FP)
    print(f'Precision: {precision:.2f}')

    # F1-score
    f1 = f1_score(Y_test, Y_predict)
    print(f'F1 Score: {f1:.2f}')

    # ROC inputs: probability of class 1 for every test row
    Y_score = model.predict_proba(X_test)[:, 1]
    Y_scores.append(Y_score)

    FP_rates, TP_rates, Tresh_rates = roc_curve(Y_test, Y_score)
    FPR.append(FP_rates)
    TPR.append(TP_rates)
    TRESH.append(Tresh_rates)

    print('\n\n\n')

# Análisis Visual de Resultados

ROC

In [None]:
# Plot one ROC curve per model from the FPR/TPR arrays collected above.
# markers and lines are lookup tables indexed by model position; they only
# need to be at least as long as `names`.
markers = ['.', ',', 'o', 'v', '^', '>', '<', '*', '1', '2', '3', '4', '8', 's', 'p', 'P', 'h', 'H', '+', 'x', '|', '_', '_', 'x','1', '2', '3','.', ',', 'o', 'v', '^', '>','8', 's', 'p', 'P', 'h', 'H', '+', 'x', '|', '_', '_', 'x','1']
lines = ['-', '--', '-.', ':', '-', '--', '-.', ':', '-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':', '-', '--', '-.', ':', '-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':' ,'-', '--', '-.', ':', '-', '--', '-.', ':', '-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':','-', '--', '-.', ':'  ]

plt.figure(figsize=(12,8))
for i in range(len(names)):
  plt.plot(FPR[i], TPR[i], label=names[i], marker=markers[i], linestyle=lines[i])

# Diagonal reference line from (0, 0) to (1, 1)
sns.lineplot(x = [0, 1], y = [0, 1], color = 'red')
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)

plt.xlabel('False positives rate',fontsize=16)
plt.ylabel('True positives rate',fontsize=16)
plt.title('ROC', fontsize=28)
plt.legend(fontsize=16, loc=4)
plt.show()

# Print the area under the ROC curve of every model
print('ROC-AUC')
print('----')
for i in range(len(names)):
  print(f'ROC-AUC for {names[i]}: {roc_auc_score(Y_test, Y_scores[i])}')

Precision-recall Curve

In [None]:
# Plot one precision-recall curve per model on a shared axis, using the
# class-1 probabilities stored in Y_scores by the evaluation cell
plt.figure(figsize=(12,8))
displays = []
i = 0  # position of the current model inside Y_scores / displays

for name, model in models:
  precisionCurve, recallCurve, _ = precision_recall_curve(Y_test, Y_scores[i])
  displays.append(PrecisionRecallDisplay(precision=precisionCurve, recall=recallCurve, estimator_name=name))
  # Draw on the current axes so all curves end up in the same figure
  displays[i].plot(ax=plt.gca())
  i = i + 1



plt.xlabel('Recall',fontsize=16)
plt.ylabel('Precision',fontsize=16)
plt.title('PR Curve', fontsize=28)
plt.legend(fontsize=12, loc=4)
plt.show()



Espacios de clasificación

In [None]:
# Plot the decision regions of every model on a 2-D PCA projection
from sklearn.base import clone

pca = PCA(n_components=2)
X_pca = pca.fit_transform(rescaledX)
Y_plot = df['clase'].astype(int).values

for name,model in models:
    print(f'Modelando con:  {name}')
    # Fit an unfitted copy. Fitting `model` itself would overwrite the
    # estimator stored in `models` (trained on the full feature set) with
    # one trained on the two PCA components, which breaks every later cell
    # that reuses `models`.
    model_short = clone(model)
    model_short.fit(X_pca, Y_plot)
    plot_decision_regions(X_pca, Y_plot, clf=model_short, legend = 2 )
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

# Importancia de Atributos
* Para los modelos con mejor rendimiento

In [None]:
# Names of the feature columns (every column except the last one)
feature_labels = np.array(list(df.columns[:-1]))

for name,model in models:
    # Only the random forest is reported
    if 'Random' not in name:
        continue

    # Guard against the estimator having been (re)fitted on a different
    # feature space, e.g. a 2-D PCA projection: its importances would not
    # line up with feature_labels. Refit on the training split in that case.
    if getattr(model, 'n_features_in_', None) != len(feature_labels):
        model.fit(X_train, Y_train)

    print('_____________________________________')
    print(f'Modelando con:  {name}')

    importance = model.feature_importances_

    # argsort is ascending, so features are listed from least to most important
    feature_indexes_by_importance = importance.argsort()
    for index in feature_indexes_by_importance:
        print('{} - {:.2f}%'.format(str(feature_labels[index]).capitalize(), (importance[index] *100.0)))

# Funcionamiento del modelo
* Random Forest

Datos de prueba

In [None]:
# Draw 10 random rows of class 1 as sample input for the model
# (no seed is set, so the sample changes on every run)
test_data = df[df['clase'] == 1].sample(10).copy()
test_data

In [None]:
def identify_risk(df, scaler=None, estimators=None, low_threshold=0.6, medium_threshold=0.8):
    """Label every row of ``df`` with a resignation risk level.

    For each ``(name, model)`` pair whose name contains ``'Random'``, the
    class-1 probability of every row is predicted and mapped to a label that
    is written to the ``'Nivel de Riesgo'`` column of ``df`` (in place):

    * below ``low_threshold``                      -> ``"Riesgo Bajo"``
    * from ``low_threshold`` to ``medium_threshold`` -> ``"Riesgo Medio"``
    * ``medium_threshold`` and above               -> ``"Riesgo Alto"``

    Args:
        df: DataFrame whose last column (ignoring an existing
            ``'Nivel de Riesgo'`` column) is the class label and whose other
            columns are the model features.
        scaler: Optional fitted scaler exposing ``transform``. When omitted
            a new ``StandardScaler`` is fitted on the rows of ``df``, as
            before.
        estimators: Optional iterable of ``(name, model)`` pairs. Defaults
            to the notebook-level ``models``.
        low_threshold: Upper bound (exclusive) of the low-risk band.
        medium_threshold: Upper bound (exclusive) of the medium-risk band.

    Returns:
        The same DataFrame object, with the ``'Nivel de Riesgo'`` column set
        when a matching model was found.
    """
    risk_column = 'Nivel de Riesgo'
    candidates = models if estimators is None else estimators

    for name, model in candidates:
        if 'Random' not in name:
            continue

        # Ignore a risk column left by an earlier call or an earlier model;
        # otherwise the "all but the last column" rule would start treating
        # the class label as a feature.
        labelled = df.drop(columns=[risk_column], errors='ignore')
        X = labelled.iloc[:, :-1]

        if scaler is None:
            # NOTE(review): this fits a scaler on the rows being scored, not
            # on the training data, so features are scaled differently than
            # at training time. Pass the training scaler via `scaler` to
            # avoid that.
            scaled = StandardScaler().fit_transform(X)
        else:
            scaled = scaler.transform(X)
        newX = pd.DataFrame(data=scaled, columns=X.columns)

        # Probability of the positive class for every row
        probabilities = model.predict_proba(newX)[:, 1]

        # The first matching condition wins; anything else is high risk
        df[risk_column] = np.select(
            [probabilities < low_threshold, probabilities < medium_threshold],
            ["Riesgo Bajo", "Riesgo Medio"],
            default="Riesgo Alto",
        )

    return df

# Label the sample rows with their risk level and display the result
identify_risk(test_data)

