In [1]:
import pandas as pd
import numpy as np
import joblib  ### para guardar model
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split

# Import the project's helper-functions module
import sys
sys.path.insert(0, 'utils')  # Put the 'utils' directory first on the module search path
import funciones
from importlib import reload
reload(funciones)  # Re-execute the module so the kernel uses its current source

<module 'funciones' from 'c:\\recursos_humanos\\utils\\funciones.py'>

Función para preprocesamiento

In [290]:
def preprocesamiento(data_name):
    """Build the feature matrix that the deployment step feeds to the model.

    Pipeline:
      1. Load the raw frame with ``funciones.get_data(data_name)`` and keep
         the predictor columns.
      2. Map the integer survey codes to their text labels.
      3. Drop rows holding a categorical value outside the known lists
         (missing values are kept so they can be imputed in step 5).
      4. Numeric columns: replace values outside ``[q1 - 3*IQR, q3 + 3*IQR]``
         and missing values with the column median.
      5. Categorical columns: replace missing values with the column mode.
      6. One-hot encode the categoricals and keep only ``features_names``.

    Parameters
    ----------
    data_name
        Forwarded unchanged to ``funciones.get_data``; presumably a dataset
        identifier -- confirm against that helper.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.Series]
        The feature frame (columns in ``features_names`` order) and the
        ``EmployeeID`` values of the rows that were kept. Both share a fresh
        ``0..n-1`` index, so they line up row by row.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder

    num_cols = ['Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
                'PercentSalaryHike', 'TrainingTimesLastYear', 'YearsAtCompany']

    # Integer survey codes -> text labels.
    satisfaction_labels = {1: 'Bajo', 2: 'Medio', 3: 'Alto', 4: 'Muy alto'}
    label_maps = {
        'EnvironmentSatisfaction': satisfaction_labels,
        'JobSatisfaction': satisfaction_labels,
        'WorkLifeBalance': {1: 'Mala', 2: 'Buena', 3: 'Muy buena', 4: 'La mejor'},
    }

    # Categories accepted per column; rows with any other value are dropped.
    known_categories = {
        'EnvironmentSatisfaction': ['Bajo', 'Medio', 'Alto', 'Muy alto'],
        'JobSatisfaction': ['Muy alto', 'Alto', 'Bajo', 'Medio'],
        'WorkLifeBalance': ['La mejor', 'Buena', 'Mala', 'Muy buena'],
        'BusinessTravel': ['Travel_Frequently', 'Travel_Rarely', 'Non-Travel'],
        'Department': ['Sales', 'Research & Development', 'Human Resources'],
        'EducationField': ['Life Sciences', 'Medical', 'Other', 'Technical Degree',
                           'Marketing', 'Human Resources'],
        'MaritalStatus': ['Single', 'Married', 'Divorced'],
    }
    cat_cols = list(known_categories)

    # Columns returned to the caller, in this exact order.
    features_names = ['Age', 'DistanceFromHome', 'MonthlyIncome',
       'NumCompaniesWorked', 'PercentSalaryHike',
       'TrainingTimesLastYear', 'YearsAtCompany',
       'cat__EnvironmentSatisfaction_Bajo', 'cat__JobSatisfaction_Bajo',
       'cat__JobSatisfaction_Muy alto', 'cat__WorkLifeBalance_Mala',
       'cat__BusinessTravel_Travel_Frequently',
       'cat__Department_Human Resources',
       'cat__EducationField_Human Resources', 'cat__MaritalStatus_Single']

    data_2016 = funciones.get_data(data_name)

    data = data_2016[num_cols + cat_cols].copy()
    data = data.astype({col: 'object' for col in cat_cols})
    # Nullable integer dtype so missing values do not force a float column.
    data['NumCompaniesWorked'] = data['NumCompaniesWorked'].astype('Int64')

    # Assign the result back instead of `col.replace(..., inplace=True)`:
    # in-place calls on a column selection are deprecated and stop updating
    # the parent frame under pandas Copy-on-Write.
    for col, mapping in label_maps.items():
        data[col] = data[col].replace(mapping)

    # Keep rows whose categorical values are all known (or missing).
    # The mask is positional so it can filter the employee IDs identically.
    keep = np.ones(len(data), dtype=bool)
    for col, allowed in known_categories.items():
        keep &= (data[col].isna() | data[col].isin(allowed)).to_numpy()
    data = data.loc[keep].reset_index(drop=True)
    employee_id = data_2016['EmployeeID'].loc[keep].reset_index(drop=True)

    # Numeric columns: impute outliers and missing values with the median.
    for col in num_cols:
        q1 = data[col].quantile(.25)
        q3 = data[col].quantile(.75)
        iqr = q3 - q1
        lim_inf = q1 - 3 * iqr
        lim_sup = q3 + 3 * iqr

        # Median is taken before the outliers are overwritten.
        median = int(data[col].median())

        # A value is an outlier when it is below the lower bound OR above the
        # upper one (requiring both at once can never be true).
        is_outlier = ((data[col] < lim_inf) | (data[col] > lim_sup)).fillna(False).astype(bool)
        data.loc[is_outlier, col] = median

        data[col] = data[col].fillna(median)

    # Categorical columns: impute missing values with the most frequent one.
    for col in cat_cols:
        modes = data[col].mode()
        if not modes.empty:  # empty when the whole column is missing
            data[col] = data[col].fillna(modes.iloc[0])

    # One-hot encode; sparse_threshold=0 forces a dense array regardless of
    # how sparse the encoded block turns out to be.
    transformer = ColumnTransformer([('cat', OneHotEncoder(), cat_cols)],
                                    remainder='passthrough', sparse_threshold=0)
    encoded = transformer.fit_transform(data)
    encoded_columns = transformer.get_feature_names_out(data.columns)
    data = pd.DataFrame(encoded, columns=encoded_columns)
    data = data.rename(columns={f'remainder__{col}': col for col in num_cols})

    # reindex instead of data[features_names]: a dummy column whose category
    # is absent from this batch is created as all zeros rather than raising
    # KeyError.
    return data.reindex(columns=features_names, fill_value=0.0), employee_id

In [295]:
def despliegue(data_name='data_2016', model_path='salidas/dt_final.pkl',
               output_path='salidas/despliegue.xlsx'):
    """Score a dataset with the saved model and export the predicted leavers.

    Parameters
    ----------
    data_name
        Dataset name forwarded to ``preprocesamiento``.
    model_path
        Path of the joblib-serialised model to load.
    output_path
        Path of the Excel file to write.

    Returns
    -------
    pandas.DataFrame
        The rows predicted as class 1, sorted by ``prob_abandono`` in
        descending order, with the columns ``target``, ``employee_id``,
        ``prob_abandono`` and ``path`` appended to the features. ``path`` is
        the comma-terminated list of tree node ids visited by the sample.

    Raises
    ------
    ValueError
        If ``preprocesamiento`` returns a different number of employee IDs
        than feature rows, since they could not be paired reliably.
    """
    # NOTE: joblib.load unpickles the file and can therefore execute
    # arbitrary code -- only load model files from a trusted source.
    dt = joblib.load(model_path)

    data, employee_id = preprocesamiento(data_name)

    if len(employee_id) != len(data):
        raise ValueError(
            f'preprocesamiento returned {len(employee_id)} employee ids '
            f'for {len(data)} feature rows'
        )

    predict = dt.predict(data)
    prob_predict = dt.predict_proba(data)

    # One boolean row per sample, True at every node the sample goes through.
    node_indicator = dt.decision_path(data).toarray().astype(bool)

    # Build each path in a single pass over its own row; the row count comes
    # from the data itself instead of a fixed constant.
    paths_final = [
        ''.join(f'{node},' for node in np.flatnonzero(row))
        for row in node_indicator
    ]

    # assign() returns a new frame, leaving the feature frame untouched.
    # The IDs are attached by position: both objects keep the row order
    # produced by preprocesamiento, while their index labels may differ.
    resultado = data.assign(
        target=predict,
        employee_id=np.asarray(employee_id),
        prob_abandono=prob_predict[:, 1],
        path=paths_final,
    )
    data_abandono = resultado[resultado.target == 1].sort_values(by='prob_abandono', ascending=False)

    data_abandono.to_excel(output_path, index=False)

    return data_abandono

In [296]:
# Run the deployment step; the returned frame (rows predicted as class 1,
# sorted by descending probability) is shown as the cell output.
despliegue()

Unnamed: 0,Age,DistanceFromHome,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,TrainingTimesLastYear,YearsAtCompany,cat__EnvironmentSatisfaction_Bajo,cat__JobSatisfaction_Bajo,cat__JobSatisfaction_Muy alto,cat__WorkLifeBalance_Mala,cat__BusinessTravel_Travel_Frequently,cat__Department_Human Resources,cat__EducationField_Human Resources,cat__MaritalStatus_Single,target,employee_id,prob_abandono,path
6,33,10,43020,2,13,3,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,3938,1.000000,0132458199221332384459461
509,24,28,28860,1,19,5,1,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1,3554,1.000000,013245759113193234266277298
2344,44,6,45580,1,13,2,6,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1,600,1.000000,"0,2,5,7,19,27,31,34,37,88,97,100,117,127,130,1..."
2343,44,6,45580,1,13,2,6,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1,2070,1.000000,"0,2,5,7,19,27,31,34,37,88,97,100,117,127,130,1..."
414,31,9,168850,2,14,2,1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,3940,1.000000,013245759113193234266277298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,48,24,16010,4,12,3,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,781,0.685429,0258183
1723,56,1,25590,1,21,3,4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,2567,0.685429,0257202147136141451
3340,47,1,57620,1,11,4,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2954,0.685429,0269209353437440441
1666,35,24,20700,1,13,1,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,468,0.685429,0258183
