# 1. Importar librerías

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn import over_sampling
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from importlib import reload
import utils  # Importa el módulo
reload(utils)  # Recarga el módulo

<module 'utils' from 'c:\\Users\\USUARIO\\OneDrive - Universidad de Antioquia\\Aprendizaje\\Universidad\\2023-2\\Analítica II\\Caso Estudio Supervisado\\pediccion_abandono_empleo\\utils.py'>

# 2. Importar datos

In [2]:
df_manual = pd.read_csv('Datasets/seleccion_manual.csv')
df_anova = pd.read_csv('Datasets/seleccion_anova.csv')
df_total = pd.read_csv('Datasets/datos_preparados.csv')

## 3. Random Forest Classifier

In [22]:
# Definición de cuadricula de hiperparametros
parameters = {'max_depth': [30, 45, 60],
              'max_leaf_nodes': [30, 35, 40, 45, 50, 55],
              'min_samples_leaf': [3, 5, 7],
              'n_estimators': [800, 1000]}

### Con todas las características

In [4]:
X = df_total.drop(['target'], axis=1)
y = df_total.target

X_train, X_test, y_train, y_test = utils.split(X, y, test_size = 0.2)

X train shape:  (3520, 69)
y train shape:  (881, 69)
X test shape:  (3520,)
y test shape:  (881,)


In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

ranfor = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')

rand_s = GridSearchCV(ranfor,
                        parameters,
                        scoring = 'recall',
                        cv = 10,
                        n_jobs=-1)
rand_s.fit(X_train, y_train)

In [24]:
rand_s.best_estimator_

In [35]:
rf = RandomForestClassifier(max_depth=45,
                            max_leaf_nodes=45,
                            min_samples_leaf=3,
                            n_estimators=800,
                            n_jobs=-1,
                            random_state=42,
                            class_weight='balanced')
rf.fit(X_train, y_train)

In [36]:
rf.score(X_train, y_train)

0.9340909090909091

In [38]:
rf.score(X_test, y_test)

0.8864926220204313

In [25]:
rand_s.best_estimator_.score(X_train, y_train)

0.8928977272727273

In [26]:
rand_s.best_estimator_.score(X_test, y_test)

0.8671963677639046

In [44]:
rand_s.best_estimator_.fit(X_train, y_train)
y_pred = rand_s.best_estimator_.predict(X_train)

In [46]:
from sklearn.metrics import confusion_matrix

mc_train = confusion_matrix(y_pred, y_train)

tn, fp, fn, tp = mc_train.ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
especificidad = tn / (fp + tn)
f1_score = 2*(precision*recall)/(precision+recall)
print('-'*30,'TRAIN','-'*30)
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Especificidad: {especificidad}')
print(f'F1 score: {f1_score}')
print('Train score: ',rand_s.best_estimator_.score(X_train,y_train))

------------------------------ TRAIN ------------------------------
Precision: 0.3267857142857143
Recall: 1.0
Especificidad: 0.8870242732993707
F1 score: 0.4925975773889637
Train score:  0.8928977272727273


### Random Forest con la selección manual

In [16]:
#Obtener dummies y escalar variables
X_std = utils.transformar_datos(X)

#Separar datos en entrenamiento y test
X_train, X_test, y_train, y_test = utils.split(X_std, y, test_size=0.3)

X train shape:  (3080, 69)
y train shape:  (1321, 69)
X test shape:  (3080,)
y test shape:  (1321,)


In [8]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.8636363636363636

In [10]:
lr.score(X_test, y_test)

0.8637395912187736

### Exluyendo total_working_years, years_since_last_promotion, years_with_curr_managerSin 

In [97]:
df2 = df.drop(['total_working_years', 'years_since_last_promotion', 'years_with_curr_manager'], axis=1)

X_emp2 = df2.drop(['attrition'], axis=1)
y_emp2 = df2['attrition']

#Obtener dummies
X_dummies2 = pd.get_dummies(X_emp2)

#Escalar variables
cols_numericas2 = X_dummies2.select_dtypes(['float64', 'int64']).columns
pipeline = ColumnTransformer([('num', StandardScaler(), cols_numericas2)], remainder='passthrough')
X_std2 = pipeline.fit_transform(X_dummies2)

#Separar datos en entrenamiento y test
X_train2, X_test2, y_train2, y_test2 = split(X_std2, y_emp2, test_size=0.3)

X train shape:  (3080, 59)
y train shape:  (1321, 59)
X test shape:  (3080,)
y test shape:  (1321,)


In [98]:
lr2 = LogisticRegression()
lr2.fit(X_train2, y_train2)
lr2.score(X_train2, y_train2)

0.8590909090909091

In [99]:
lr2.score(X_test2, y_test2)

0.8516275548826646

#### Selección de variables con VarianceThreshold

In [100]:
X_emp3 = df.drop(['attrition'], axis=1)
y_emp3 = df['attrition']

#Obtener dummies
X_dummies3 = pd.get_dummies(X_emp3)

#Escalar variables
cols_numericas3 = X_dummies3.select_dtypes(['float64', 'int64']).columns
pipeline = ColumnTransformer([('num', StandardScaler(), cols_numericas3)], remainder='passthrough')
X_std3 = pipeline.fit_transform(X_dummies3)

In [101]:
def variance_threshold(X,th):
    var_thres=VarianceThreshold(threshold=th)
    var_thres.fit(X)
    new_cols = var_thres.get_support()
    return new_cols

In [102]:
new_cols = variance_threshold(X_std3, 0.08)
X_emp3_vt = X_std3[:,new_cols]

In [103]:
#Separar datos en entrenamiento y test
X_train3, X_test3, y_train3, y_test3 = split(X_emp3_vt, y_emp3, test_size=0.3)

X train shape:  (3080, 52)
y train shape:  (1321, 52)
X test shape:  (3080,)
y test shape:  (1321,)


In [104]:
lr2 = LogisticRegression()
lr2.fit(X_train3, y_train3)
lr2.score(X_train3, y_train3)

0.8639610389610389

In [105]:
lr2.score(X_test3, y_test3)

0.8591975775927327

DecisionThree

In [106]:
from sklearn import tree

In [107]:
clf = tree.DecisionTreeClassifier(
          criterion = 'gini',
          max_depth = 15,
          max_leaf_nodes=50,
          random_state=0)
clf.fit(X_train, y_train)

In [108]:
clf.score(X_train, y_train)

0.9224025974025974

In [109]:
clf.score(X_test, y_test)

0.8819076457229371

GradientBoostingClassifier

In [110]:
from sklearn.ensemble import GradientBoostingClassifier

# Entrenamiento del modelo: N_ESTIMATORS = 100, LEARNING_RATE = 0.1, , MAX_DEPTH = 4
# ===================================================================================
gboos = GradientBoostingClassifier(
            n_estimators = 150,
            learning_rate= 0.1,
            max_features = 10,
            random_state = 123
         )
gboos.fit(X_train, y_train)
gboos.score(X_train, y_train)

0.9321428571428572

Random Forest

In [111]:
from sklearn.ensemble import RandomForestClassifier
# Entrenamiento del modelo
# ==============================================================================
ranfor = RandomForestClassifier(
            n_estimators = 200,
            criterion    = 'gini',
            max_depth    = 20,
            max_leaf_nodes = 100,
            n_jobs       = -1,
            random_state = 123
         )
ranfor.fit(X_train, y_train)
ranfor.score(X_train, y_train)

0.9409090909090909

In [None]:
from sklearn.metrics import roc_curve, auc

# Calcular la curva ROC
fpr, tpr, _ = roc_curve(res_rf_gb['y_test'], res_rf_gb['y_pred_prob_test'][:,1])
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='red', lw=2, label='Curva ROC (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='darkgray', lw=2, linestyle='--', label='RANDOM CLASSIFIER')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FALSE POSITIVE RATE')
plt.ylabel('TRUE POSITIVE RATE')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()