# 1. Importar librerías

In [51]:
#Librerías para manipulación de datos
import pandas as pd
import numpy as np

#Librerías para visualización
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate,GridSearchCV

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeClassifier

from sklearn.feature_selection import SelectKBest, f_classif,chi2

#Importar módulo de funciones
import sys
sys.path.insert(0, 'utils')  # Añadir el directorio 'utils' al PATH
import funciones
from importlib import reload
reload(funciones)  # Recarga el módulo

pd.set_option('display.max_columns',None)

# 2. Importar datos


In [52]:
df = pd.read_csv('data/data_2015_clean.csv', dtype={'StockOptionLevel': str, 'JobLevel': str, 'NumCompaniesWorked': 'int64'})

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4353 entries, 0 to 4352
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   InfoDate                 4353 non-null   object
 1   EmployeeID               4353 non-null   int64 
 2   Age                      4353 non-null   int64 
 3   DistanceFromHome         4353 non-null   int64 
 4   MonthlyIncome            4353 non-null   int64 
 5   NumCompaniesWorked       4353 non-null   int64 
 6   PercentSalaryHike        4353 non-null   int64 
 7   StockOptionLevel         4353 non-null   object
 8   TrainingTimesLastYear    4353 non-null   int64 
 9   YearsAtCompany           4353 non-null   int64 
 10  EnvironmentSatisfaction  4353 non-null   object
 11  JobSatisfaction          4353 non-null   object
 12  WorkLifeBalance          4353 non-null   object
 13  BusinessTravel           4353 non-null   object
 14  Department               4353 non-null  

# 2. Preparación de datos

## 2.1. Obtener dummies y Escalar

In [54]:
df_no_trans = df.drop(['Attrition','retirementDate','retirementType','resignationReason','InfoDate','EmployeeID'], axis=1)

num_columns = df_no_trans.select_dtypes('int64').columns
cat_columns = df_no_trans.select_dtypes('object').columns

In [55]:
# Crear un ColumnTransformer para aplicar transformaciones específicas a las columnas
transformer = ColumnTransformer([('num',StandardScaler(), num_columns),
                                 ('cat', OneHotEncoder(), cat_columns)])

# Ajustar y transformar el DataFrame
df_transformado = transformer.fit_transform(df_no_trans)

In [56]:
columns_transformed = transformer.get_feature_names_out(df_no_trans.columns)

In [57]:
df2 = pd.DataFrame(df_transformado, columns = columns_transformed)
df2['target'] = df['Attrition'].replace({'Sí':1, 'No':0})

# 2.2. Separar variable objetivo de los datos

In [58]:
X = df2.drop('target',axis=1)
y = df2['target']

print(X.shape)
print(y.shape)

(4353, 65)
(4353,)


# 3. Selección de variables

## 3.1. SelectKBest

#### Anova

In [59]:
def select_kbest(X,y,score_f,k):
    sel_kb = SelectKBest(score_func = score_f,k = k)
    sel_kb.fit(X,y)
    new_cols= sel_kb.get_support()
    print('Score:\n', sel_kb.scores_, '\n P-values:\n', sel_kb.pvalues_)
    return new_cols

In [60]:
X_new = select_kbest(X, y, f_classif, 15)
# Nuevo conjunto de datos
df_anova = X.iloc[:,X_new]
df_anova.head()

Score:
 [9.36130542e+01 8.12374511e-03 1.94697108e+00 7.74182132e+00
 8.61684071e-01 1.03433937e+01 8.27399526e+01 8.66151432e-02
 2.51847682e+00 3.81317628e+00 1.30427920e-02 4.45774368e+00
 4.54902513e+01 1.04227057e+00 7.46716502e+00 1.07214557e+00
 3.02748154e+01 5.74040757e-01 4.05072712e+01 9.28168774e-02
 6.82452685e-01 5.02596897e+01 1.24596982e+01 1.65972031e+01
 5.49753660e+01 1.33219287e+01 2.54680775e+01 3.81552795e-01
 2.53004278e+00 1.26661207e-01 6.57734788e-01 5.17716730e+00
 8.67680128e-01 4.50114767e-01 3.28362502e+01 2.22998544e-01
 1.41703332e-02 4.18179343e-02 2.97912298e+00 2.80247805e+00
 8.73949656e-01 8.73949656e-01 4.73668596e-02 1.33551718e+00
 6.21605497e-01 8.10896097e-02 1.30442291e+00 6.30794492e-01
 6.33811679e-01 1.07594314e-01 3.81957900e+00 4.97271525e+00
 4.35070875e+00 1.53682556e+00 2.06748666e+00 4.87580155e-01
 2.15360395e+01 3.01900480e+01 1.01679456e+02 3.71353102e+00
 1.13589277e+01 2.76987933e-01 1.81613474e+00 2.13286553e-01
 2.13286553e-01]

Unnamed: 0,num__Age,num__YearsAtCompany,cat__EnvironmentSatisfaction_Bajo,cat__JobSatisfaction_Bajo,cat__JobSatisfaction_Muy alto,cat__WorkLifeBalance_Mala,cat__WorkLifeBalance_Muy buena,cat__BusinessTravel_Non-Travel,cat__BusinessTravel_Travel_Frequently,cat__BusinessTravel_Travel_Rarely,cat__Department_Human Resources,cat__EducationField_Human Resources,cat__MaritalStatus_Divorced,cat__MaritalStatus_Married,cat__MaritalStatus_Single
0,-0.52393,-1.238141,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.52393,-1.238141,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.52393,-1.238141,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.25567,-1.238141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.25567,-1.238141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


#### Chi2

In [61]:
df_dummies = pd.get_dummies(df_no_trans)

In [62]:
X_new= select_kbest(df_dummies, y, chi2, 15)
# Nuevo conjunto de datos
df_chi2 = X.iloc[:,X_new]
df_chi2.head()

Score:
 [2.01881006e+02 5.82129840e-02 6.64732132e+04 1.78798873e+01
 7.56779058e-01 6.12871063e+00 3.53400332e+02 4.94478816e-02
 1.49945199e+00 3.39917001e+00 1.22933417e-02 3.06635969e+00
 3.64522940e+01 8.38214977e-01 5.20829475e+00 7.50144566e-01
 2.42197151e+01 4.65398445e-01 2.74967495e+01 7.13123884e-02
 6.12401701e-01 4.70136182e+01 4.83715451e+00 1.48658989e+01
 4.40953313e+01 3.84610266e+00 2.42316573e+01 1.32580381e-01
 1.76087670e+00 7.75489846e-02 5.81760182e-01 4.17864553e+00
 8.39793157e-01 3.28008799e-01 3.19985631e+01 1.31147789e-01
 1.26330828e-02 2.86312178e-02 2.81423785e+00 2.55092429e+00
 5.25348925e-01 3.48826866e-01 2.99482446e-02 8.50590130e-01
 5.28810581e-01 7.53107345e-02 1.24259171e+00 5.74025304e-01
 6.11726346e-01 8.85757995e-02 3.56011790e+00 4.47958147e+00
 4.11161314e+00 1.23086239e+00 1.61007837e+00 4.60185548e-01
 1.66819816e+01 1.62692562e+01 6.75476045e+01 1.51706229e+00
 1.06940114e+01 2.06438677e-01 1.63972227e+00 3.26458013e-02
 1.80728333e-01]

Unnamed: 0,num__Age,num__MonthlyIncome,num__NumCompaniesWorked,num__YearsAtCompany,cat__EnvironmentSatisfaction_Bajo,cat__JobSatisfaction_Bajo,cat__JobSatisfaction_Muy alto,cat__WorkLifeBalance_Mala,cat__BusinessTravel_Non-Travel,cat__BusinessTravel_Travel_Frequently,cat__Department_Human Resources,cat__EducationField_Human Resources,cat__MaritalStatus_Divorced,cat__MaritalStatus_Married,cat__MaritalStatus_Single
0,-0.52393,-0.467319,2.118864,-1.238141,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.52393,-0.467319,2.118864,-1.238141,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.52393,-0.467319,2.118864,-1.238141,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.25567,-0.922516,0.518896,-1.238141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.25567,-0.922516,0.518896,-1.238141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 3.2. RFE

In [63]:
# Función recursiva de selección de características
def recursive_feature_selection(X, y, model, k):
  rfe = RFE(model, n_features_to_select = k, step = 1)
  fit = rfe.fit(X, y)
  X_new = fit.support_
  print("Num Features: %s" % (fit.n_features_))
  print("Selected Features: %s" % (fit.support_))
  print("Feature Ranking: %s" % (fit.ranking_))

  return X_new

#### Regresión Logística

In [64]:
model= LogisticRegression(max_iter = 1000,random_state = 42)
X_new=recursive_feature_selection(X, y, model, 15)
df_log = X.iloc[:,X_new]
df_log.info()

Num Features: 15
Selected Features: [ True False False False False False  True False False False False False
  True False False False  True False  True False False  True False  True
  True False  True False False False False False  True False  True False
 False False False False False False False False False False False False
 False False  True  True False False False False False False  True False
  True False False False False]
Feature Ranking: [ 1 50 47  6 48 18  1 36 14 27 15 35  1 34  7 39  1 38  1 21 22  1  8  1
  1 26  1 20 19 44 45  5  1 46  1 42 33 31  3 32 40 43 24 25 41 23  2 30
 28 13  1  1  4 12 11 29  9 10  1 16  1 17 37 49 51]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4353 entries, 0 to 4352
Data columns (total 15 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   num__Age                               4353 non-null   float64
 1   num__YearsAtCompany                

#### Bosque Aleatorio

In [65]:
model = RandomForestClassifier(n_estimators=300,random_state=42,n_jobs=-1)
X_new = recursive_feature_selection(X,y,model,15)
df_ranfor = X.iloc[:,X_new]
df_ranfor.info()

Num Features: 15
Selected Features: [ True  True  True  True  True  True  True  True False False False False
  True False False False False False False False False False  True False
  True False False  True False False False False False False False False
 False False False False False False  True False False False False False
 False False False False False False False False False False  True  True
 False False False False False]
Feature Ranking: [ 1  1  1  1  1  1  1  1  9 28 43 17  1 29 10  6  5 23 14 20 36 13  1 45
  1 26 25  1 19  4 34 18 51 12 41  2 38 11 47 44  3 16  1  7 30 37 50 40
 49 22 48 42 35 15  8 46 32 24  1  1 31 21 27 33 39]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4353 entries, 0 to 4352
Data columns (total 15 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   num__Age                                4353 non-null   float64
 1   num__DistanceFromHome           

## 3.3. Select From Model

#### Ridge Classifier

In [66]:
# Selector de variables con Ridge
sel_ = SelectFromModel(RidgeClassifier(alpha = 0.1, random_state = 24), max_features = 15)
sel_.fit(X, y)
print(sel_.estimator_.coef_)
#Obtener variables seleccionadas
X_new = sel_.get_support()

df_ridge = X.iloc[:,X_new]
df_ridge.info()

[[-9.09673774e-02  1.87496456e-04 -3.76078433e-03  5.16770867e-02
   3.34229096e-03 -3.46419800e-02 -5.66734707e-02  8.16998319e-03
  -3.32443348e-02  4.93041731e-02 -2.42298214e-02 -2.41633519e-02
   1.11332485e-01 -2.19933927e-02 -6.51757400e-02  6.48400685e-03
   1.07162889e-01 -3.00810512e-03 -1.10638791e-01 -5.92966875e-02
  -5.58527730e-02  2.19108069e-01 -1.03958608e-01 -1.28733558e-01
   1.61569677e-01 -3.28361189e-02  9.98194495e-02 -3.70107426e-02
  -6.28087069e-02  7.05196736e-03  8.15560996e-03  5.73877010e-02
  -7.35778946e-02  9.82616256e-04  2.16464886e-01 -1.28594390e-02
  -2.52783984e-02 -3.81076937e-02 -9.44402237e-02 -4.57791308e-02
  -4.43711325e-03  4.43711325e-03  2.04206625e-02  2.46063806e-02
  -2.14322347e-03  4.96517939e-03 -4.78489990e-02 -2.99376547e-02
  -2.12811127e-02  2.33923897e-02 -6.12047206e-02 -6.60807404e-02
   1.04789158e-01  2.57125549e-02  6.23661179e-02 -3.77559924e-02
  -6.73153125e-02 -5.54117817e-02  1.22727094e-01 -5.92140750e-02
   8.65427

#### Gradient Boosting

In [67]:
# Selector de variables con Gradient Boosting
sel_ = SelectFromModel(GradientBoostingClassifier(n_estimators = 300, random_state=24))
sel_.fit(X, y)
print(sel_.estimator_.feature_importances_)
#Obtener variables seleccionadas
X_new = sel_.get_support()

df_gb = X.iloc[:,X_new]
df_gb.head()

[0.13686973 0.08519843 0.17044558 0.04298689 0.02992571 0.02991584
 0.08827219 0.00587755 0.0078594  0.00541291 0.00597628 0.00353081
 0.02598129 0.00377202 0.00849942 0.00703625 0.02024932 0.0067009
 0.01729523 0.00379891 0.00189162 0.02183015 0.00667605 0.00390977
 0.03131842 0.00371321 0.01650612 0.00148307 0.00411823 0.00334005
 0.00308869 0.00583353 0.0044154  0.00183441 0.01555812 0.00620047
 0.00325098 0.00798572 0.00060313 0.00316238 0.00203551 0.00347364
 0.01345241 0.00356992 0.00281043 0.00466554 0.0017542  0.00174217
 0.01193567 0.00375655 0.00464558 0.00544255 0.00361559 0.00347734
 0.00638097 0.00471816 0.00868761 0.00440004 0.03077984 0.00353072
 0.00842091 0.00659523 0.00489555 0.0013965  0.00149323]


Unnamed: 0,num__Age,num__DistanceFromHome,num__MonthlyIncome,num__NumCompaniesWorked,num__PercentSalaryHike,num__TrainingTimesLastYear,num__YearsAtCompany,cat__EnvironmentSatisfaction_Bajo,cat__JobSatisfaction_Bajo,cat__JobSatisfaction_Muy alto,cat__WorkLifeBalance_Mala,cat__BusinessTravel_Travel_Frequently,cat__Department_Human Resources,cat__EducationField_Human Resources,cat__MaritalStatus_Single
0,-0.52393,-0.019402,-0.467319,2.118864,-0.601649,0.930621,-1.238141,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,-0.52393,-0.019402,-0.467319,2.118864,-0.601649,0.930621,-1.238141,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,-0.52393,-0.019402,-0.467319,2.118864,-0.601649,0.930621,-1.238141,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1.25567,-1.007222,-0.922516,0.518896,0.767107,0.930621,-1.238141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.25567,-1.007222,-0.922516,0.518896,0.767107,0.930621,-1.238141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Prueba de Selecciones

### 4.1. ANOVA con Regresión Logística

In [68]:
anova_log = funciones.regresionLogistica(df_anova, y)

------------------------------ TRAIN ------------------------------
Precision: 0.2591687041564792
Recall: 0.6883116883116883
Especificidad: 0.6990066225165563
F1 score: 0.37655417406749553
Train score:  0.6975875933371626
------------------------------ TEST ------------------------------
Precision : 0.2478386167146974
Recall : 0.7478260869565218
Especificidad : 0.6547619047619048
F1 score : 0.3722943722943723
Test score:  0.6670493685419059


### 4.2. ANOVA con Bosque Aleatorio

In [69]:
anova_ranfor = funciones.bosqueAleatorio(df_anova, y)

------------------------------ TRAIN ------------------------------
Precision: 0.3519736842105263
Recall: 0.6948051948051948
Especificidad: 0.8043046357615894
F1 score: 0.46724890829694327
Train score:  0.7897759908098794
------------------------------ TEST ------------------------------
Precision : 0.312
Recall : 0.6782608695652174
Especificidad : 0.7724867724867724
F1 score : 0.4273972602739726
Test score:  0.7600459242250287


### 4.3. Chi2 con Regresión Logística

In [70]:
chi2_log = funciones.regresionLogistica(df_chi2, y)

------------------------------ TRAIN ------------------------------
Precision: 0.25100240577385724
Recall: 0.6774891774891775
Especificidad: 0.6907284768211921
F1 score: 0.366296079578701
Train score:  0.6889718552556002
------------------------------ TEST ------------------------------
Precision : 0.2455621301775148
Recall : 0.7217391304347827
Especificidad : 0.6626984126984127
F1 score : 0.36644591611479027
Test score:  0.6704936854190585


### 4.4. Chi2 con Bosque Aleatorio

In [71]:
chi2_log = funciones.bosqueAleatorio(df_chi2, y)

------------------------------ TRAIN ------------------------------
Precision: 0.40473225404732255
Recall: 0.7034632034632035
Especificidad: 0.8417218543046358
F1 score: 0.5138339920948617
Train score:  0.8233773693279725
------------------------------ TEST ------------------------------
Precision : 0.3521739130434783
Recall : 0.7043478260869566
Especificidad : 0.8029100529100529
F1 score : 0.46956521739130436
Test score:  0.7898966704936854


### 4.5. RFE Log con Regresión Logística

In [72]:
rfelog_log = funciones.regresionLogistica(df_log, y)

------------------------------ TRAIN ------------------------------
Precision: 0.25901089689857504
Recall: 0.6688311688311688
Especificidad: 0.7072847682119205
F1 score: 0.3734138972809668
Train score:  0.7021826536473291
------------------------------ TEST ------------------------------
Precision : 0.24477611940298508
Recall : 0.7130434782608696
Especificidad : 0.6653439153439153
F1 score : 0.36444444444444446
Test score:  0.6716417910447762


### 4.6. RFE Log con Bosque Aleatorio

In [73]:
rfelog_ranfor = funciones.bosqueAleatorio(df_log, y)

------------------------------ TRAIN ------------------------------
Precision: 0.33683105981112277
Recall: 0.6948051948051948
Especificidad: 0.7907284768211921
F1 score: 0.45371024734982324
Train score:  0.7780011487650775
------------------------------ TEST ------------------------------
Precision : 0.3082706766917293
Recall : 0.7130434782608696
Especificidad : 0.7566137566137566
F1 score : 0.43044619422572183
Test score:  0.7508610792192881


### 4.7. RFE ranfor con Regresión Logística

In [74]:
rferanfor_log = funciones.regresionLogistica(df_ranfor, y)

------------------------------ TRAIN ------------------------------
Precision: 0.24135280553420446
Recall: 0.6796536796536796
Especificidad: 0.6731788079470199
F1 score: 0.3562110039705048
Train score:  0.6740379092475589
------------------------------ TEST ------------------------------
Precision : 0.23209169054441262
Recall : 0.7043478260869566
Especificidad : 0.6455026455026455
F1 score : 0.34913793103448276
Test score:  0.653272101033295


### 4.8. RFE ranfor con Bosque Aleatorio

In [75]:
rferanfor_ranfor = funciones.bosqueAleatorio(df_ranfor, y)

------------------------------ TRAIN ------------------------------
Precision: 0.4252163164400494
Recall: 0.7445887445887446
Especificidad: 0.8460264900662252
F1 score: 0.5413060582218725
Train score:  0.8325674899483055
------------------------------ TEST ------------------------------
Precision : 0.34977578475336324
Recall : 0.6782608695652174
Especificidad : 0.8082010582010583
F1 score : 0.46153846153846156
Test score:  0.7910447761194029


### 4.9. SFM ridge con Regresión Logística

In [76]:
sfmridge_log = funciones.regresionLogistica(df_ridge, y)

------------------------------ TRAIN ------------------------------
Precision: 0.2528041415012942
Recall: 0.6341991341991342
Especificidad: 0.7132450331125828
F1 score: 0.3615052436767427
Train score:  0.7027570361861
------------------------------ TEST ------------------------------
Precision : 0.22254335260115607
Recall : 0.6695652173913044
Especificidad : 0.6441798941798942
F1 score : 0.33405639913232105
Test score:  0.6475315729047072


### 4.10. SFM ridge con Bosque Aleatorio

In [77]:
sfmridge_ranfor = funciones.bosqueAleatorio(df_ranfor, y)

------------------------------ TRAIN ------------------------------
Precision: 0.4252163164400494
Recall: 0.7445887445887446
Especificidad: 0.8460264900662252
F1 score: 0.5413060582218725
Train score:  0.8325674899483055
------------------------------ TEST ------------------------------
Precision : 0.34977578475336324
Recall : 0.6782608695652174
Especificidad : 0.8082010582010583
F1 score : 0.46153846153846156
Test score:  0.7910447761194029


### 4.11. SFM GBC con Regresión Logística

In [78]:
sfmgbc_log = funciones.regresionLogistica(df_gb, y)

------------------------------ TRAIN ------------------------------
Precision: 0.26578073089701
Recall: 0.6926406926406926
Especificidad: 0.7072847682119205
F1 score: 0.3841536614645858
Train score:  0.7053417576105686
------------------------------ TEST ------------------------------
Precision : 0.2606060606060606
Recall : 0.7478260869565218
Especificidad : 0.6772486772486772
F1 score : 0.3865168539325843
Test score:  0.6865671641791045


### 4.12. SFM GBC con Bosque Aleatorio

In [79]:
sfmgbc_ranfor = funciones.bosqueAleatorio(df_gb, y)

------------------------------ TRAIN ------------------------------
Precision: 0.42805755395683454
Recall: 0.7727272727272727
Especificidad: 0.8420529801324503
F1 score: 0.5509259259259259
Train score:  0.832854681217691
------------------------------ TEST ------------------------------
Precision : 0.35319148936170214
Recall : 0.7217391304347827
Especificidad : 0.798941798941799
F1 score : 0.4742857142857143
Test score:  0.7887485648679678


### 4.13. Todas con Regresión Logística

In [80]:
todas_log = funciones.regresionLogistica(X, y)

------------------------------ TRAIN ------------------------------
Precision: 0.2706766917293233
Recall: 0.7012987012987013
Especificidad: 0.7109271523178808
F1 score: 0.39059674502712477
Train score:  0.7096496266513498
------------------------------ TEST ------------------------------
Precision : 0.2484472049689441
Recall : 0.6956521739130435
Especificidad : 0.6798941798941799
F1 score : 0.36613272311212813
Test score:  0.6819747416762342


### 4.14. Todas con Bosque Aleatorio

In [81]:
todas_ranfor = funciones.bosqueAleatorio(X, y)

------------------------------ TRAIN ------------------------------
Precision: 0.4639303482587065
Recall: 0.8073593073593074
Especificidad: 0.8572847682119206
F1 score: 0.5892575039494471
Train score:  0.8506605399195865
------------------------------ TEST ------------------------------
Precision : 0.37264150943396224
Recall : 0.6869565217391305
Especificidad : 0.8240740740740741
F1 score : 0.48318042813455647
Test score:  0.8059701492537313


# 5. Exportar Selecciones

Se elige la selección SFM GBC con Bosque Aleatorio teniendo en cuenta el recall y el sobreajuste.

In [86]:
df_gb['target'] = df2['target']
df_gb.to_csv('data/seleccion_gb.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gb['target'] = df2['target']
