In [117]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.metrics import MeanAbsoluteError, MeanSquaredError
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from tensorflow.keras.optimizers import Adam
from keras_tuner import HyperModel
from keras_tuner.tuners import Hyperband
import pandas as pd
from datetime import datetime

In [118]:
df = pd.read_csv('train_data.csv', sep = ';')
df_prueba = pd.read_csv('test_data.csv', sep = ';')

In [119]:
df['edad'] = pd.to_datetime(df['fecha_nacimiento'], format='%d/%m/%Y').apply(lambda x:datetime.now().year - x.year)

In [120]:
print(df.head())

   id_colaborador  id_ultimo_jefe  ...  abandono_6meses edad
0          100247        102074.0  ...                0   25
1          103355        102115.0  ...                1   28
2          100669        102060.0  ...                0   53
3          103760        102062.0  ...                1   32
4          100965        102062.0  ...                0   32

[5 rows x 17 columns]


In [121]:
caracteristicas_importantes = ['seniority', 'modalidad_trabajo', 'distancia_oficina', 'dias_baja_salud', 'salario', 'performance_score', 'psi_score', 'genero', 'canal_reclutamiento', 'estado_civil']
X = df[caracteristicas_importantes]
X_prueba = df_prueba[caracteristicas_importantes]
y = df['abandono_6meses']

imputer = SimpleImputer(strategy='mean')
X.loc[:, 'performance_score'] = imputer.fit_transform(X[['performance_score']])


In [122]:
print(len(X_prueba))

2020


In [123]:
caracteristicas_num = ['seniority', 'distancia_oficina', 'dias_baja_salud', 'salario', 'psi_score', 'performance_score']
caracteristicas_cat = ['modalidad_trabajo', 'genero', 'canal_reclutamiento', 'estado_civil']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), caracteristicas_num),
        ('cat', OneHotEncoder(), caracteristicas_cat)
    ])

X_preprocesado = preprocessor.fit_transform(X)
X_prueba_preprocesado = preprocessor.transform(X_prueba)
X_train, X_test, y_train, y_test = train_test_split(X_preprocesado, y, test_size=0.2, random_state=42)

In [124]:
modelo = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

modelo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

modelo.summary()

stopper = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)

history = modelo.fit(X_train, y_train, epochs=5000, validation_split=0.2, callbacks=[stopper], verbose=1)

test_loss, test_acc = modelo.evaluate(X_test, y_test, verbose=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5000


[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5543 - loss: 0.6781 - val_accuracy: 0.6261 - val_loss: 0.6326
Epoch 2/5000
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6676 - loss: 0.6241 - val_accuracy: 0.6348 - val_loss: 0.6170
Epoch 3/5000
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6823 - loss: 0.5966 - val_accuracy: 0.6174 - val_loss: 0.6146
Epoch 4/5000
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6846 - loss: 0.5903 - val_accuracy: 0.6290 - val_loss: 0.6107
Epoch 5/5000
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6542 - loss: 0.5941 - val_accuracy: 0.6348 - val_loss: 0.6285
Epoch 6/5000
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6736 - loss: 0.5684 - val_accuracy: 0.6319 - val_loss: 0.6105
Epoch 7/5000
[1m43/43[0m [32m━━━━━━━━━

In [126]:
predicciones = modelo.predict(X_prueba_preprocesado)

umbral_predicciones = (predicciones >= 0.5).astype(int)
umbral_predicciones = umbral_predicciones.flatten()

print(len(umbral_predicciones))
print(len(df_prueba['id_colaborador']))


# Crear un DataFrame con las predicciones

df_predicciones = pd.DataFrame({'ID': df_prueba['id_colaborador'], 'abandono_6meses': umbral_predicciones})

# Guardar las predicciones en un archivo CSV
df_predicciones.to_csv('predicciones.csv', index=False)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 706us/step
2020
2020


In [90]:
print('Accuracy: ', test_acc)

Accuracy:  0.6473317742347717


In [93]:
def train_evaluate_model(neurons, lr, batch_size, X_train, y_train, X_test, y_test):
    model = Sequential([
        Conv1D(filters=neurons, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(neurons, activation='relu'),
        Dense(1)  # Sin función de activación 'sigmoid' para regresión
    ])
    
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, 
                    loss='mean_squared_error',  # MSE es una elección común para regresión
                    metrics=[
                        MeanAbsoluteError(name='mae'),
                        MeanSquaredError(name='mse')
                        ])
    
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[stopper], verbose=0, batch_size=batch_size)
    
    metrics = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    
    # Calcular R-squared usando sklearn
    y_pred = model.predict(X_test)
    metrics['r2_score'] = r2_score(y_test, y_pred)
    
    return model, metrics

In [94]:
hyperparams_combinations = [
    {'neurons': 32, 'lr': 0.001, 'batch_size': 16},
    {'neurons': 32, 'lr': 0.001, 'batch_size': 32},
    {'neurons': 64, 'lr': 0.001, 'batch_size': 16},
    {'neurons': 64, 'lr': 0.01, 'batch_size': 32},
    {'neurons': 128, 'lr': 0.001, 'batch_size': 16},
    {'neurons': 128, 'lr': 0.01, 'batch_size': 32},
    {'neurons': 32, 'lr': 0.1, 'batch_size': 64},
    {'neurons': 64, 'lr': 0.1, 'batch_size': 64},
    {'neurons': 128, 'lr': 0.1, 'batch_size': 64},
    
]

best_mse = float('inf')
best_model = None
best_params = {}
best_metrics = {}

for params in hyperparams_combinations:
    model, metrics = train_evaluate_model(params['neurons'], params['lr'], params['batch_size'], X_train, y_train, X_test, y_test)
    print(f"Config: {params}, Metrics: {metrics}")
    if metrics['mse'] < best_mse:  # Para regresión, podríamos centrarnos en minimizar el MSE
        best_mse = metrics['mse']
        best_model = model
        best_params = params
        best_metrics = metrics

print(f"Mejor configuración: {best_params}, Mejores métricas: {best_metrics}")



  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Config: {'neurons': 32, 'lr': 0.001, 'batch_size': 16}, Metrics: {'loss': 0.220516175031662, 'mae': 0.4265490174293518, 'mse': 0.21946623921394348, 'r2_score': 0.12201695040899241}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Config: {'neurons': 32, 'lr': 0.001, 'batch_size': 32}, Metrics: {'loss': 0.22098851203918457, 'mae': 0.42709964513778687, 'mse': 0.2201915830373764, 'r2_score': 0.11911516380293619}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Config: {'neurons': 64, 'lr': 0.001, 'batch_size': 16}, Metrics: {'loss': 0.22366653382778168, 'mae': 0.4267660677433014, 'mse': 0.22240647673606873, 'r2_score': 0.11025434248729782}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Config: {'neurons': 64, 'lr': 0.01, 'batch_size': 32}, Metrics: {'loss': 0.22066400945186615, 'mae': 0.42876043915748596, 'mse': 0.22114256024360657, 'r2_score': 0.11531065408895713}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Config: {'neurons': 128, 'lr': 0.001, 'batch_size': 16}, Metrics: {'loss': 0.22001323103904724, 'mae': 0.4270206093788147, 'mse': 0.21940141916275024, 'r2_score': 0.1222760891713528}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Config: {'neurons': 128, 'lr': 0.01, 'batch_size': 32}, Metrics: {'loss': 0.21879032254219055, 'mae': 0.43073317408561707, 'mse': 0.21863415837287903, 'r2_score': 0.12534560788077753}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Config: {'neurons': 32, 'lr': 0.1, 'batch_size': 64}, Metrics: {'loss': 0.2502742111682892, 'mae': 0.5001119375228882, 'mse': 0.2502051889896393, 'r2_score': -0.000955496003409495}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Config: {'neurons': 64, 'lr': 0.1, 'batch_size': 64}, Metrics: {'loss': 0.23340913653373718, 'mae': 0.4522552490234375, 'mse': 0.23425672948360443, 'r2_score': 0.06284698166356284}


  super().__init__(


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
Config: {'neurons': 128, 'lr': 0.1, 'batch_size': 64}, Metrics: {'loss': 0.2525247037410736, 'mae': 0.4994717240333557, 'mse': 0.2521260976791382, 'r2_score': -0.008640201631642475}
Mejor configuración: {'neurons': 128, 'lr': 0.01, 'batch_size': 32}, Mejores métricas: {'loss': 0.21879032254219055, 'mae': 0.43073317408561707, 'mse': 0.21863415837287903, 'r2_score': 0.12534560788077753}


In [95]:
modelo.predict(X_test)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


array([[0.5900766 ],
       [0.435065  ],
       [0.18738483],
       [0.5952816 ],
       [0.65332276],
       [0.17144251],
       [0.46512675],
       [0.551577  ],
       [0.71866095],
       [0.15463127],
       [0.46580887],
       [0.16177818],
       [0.25927496],
       [0.5459267 ],
       [0.46414006],
       [0.20736507],
       [0.30630875],
       [0.551525  ],
       [0.9293813 ],
       [0.12951426],
       [0.6961035 ],
       [0.40040988],
       [0.62101185],
       [0.6889634 ],
       [0.43745595],
       [0.55127287],
       [0.73324794],
       [0.436726  ],
       [0.5730493 ],
       [0.42375758],
       [0.22801417],
       [0.753369  ],
       [0.7232077 ],
       [0.24558339],
       [0.5100696 ],
       [0.6457954 ],
       [0.65338206],
       [0.24543221],
       [0.36249867],
       [0.17180547],
       [0.3713726 ],
       [0.5498135 ],
       [0.21374628],
       [0.49290252],
       [0.6358477 ],
       [0.17923695],
       [0.4334338 ],
       [0.643

In [97]:
class CNNHyperModel(HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape

    def build(self, hp):
        model = Sequential()
        model.add(Conv1D(filters=hp.Int('filters', min_value=32, max_value=128, step=32),
                        kernel_size=hp.Choice('kernel_size', values=[3, 5]),
                        activation='relu',
                        input_shape=self.input_shape))
        model.add(MaxPooling1D(
            pool_size=hp.Choice('pool_size', values=[2, 3])))
        model.add(Flatten())
        model.add(Dense(units=hp.Int('units', min_value=32, max_value=128, step=32),
                        activation='relu'))
        # Sin función de activación 'sigmoid' para regresión
        model.add(Dense(1))

        lr = hp.Choice('learning_rate', values=[0.001, 0.01, 0.1])
        model.compile(optimizer=Adam(learning_rate=lr),
                        loss='mean_squared_error',
                        metrics=[MeanAbsoluteError(name='mae'), MeanSquaredError(name='mse')])
        return model


# Asegúrate de que X_train esté preparado adecuadamente
input_shape = (X_train.shape[1], 1)
hypermodel = CNNHyperModel(input_shape=input_shape)

tuner = Hyperband(
    hypermodel,
    max_epochs=10,
    objective='val_loss',
    seed=42,
    executions_per_trial=2,
    directory='my_dir',
    project_name='keras_tuner_demo'
)

stop_early = EarlyStopping(monitor='val_loss', patience=5)

tuner.search(X_train, y_train, epochs=100,
                validation_split=0.2, callbacks=[stop_early])

best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# Evalúa el mejor modelo
metrics = best_model.evaluate(X_test, y_test, verbose=0)
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred.flatten())

print(f"Mejores hiperparámetros: {best_hyperparameters.values}")
print(f"Mejores métricas: {metrics}")
print(f"R2 score: {r2}")

Reloading Tuner from my_dir\keras_tuner_demo\tuner0.json


  super().__init__(
  trackable.load_own_variables(weights_store.get(inner_path))


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Mejores hiperparámetros: {'filters': 96, 'kernel_size': 5, 'pool_size': 2, 'units': 128, 'learning_rate': 0.001, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
Mejores métricas: [0.21656344830989838, 0.4320356547832489, 0.21649251878261566]
R2 score: 0.13391343208744533


In [116]:
predicciones = modelo.predict(X_prueba_preprocesado)
print(predicciones)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 618us/step
[[0.26292157]
 [0.25565478]
 [0.35611767]
 ...
 [0.27591947]
 [0.27354664]
 [0.12439909]]


In [114]:
umbral_predicciones = (predicciones >= 0.5).astype(int)
umbral_predicciones = umbral_predicciones.flatten()

print(len(umbral_predicciones))
print(len(df_prueba['id_colaborador']))

2020
2020


In [115]:

# Crear un DataFrame con las predicciones

df_predicciones = pd.DataFrame({'ID': df_prueba['id_colaborador'], 'abandono_6meses': umbral_predicciones})

# Guardar las predicciones en un archivo CSV
df_predicciones.to_csv('predicciones.csv', index=False)