<a href="https://colab.research.google.com/github/100477706/Proyecto_Aprendizaje/blob/main/modelo_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **MODELO FINAL**

En el notebook de predicciones entrenamos distintos modelos, ajustando sus hiperparámetros, y los evaluamos utilizando el método de validación cruzada (inner) para escoger el mejor de ellos. El modelo escogido terminó siendo la SVM con los parámetros 'C' = 1.4, 'class_weight' = 'balanced' y 'kernel' = 'rbf', el cual evaluamos con el método holdout (outer) para obtener una estimación de cómo va a desempeñarse en el futuro y en la competición; este score fue de 85%. Ahora debemos pasar a entrenar el modelo con todos los datos disponibles y realizar nuestras predicciones para la competición.

# **CARGA DE DATOS**

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, accuracy_score, recall_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score
from sklearn.compose import ColumnTransformer
from sklearn import tree
from sklearn import metrics
import numpy as np
import time

In [4]:
# Load the labelled training data. The gzip-compressed CSV has to be uploaded
# to the working directory before running this cell.
df = pd.read_csv(
    "attrition_availabledata_06.csv.gz",
    compression="gzip",
    sep=",",
)

# Preview the first rows.
df.head()

Unnamed: 0,hrs,absences,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,5.896431,5,3,3,2.0,4.0,3.0,33,Travel_Rarely,Research & Development,...,Y,13,8,0,6.0,3,5,1,4,No
1,6.114877,19,3,3,3.0,3.0,3.0,20,Travel_Rarely,Research & Development,...,Y,13,8,1,2.0,2,2,2,2,No
2,6.39673,11,3,3,1.0,4.0,4.0,46,Travel_Rarely,Research & Development,...,Y,11,8,1,28.0,2,7,4,3,Yes
3,5.722452,22,4,3,1.0,1.0,4.0,40,Travel_Rarely,Sales,...,Y,13,8,1,9.0,2,8,7,1,No
4,5.663001,21,3,3,4.0,4.0,3.0,38,Travel_Frequently,Research & Development,...,Y,17,8,3,10.0,2,10,9,9,No


# **PREPROCESO DE DATOS**

In [5]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_vars = df.select_dtypes(include=['object']).columns.tolist()
numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Columns that must not be used as features: the target itself, the employee
# identifier, and the columns noted as constant.
irrelevant_cols = ['Attrition', 'EmployeeID', 'EmployeeCount', 'StandardHours', 'Over18']
categorical_vars = [col for col in categorical_vars if col not in irrelevant_cols]
numerical_vars = [col for col in numerical_vars if col not in irrelevant_cols]

# Feature matrix: every remaining column. No train/test split is made here;
# the model is later fitted on all rows of X.
X = df.drop(columns=irrelevant_cols)

# Encode the target as binary (Yes -> 1, No -> 0).
target_mapping = {'Yes': 1, 'No': 0}
y = df['Attrition'].map(target_mapping)

# Series.map turns any label missing from the mapping into NaN. Fail here with
# a clear message instead of passing a corrupted target on to the classifier.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_labels = df.loc[unmapped_rows, 'Attrition'].unique().tolist()
    raise ValueError(
        f"Unexpected 'Attrition' values (expected 'Yes' or 'No'): {unexpected_labels}"
    )

# **ENTRENO DEL MODELO**

In [7]:
# Numeric branch: fill missing values with the column mean, then scale with
# RobustScaler.
# NOTE(review): the original comment described median imputation, but the
# configured strategy is 'mean' — confirm which one the validated model used.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
]
num_transformer = Pipeline(numeric_steps)

# Categorical branch: one-hot encoding, configured to ignore unknown categories.
categorical_steps = [('encoder', OneHotEncoder(handle_unknown='ignore'))]
cat_transformer = Pipeline(categorical_steps)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_vars),
        ('cat', cat_transformer, categorical_vars),
    ]
)

# SVM configured with the hyperparameters selected as best.
svm_params = {'C': 1.4, 'class_weight': 'balanced', 'kernel': 'rbf'}
classifier = SVC(random_state=100477706, **svm_params)

# Full pipeline: preprocessing followed by the classifier.
final_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Fit on the X and y built in the preprocessing cell.
final_model = final_pipe.fit(X, y)

# **PREDICCIONES DE COMPETICIÓN**

In [3]:
# Load the competition data. The gzip-compressed CSV has to be uploaded to the
# working directory before running this cell.
cd = pd.read_csv(
    "attrition_competition_06.csv.gz",
    compression="gzip",
    sep=",",
)

# Preview the first rows.
cd.head()

Unnamed: 0,hrs,absences,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,BusinessTravel,Department,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,5.98373,20,2,3,4.0,3.0,3.0,50,Non-Travel,Human Resources,...,0.0,Y,11,8,0,19.0,3,18,0,13
1,7.963643,13,3,3,3.0,,3.0,40,Travel_Rarely,Sales,...,1.0,Y,12,8,2,21.0,4,21,7,7
2,6.13138,10,3,3,3.0,3.0,3.0,42,Non-Travel,Research & Development,...,2.0,Y,11,8,1,11.0,2,5,0,2
3,7.321722,16,2,3,1.0,3.0,3.0,25,Travel_Rarely,Sales,...,2.0,Y,14,8,1,6.0,3,2,2,2
4,6.466932,14,3,4,3.0,3.0,3.0,34,Non-Travel,Research & Development,...,9.0,Y,20,8,1,6.0,3,3,1,2


## **Preproceso de datos en las variables de competición**

In [8]:
# Identifier and constant columns that are dropped before predicting.
competition_irrelevant_cols = ['EmployeeID', 'EmployeeCount', 'StandardHours', 'Over18']
excluded = set(competition_irrelevant_cols)

# Column names by dtype (object -> categorical, int64/float64 -> numerical),
# skipping the excluded ones.
competition_categorical_vars = [
    name
    for name in cd.select_dtypes(include=['object']).columns
    if name not in excluded
]
competition_numerical_vars = [
    name
    for name in cd.select_dtypes(include=['int64', 'float64']).columns
    if name not in excluded
]

# Competition feature matrix. No train/test split happens here.
# NOTE(review): this rebinds X, which previously held the training features;
# re-running the training cell after this one would pick up competition data.
X = cd.drop(columns=competition_irrelevant_cols)

In [14]:
# Predict the encoded class for every competition row and report the share of
# each predicted class.
predictions = final_model.predict(X)
predictions_series = pd.Series(predictions)
class_shares = predictions_series.value_counts(normalize=True)
print(class_shares)

0    0.806122
1    0.193878
Name: proportion, dtype: float64


In [15]:
# Translate the encoded predictions (1/0) into 'Yes'/'No' labels.
label_by_class = {1: 'Yes', 0: 'No'}
predictions = final_model.predict(X)

# The predict() output is wrapped in a Series so that .map can be applied.
# Despite its name, atrition_df is a Series, not a DataFrame.
atrition_df = pd.Series(predictions).map(label_by_class)
predictions_series = pd.Series(atrition_df)

# Share of each predicted label.
label_shares = predictions_series.value_counts(normalize=True)
print(label_shares)

No     0.806122
Yes    0.193878
Name: proportion, dtype: float64


In [16]:
# Build the submission table: one row per competition employee with its
# predicted label.
# Both columns are passed as plain arrays so rows are paired by position.
# Passing Series would pair them by index label and silently insert NaN if the
# two indexes ever differed; arrays of different length raise a ValueError.
employee_ids = cd['EmployeeID'].to_numpy()
predicted_labels = atrition_df.to_numpy()
predictions_df = pd.DataFrame({'EmployeeID': employee_ids, 'Attrition': predicted_labels})

# index=False keeps the row index out of the file.
predictions_df.to_csv('predictions.csv', index=False)


In [17]:


# Reload the saved submission file to check what was written.
predictions_df = pd.read_csv('predictions.csv')

# Print the whole frame first, then only its first rows.
for view in (predictions_df, predictions_df.head()):
    print(view)

      EmployeeID Attrition
0           1964        No
1           3779        No
2           4350        No
3           4291        No
4           3076        No
...          ...       ...
1465        1591       Yes
1466         962        No
1467        1212        No
1468        3691        No
1469        3535        No

[1470 rows x 2 columns]
   EmployeeID Attrition
0        1964        No
1        3779        No
2        4350        No
3        4291        No
4        3076        No
