# Machine learning: La forma clásica

## Cargar datos y crear características básicas

In [None]:
# Import pandas
import pandas as pd

In [None]:
# Read the attrition dataset from its relative path into a DataFrame
attrition_csv = "../data/attrition.csv"
df = pd.read_csv(attrition_csv)

In [None]:
# Preview the first 5 rows of the loaded data
df.head(5)

In [None]:
# Dummy-encode the DataFrame with pandas; the following cells rely on the
# Attrition_Yes / Attrition_No columns this step produces
df = pd.get_dummies(data=df)

In [None]:
# Preview the first 5 rows after dummy encoding
df.head(5)

In [None]:
# Rename the dummy column Attrition_Yes to Attrition (used as the target below).
# Rebind df instead of mutating it with inplace=True, in line with the other
# cells of this notebook, which all reassign df.
df = df.rename(columns={'Attrition_Yes': 'Attrition'})

In [None]:
# Remove the Attrition_No dummy column
df = df.drop('Attrition_No', axis=1)

In [None]:
# Remove the EmployeeID column
df = df.drop('EmployeeID', axis=1)

## Modelo: clasificación binaria

In [None]:
# Separate the target (y) from the predictor columns (X)
target_column = 'Attrition'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Find best variables with RFECV
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination scored by 5-fold cross-validated accuracy:
# one feature is removed per step and at least 10 features are always kept.
# The forest is seeded (same seed as the later cells) so that the selected
# feature set is reproducible across kernel restarts.
# NOTE(review): the selector is fitted on the full X / y, i.e. before the
# train/test split further down, so the held-out accuracy reported later is
# optimistic -- consider fitting the selector on the training portion only.
rfecv = RFECV(
    estimator=RandomForestClassifier(random_state=0),
    step=1,
    cv=5,
    scoring='accuracy',
    min_features_to_select=10,
    n_jobs=-1,
)
rfecv.fit(X, y)

In [None]:
# Reduce X to the features selected by the fitted RFECV and keep the result
# in X_selected (this cell stores the data; it does not display anything)
X_selected = rfecv.transform(X)

In [None]:
# Display the names of the features kept by the fitted RFECV
rfecv.get_feature_names_out()

In [None]:
# Hold out 20% of the rows as a test set; the seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Baseline: a seeded 100-tree random forest trained on the selected features
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build and train in a single expression (fit returns the estimator itself)
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Report accuracy on the test set
baseline_accuracy = accuracy_score(y_test, y_pred)
print(baseline_accuracy)

In [None]:
# Plot variable importance from the random forest
import matplotlib.pyplot as plt

# Increase the font size BEFORE the figure is built: rcParams only affect
# text created after the update, so changing them once the bars were already
# drawn did not reliably apply to this figure.
plt.rcParams.update({'font.size': 12})

# Pair every importance score with the name of the feature it belongs to
feature_names = rfecv.get_feature_names_out().tolist()
importances = model.feature_importances_
forest_importances = pd.Series(importances, index=feature_names)

# Draw on an explicit Axes rather than on pyplot's implicit current figure
fig, ax = plt.subplots(figsize=(20, 10))
forest_importances.nlargest(10).plot(kind='barh', ax=ax)
ax.invert_yaxis()  # put the most important variable at the top
# add title and labels
ax.set_title('Variable Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Variable')
# show the plot
plt.show()

## Modelo: clasificación binaria con grid search y cross validation

In [None]:
# Create a pipeline to search for the best parameters
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# The classifier is seeded like the baseline model, so score differences
# between grid points come from the hyper-parameters rather than from forest
# randomness, and a re-run reproduces the same best parameters.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=0))
])

# Create a dictionary of parameters to try (3 x 2 x 3 = 18 combinations);
# the 'classifier__' prefix routes each value to the 'classifier' step
parameters = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth': [4, 6, 8]
}

# Create a grid search object. cv and scoring are spelled out for the reader;
# they equal what GridSearchCV falls back to for a classifier (5-fold CV,
# accuracy) and mirror the settings used for RFECV earlier in the notebook.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

In [None]:
# Print the best parameter combination, then its score, one per line
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

In [None]:
# View the complete results as a table (one row per parameter combination)
# instead of printing the raw dict of arrays, which is hard to read
pd.DataFrame(grid_search.cv_results_)

In [None]:
# List mean and standard deviation of the test score for every parameter combination
cv_results = grid_search.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for avg_score, score_std, combo in zip(means, stds, params):
    print(f"{avg_score:f} ({score_std:f}) with: {combo!r}")

In [None]:
# Print the best pipeline found by the grid search
best_pipeline = grid_search.best_estimator_
print(best_pipeline)

In [None]:
# Predict on the test set with the fitted grid search; this overwrites the
# y_pred produced earlier by the baseline model
y_pred = grid_search.predict(X_test)

In [None]:
# Report test-set accuracy for the current predictions in y_pred
from sklearn.metrics import accuracy_score

tuned_accuracy = accuracy_score(y_test, y_pred)
print(tuned_accuracy)

In [None]:
# Plot variable importance from the random forest
import matplotlib.pyplot as plt

feature_names = rfecv.get_feature_names_out().tolist()
importances = grid_search.best_estimator_.steps[1][1].feature_importances_
forest_importances = pd.Series(importances, index=feature_names)
plt.figure(figsize=(20,10))
forest_importances.nlargest(10).plot(kind='barh').invert_yaxis()
# increase the font size
plt.rcParams.update({'font.size': 12})