# Predicciones

In [114]:
import pandas as pd
import sklearn as sk
from sklearn import tree
from sklearn import model_selection
from sklearn import metrics

from matplotlib import pyplot as plt
# Default size, in inches (width, height), for every figure drawn in this notebook.
plt.rc('figure', figsize=[30, 10])

## 1. Lectura de datos

In [60]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="HR-Employee-Attrition.csv")

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


## 2. Análisis exploratorio de datos

In [None]:
# Histogram of the Age column using 40 bins. The title and axis labels make the
# figure readable on its own; the trailing ';' hides the return value's repr.
ax = df["Age"].hist(bins=40)
ax.set(title="Distribución de la edad (Age)", xlabel="Age", ylabel="Frecuencia");

## 3. Preprocesamiento de datos

In [None]:
# Missing values: count the NaN entries of each column.
# (This cell only reports the counts; it does not drop or fill anything.)
df.isnull().sum(axis=0)

In [None]:
## Convert everything to numbers
# Start by listing the dtype of every column of df.
df.dtypes

In [62]:
# Encode every non-numeric column as numbers so the frame can be fed to scikit-learn.


def _encode_labels(series, mapping):
    """Translate the values of ``series`` through ``mapping``, rejecting unknown labels.

    ``Series.map`` silently turns any value that is not a key of ``mapping``
    into NaN. That is also what happens when this cell is executed a second
    time on already-encoded data. Raising instead keeps such mistakes visible.

    Args:
        series: Column whose non-null values are expected to be keys of ``mapping``.
        mapping: Dict from original label to its numeric code.

    Returns:
        A new Series with every label replaced by its code (NaN stays NaN).

    Raises:
        ValueError: If ``series`` holds a non-null value that is not a key
            of ``mapping``.
    """
    unexpected = set(series.dropna().unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Column {series.name!r} contains values outside the mapping: "
            f"{sorted(unexpected, key=str)}; was this cell already run?"
        )
    return series.map(mapping)


# Build the result on a copy so an error part-way through leaves `df` as it was.
encoded = df.copy()

# Yes/No columns become 1/0.
yes_no = {"Yes": 1, "No": 0}
encoded["Attrition"] = _encode_labels(encoded["Attrition"], yes_no)
encoded["OverTime"] = _encode_labels(encoded["OverTime"], yes_no)

# Travel frequency is encoded on a 0-1 scale.
encoded["BusinessTravel"] = _encode_labels(
    encoded["BusinessTravel"],
    {"Non-Travel": 0, "Travel_Rarely": 0.5, "Travel_Frequently": 1.0},
)

# One-hot encode the nominal columns one at a time. Each call appends its dummy
# columns at the end of the frame, so this order fixes the final column order.
for column in ["Department", "EducationField", "Gender"]:
    encoded = pd.get_dummies(encoded, columns=[column])

# Flag director/manager roles before JobRole itself is replaced by dummies.
encoded["Manager_Director"] = encoded["JobRole"].isin(
    ["Manufacturing Director", "Research Director", "Manager"]
)
for column in ["JobRole", "MaritalStatus"]:
    encoded = pd.get_dummies(encoded, columns=[column])

# Over18 is removed instead of being encoded.
encoded = encoded.drop(columns=["Over18"])

# NOTE(review): in the rows shown by df.head(), EmployeeCount is always 1 and
# StandardHours is always 80, and EmployeeNumber looks like a row identifier.
# Consider dropping them as well -- confirm against the full data first.

# Rebind only after every step has succeeded.
df = encoded

## 4. Entrenamiento de los modelos (v1)

In [73]:
# Target vector: the encoded Attrition column.
y = df["Attrition"]

# Feature matrix: every remaining column.
X = df.drop(columns=["Attrition"])

In [None]:
# Model: a decision tree limited to depth 3, with a fixed seed for reproducibility.
clf = sk.tree.DecisionTreeClassifier(max_depth=3, random_state=42)

# Fit on the complete X, y (this first version keeps no hold-out data).
clf.fit(X, y)

# Draw the fitted tree. feature_names is passed as a plain list because
# scikit-learn 1.3.0 rejects a pandas Index for this parameter.
sk.tree.plot_tree(clf, feature_names=list(X.columns));

In [None]:
# Predict a label for every row of X and display the resulting array.
# NOTE(review): in this v1 section X is the same data the tree was fitted on.
y_pred = clf.predict(X=X)
y_pred

## 4. Entrenamiento de los modelos (v2)

In [146]:
# Target vector and feature matrix (every column except Attrition).
y = df["Attrition"]
X = df.drop(columns=["Attrition"])

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Shapes noted by the author when this cell was run:
#   X        (1470, 52)
#   X_train  (1176, 52)    y_train  (1176,)
#   X_test   (294, 52)     y_test   (294,)

In [160]:
# Model under evaluation: Gini-based decision tree, depth capped at 4, fixed seed.
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=42,
)

# Fit on the training split only; the test split stays unseen.
clf.fit(X=X_train, y=y_train)

# To inspect the fitted tree:
#   tree.plot_tree(clf, feature_names=list(X_train.columns));

In [161]:
# Predicted labels for the held-out rows; used by the metric cells that follow.
y_pred = clf.predict(X=X_test)

In [149]:
# Fraction of held-out rows whose predicted label matches the true one.
score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("accuracy --> ", score)

# Accuracies recorded by the author for other settings:
#   criterion='entropy', max_depth=2 --> 0.8537414965986394
#   criterion='entropy', max_depth=3 --> 0.8571428571428571
#   criterion='gini',    max_depth=3 --> 0.8673469387755102
# NOTE(review): the output saved with this cell (0.8571428571428571) equals the
# entropy/max_depth=3 entry, not the gini/max_depth=4 model defined above, so
# the saved output looks stale -- restart the kernel and run all cells.

accuracy -->  0.8571428571428571


In [155]:
# Confusion matrix on the held-out rows: rows are true labels, columns are predictions.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[246,   0],
       [ 42,   6]])

In [162]:
# Precision, recall and F1 for the positive class (Attrition == 1), printed in that order.
# NOTE(review): the confusion matrix saved earlier in this notebook
# ([[246, 0], [42, 6]]) would give precision 1.0 and recall 0.125, which does
# not match the numbers saved under this cell. The saved outputs come from
# different runs -- restart the kernel and run all cells.
for label, scorer in (
    ("precision --> ", metrics.precision_score),
    ("recall --> ", metrics.recall_score),
    ("f1 --> ", metrics.f1_score),
):
    score = scorer(y_test, y_pred)
    print(label, score)

precision -->  0.5833333333333334
recall -->  0.2916666666666667
f1 -->  0.38888888888888895


## 5. Validación cruzada

In [163]:
# Target vector and feature matrix (every column except Attrition).
y = df["Attrition"]
X = df.drop(columns=["Attrition"])

# Same 80/20 split and seed as in the previous section.
# NOTE(review): the cross-validation cell in this section scores on the full
# X, y, so the four splits created here are not used by it.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [174]:
# Unfitted estimator for cross-validation: Gini tree allowed to grow to depth 20.
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=20,
    random_state=42,
)

In [176]:
# 5-fold cross-validated F1 of clf over the full X, y (one score per fold).
scores = model_selection.cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring="f1",
    cv=5,
)

# Report the mean and the variance of the per-fold scores.
print("f1 --> ", scores.mean(), scores.var())

f1 -->  0.35432530809224216 0.0012817253923247572
