In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Colab-only: expose Google Drive under /content/drive so the dataset can be read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Capture and display the kernel's current working directory.
cwd = os.getcwd()
cwd

In [None]:
# Use an absolute path rooted at the Drive mount point ('/content/drive') so
# re-running this cell does not fail: a relative './drive/...' only resolves
# while the working directory is still the one the kernel started in.
os.chdir('/content/drive/MyDrive/DS')

In [None]:
# Load the HR attrition CSV from the current working directory and display it.
df = pd.read_csv('./WA_Fn-UseC_-HR-Employee-Attrition.csv')
df

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Horizontal bar chart of how many rows fall in each Attrition class.
attrition_counts = df['Attrition'].value_counts()
ax = attrition_counts.plot(kind='barh')
ax.set_title('Attrition rate')
ax.set_xlabel('Count')
plt.show()

In [None]:
# Share of rows per Attrition class (class count divided by total row count).
df['Attrition'].value_counts().div(len(df))

In [None]:
# Pie chart of the per-class share, labelled with whole-number percentages.
df['Attrition'].value_counts().div(len(df)).plot.pie(autopct='%1.0f%%', title='Attrition percentage')

In [None]:
# List the column labels.
df.columns

In [None]:
# Report the number of distinct (non-null) values in every column.
for column, n_unique in df.nunique().items():
    print(f'{column}: Number of unique values {n_unique}')
    print('===========================================================')

In [None]:
# Drop columns excluded from the analysis. Re-assigning (instead of
# inplace=True) and ignoring already-missing labels keeps the cell safe to
# re-run: the original raised KeyError on a second execution.
df = df.drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'],
    errors='ignore',
)

### Univariate Analysis


In [None]:
# Collect object-dtype columns with at most 30 distinct values, printing the
# levels and their frequencies for each one.
object_col = []
for column in df.columns:
    if df[column].dtype == object and len(df[column].unique()) <= 30:
        object_col.append(column)
        print(f'{column}:{df[column].unique()}')
        print(df[column].value_counts())
        print('================================')
# Exclude the target by filtering rather than list.remove(): remove() raises
# ValueError when 'Attrition' is absent, which happens if this cell is re-run
# after the target has been label-encoded to integers.
object_col = [column for column in object_col if column != 'Attrition']
object_col

In [None]:
# One count plot per categorical predictor, split by Attrition.
# Each plot gets its own explicitly sized figure; the original sized a single
# figure up front and then switched to plt.figure(i), so only the figure whose
# number happened to match received the requested size.
for predictor in object_col:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=predictor, hue='Attrition', ax=ax)
    ax.set_title(f'{predictor} by Attrition')
    plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Attrition labels with integer codes; `label` keeps the fitted
# encoder so the codes can be mapped back to the original classes.
label = LabelEncoder()
label.fit(df["Attrition"])
df["Attrition"] = label.transform(df["Attrition"])

In [None]:
# Number of categorical predictors collected above.
len(object_col)

In [None]:
# Collect non-object columns with fewer than 30 distinct values, printing the
# distinct values of each one.
num_col = []
for column in df.columns:
    if df[column].dtypes != object and df[column].nunique() < 30:
        print(f'{column}:{df[column].unique()}')
        num_col.append(column)
        print("====================================")
# Exclude the target by filtering rather than list.remove(): remove() raises
# ValueError if 'Attrition' was not collected (e.g. it is still object dtype
# because the label-encoding cell has not been run yet).
num_col = [column for column in num_col if column != 'Attrition']
num_col

In [None]:
# One density plot per low-cardinality numeric predictor, split by Attrition.
# Each plot gets its own explicitly sized figure instead of relying on
# plt.figure(i), which applied the requested size to one figure only.
for predictor in num_col:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.kdeplot(data=df, x=predictor, hue='Attrition', ax=ax)
    ax.set_title(f'{predictor} density by Attrition')
    plt.show()

### Bivariate Analysis


In [None]:
# Correlation heatmap of the numeric columns only. The frame still holds
# string-valued categorical columns at this point, and DataFrame.corr() on
# those raises in pandas >= 2.0, so restrict to numeric dtypes explicitly.
plt.figure(figsize=(30, 30))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, annot_kws={"size":15})

In [None]:
# Pairwise scatter/histogram grid. seaborn's pairplot builds its own figure,
# so the preceding plt.figure(...) call only produced an extra empty figure
# and has been removed.
sns.pairplot(df)

In [None]:
# One-hot encode every predictor with fewer than 20 distinct values into a
# SEPARATE frame. `df` is deliberately left untouched: the Encoding section
# further down still reads the raw categorical columns (BusinessTravel,
# Department, ...) from `df`, and overwriting it here made those lookups fail
# with KeyError.
dummy_col = [
    column
    for column in df.columns
    if column != 'Attrition' and df[column].nunique() < 20
]
df_dummies = pd.get_dummies(df, columns=dummy_col, drop_first=True, dtype='uint8')
df_dummies.info()

In [None]:
# Remove duplicate columns. The original referenced an undefined name (`data`)
# and would have left the frame transposed; here the transpose is used only to
# build a boolean mask, so `df` keeps its orientation and column dtypes.
df = df.loc[:, ~df.T.duplicated()]

# Remove Duplicate Rows
df = df.drop_duplicates()
print(df.shape)

### Encoding

In [None]:
# Append a copy of Age as the last column under the name Age_re, then drop the
# original Age column.
df = df.assign(Age_re=df['Age']).drop(columns='Age')

In [None]:
# One-hot encode each categorical column (dropping the first level of each)
# and append the indicator columns to `df` in the same order as before.
# A single pass with one concat replaces seven copy-pasted
# get_dummies/concat pairs.
categorical_cols = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]
dummy_frames = [pd.get_dummies(df[col], drop_first=True) for col in categorical_cols]
df = pd.concat([df] + dummy_frames, axis=1)

In [None]:
# Remove the raw categorical columns now that their indicator columns exist.
df.drop(
    columns=[
        'BusinessTravel',
        'Department',
        'EducationField',
        'Gender',
        'JobRole',
        'MaritalStatus',
        'OverTime',
    ],
    inplace=True,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode only the columns that are not already numeric.
# The original test `dtype == np.number` is not a valid dtype check (it never
# matches integer columns), so numeric features such as incomes and rates were
# replaced by the rank of their value. Boolean indicator columns are still
# encoded so they become 0/1 integers.
for column in df.columns:
    is_numeric = pd.api.types.is_numeric_dtype(df[column])
    is_bool = pd.api.types.is_bool_dtype(df[column])
    if is_numeric and not is_bool:
        continue
    df[column] = LabelEncoder().fit_transform(df[column])

### Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column (including the 0/1 target) to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows before the train/test split
# below, so test-set minima/maxima influence the training features.
scaler = MinMaxScaler(feature_range=(0, 1))
# Carry the source column labels over to the scaled frame; wrapping the bare
# array without them left integer column names behind.
df_scaled = pd.DataFrame(scaler.fit_transform(df.to_numpy()), columns=df.columns)

print("Scaled Dataset Using MinMaxScaler")
df_scaled

In [None]:
# Label the scaled frame with the columns of the frame it was computed from.
# A hand-maintained list of 45 names raises on any change in column count and
# silently mislabels features on any change in column order.
df_scaled.columns = df.columns
assert 'Attrition' in df_scaled.columns, "target column missing after scaling"
df_scaled

### Specify Predictor/Target Variables

In [None]:
# Target is the Attrition column; predictors are every other column.
y = df_scaled['Attrition']
X = df_scaled.drop(columns=['Attrition'])

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Hold out 25% of the rows for testing. Stratify on the target so the held-out
# set keeps the same class proportions as the full data; the classes are
# imbalanced, and an unstratified split lets the minority share drift.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
from imblearn.combine import SMOTEENN
# Rebalance the TRAINING split only (the test split is left as-is).
# A fixed seed makes the resampled set, and every score computed downstream,
# reproducible across runs.
sm = SMOTEENN(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score

def compare(model, X_train, X_test, y_train, y_test):
    """Print confusion matrix, accuracy and classification report for both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature matrices passed to ``model.predict``.
    y_train, y_test : true labels for the corresponding feature matrix.

    Returns
    -------
    None. All results are written to stdout, training split first.
    """
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    splits = (
        ("TRAINING", y_train, y_train_pred),
        ("TESTING", y_test, y_test_pred),
    )
    for heading, y_true, y_pred in splits:
        print(f"{heading} RESULTS: \n===============================")
        # output_dict=True yields a nested dict, rendered here as a table.
        clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
        print(f"CONFUSION MATRIX:\n{confusion_matrix(y_true, y_pred)}")
        print(f"ACCURACY SCORE:\n{accuracy_score(y_true, y_pred):.4f}")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")

In [None]:
from sklearn.metrics import precision_recall_curve, roc_curve

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as dashed lines against ``thresholds``.

    Both ``precisions`` and ``recalls`` are plotted without their final
    element. Draws on the current matplotlib axes and returns nothing.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g--", "Recall"),
    )
    for values, fmt, name in curves:
        plt.plot(thresholds, values[:-1], fmt, label=name)
    plt.title("Precision/Recall Tradeoff")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    

def plot_roc_curve(fpr, tpr, label=None):
    """Plot ``tpr`` against ``fpr`` with a dashed diagonal for reference.

    Draws on the current matplotlib axes, fixes both axes to [0, 1], and
    returns nothing. ``label`` is forwarded to the curve's legend entry.
    """
    unit_span = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(unit_span, unit_span, "k--")
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    


### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 100 trees, each grown on the full training set
# (bootstrap=False). Seeded like the tuned model below so the baseline scores
# are reproducible.
rf_clf = RandomForestClassifier(n_estimators=100, bootstrap=False, random_state=42)
rf_clf.fit(X_train, y_train)
compare(rf_clf, X_train, X_test, y_train, y_test)

### RandomForest Hyperparameter Tuning

In [None]:
# Exhaustive search over random-forest hyperparameters, scored by ROC AUC with
# 5-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 500],
    # 'auto' was only an alias of 'sqrt' for classifiers (so the original grid
    # evaluated the same setting twice) and is rejected by current
    # scikit-learn releases; search two distinct options instead.
    max_features=['sqrt', 'log2'],
    max_depth=[2, 3, 5, 10, 15, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

rf_clf = RandomForestClassifier(random_state=42)
search = GridSearchCV(rf_clf, param_grid=param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
search.fit(X_train, y_train)

# With the default refit=True the search already refits the best configuration
# on the whole training split, so reuse that estimator instead of fitting again.
rf_clf = search.best_estimator_
compare(rf_clf, X_train, X_test, y_train, y_test)

### RandomForest ROC curve plot

In [None]:
# Score the test set with the positive-class probability. Feeding hard 0/1
# predictions (as before) leaves a single usable threshold, so both the
# precision/recall and ROC curves degenerate to one operating point.
rf_scores = rf_clf.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, rf_scores)
plt.figure(figsize=(14, 25))
plt.subplot(4, 2, 1)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

plt.subplot(4, 2, 2)
plt.plot(precisions, recalls)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.title("PR Curve: precisions/recalls tradeoff");

plt.subplot(4, 2, 3)
fpr, tpr, thresholds = roc_curve(y_test, rf_scores)
plot_roc_curve(fpr, tpr)

In [None]:
# ROC AUC is a ranking metric: compute it from positive-class probabilities,
# not from thresholded predict() labels, which discard the ranking.
scores_dict = {
    'Train': roc_auc_score(y_train, rf_clf.predict_proba(X_train)[:, 1]),
    'Test': roc_auc_score(y_test, rf_clf.predict_proba(X_test)[:, 1]),
}

In [None]:
# Display the train/test ROC AUC computed in the previous cell.
scores_dict

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default hyperparameters, seeded so that the
# reported scores are reproducible across runs.
dt_clf = DecisionTreeClassifier(random_state=42)

dt_clf.fit(X_train, y_train)
compare(dt_clf, X_train, X_test, y_train, y_test)

### DecisionTree ROC curve plot

In [None]:
# Score the test set with the positive-class probability rather than hard 0/1
# predictions, so the curves are traced over the model's actual score values.
dt_scores = dt_clf.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, dt_scores)
plt.figure(figsize=(14, 25))
plt.subplot(4, 2, 1)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

plt.subplot(4, 2, 2)
plt.plot(precisions, recalls)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.title("PR Curve: precisions/recalls tradeoff");

plt.subplot(4, 2, 3)
fpr, tpr, thresholds = roc_curve(y_test, dt_scores)
plot_roc_curve(fpr, tpr)

In [None]:
# ROC AUC is a ranking metric: compute it from positive-class probabilities,
# not from thresholded predict() labels.
scores_dict = {
    'Train': roc_auc_score(y_train, dt_clf.predict_proba(X_train)[:, 1]),
    'Test': roc_auc_score(y_test, dt_clf.predict_proba(X_test)[:, 1]),
}
scores_dict

### XGBoost

In [None]:
from xgboost import XGBClassifier


# Baseline gradient-boosted model with library defaults; fit() returns the
# estimator itself, so construction and training are chained.
xgb_clf = XGBClassifier().fit(X_train, y_train)

compare(xgb_clf, X_train, X_test, y_train, y_test)

### XGB Hyperparameter Tuning

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV



# range(50, 200, 500) produced the single value [50] because the step exceeded
# the span, so the "search" compared nothing. Step by 50 to evaluate
# 50, 100 and 150 boosting rounds.
param_grid = param_test = {'n_estimators': list(range(50, 200, 50))}

xgb_clf = XGBClassifier(random_state=42)
search = GridSearchCV(xgb_clf, param_grid=param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
search.fit(X_train, y_train)

# With the default refit=True the search already refits the best configuration
# on the whole training split, so reuse that estimator instead of fitting again.
xgb_clf = search.best_estimator_
compare(xgb_clf, X_train, X_test, y_train, y_test)

### XGBoost ROC curve plot

In [None]:
# Score the test set with the positive-class probability rather than hard 0/1
# predictions, so the curves are traced over the model's actual score values.
xgb_scores = xgb_clf.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, xgb_scores)
plt.figure(figsize=(14, 25))
plt.subplot(4, 2, 1)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

plt.subplot(4, 2, 2)
plt.plot(precisions, recalls)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.title("PR Curve: precisions/recalls tradeoff");

plt.subplot(4, 2, 3)
fpr, tpr, thresholds = roc_curve(y_test, xgb_scores)
plot_roc_curve(fpr, tpr)

In [None]:
# ROC AUC is a ranking metric: compute it from positive-class probabilities,
# not from thresholded predict() labels.
scores_dict = {
    'Train': roc_auc_score(y_train, xgb_clf.predict_proba(X_train)[:, 1]),
    'Test': roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1]),
}

In [None]:
# Display the train/test ROC AUC computed in the previous cell.
scores_dict

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Baseline AdaBoost with default hyperparameters, seeded like the tuned model
# below so the baseline scores are reproducible.
ab_clf = AdaBoostClassifier(random_state=42)
ab_clf.fit(X_train, y_train)

compare(ab_clf, X_train, X_test, y_train, y_test)

### AdaBoost Hyperparameter tuning

In [None]:
# Search over ensemble size and learning rate, scored by ROC AUC with 5-fold
# cross-validation on the training split.
# The `algorithm` axis was removed from the grid: 'SAMME.R' is rejected by
# current scikit-learn releases and the parameter itself is deprecated, so the
# library default is used instead.
param_grid = dict(
    n_estimators=[10, 50, 100, 500],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1],
)

ab_clf = AdaBoostClassifier(random_state=42)
search = GridSearchCV(ab_clf, param_grid=param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
search.fit(X_train, y_train)

# With the default refit=True the search already refits the best configuration
# on the whole training split, so reuse that estimator instead of fitting again.
ab_clf = search.best_estimator_
compare(ab_clf, X_train, X_test, y_train, y_test)

### AdaBoost ROC curve plot

In [None]:
# Score the test set with the positive-class probability rather than hard 0/1
# predictions, so the curves are traced over the model's actual score values.
ab_scores = ab_clf.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, ab_scores)
plt.figure(figsize=(14, 25))
plt.subplot(4, 2, 1)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

plt.subplot(4, 2, 2)
plt.plot(precisions, recalls)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.title("PR Curve: precisions/recalls tradeoff");

plt.subplot(4, 2, 3)
fpr, tpr, thresholds = roc_curve(y_test, ab_scores)
plot_roc_curve(fpr, tpr)

In [None]:
# ROC AUC is a ranking metric: compute it from positive-class probabilities,
# not from thresholded predict() labels.
scores_dict = {
    'Train': roc_auc_score(y_train, ab_clf.predict_proba(X_train)[:, 1]),
    'Test': roc_auc_score(y_test, ab_clf.predict_proba(X_test)[:, 1]),
}

In [None]:
# Display the train/test ROC AUC computed in the previous cell.
scores_dict

In [None]:
from sklearn import metrics
# Overlay the test-set ROC curves of all four fitted classifiers on one axes.
plt.rcParams['figure.figsize'] = [10,10]
classifiers = [rf_clf,dt_clf,xgb_clf,ab_clf]
ax = plt.gca()
for clf in classifiers:
    # metrics.plot_roc_curve was removed from scikit-learn;
    # RocCurveDisplay.from_estimator is its replacement and takes the same
    # (estimator, X, y, ax=...) arguments.
    metrics.RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)