#### styles

In [None]:
%%html
<style>

    h1{
        font-weight: 700 !important;
    }
    .jp-MarkdownOutput h1,.jp-MarkdownOutput h2,.jp-MarkdownOutput h3,.jp-MarkdownOutput h4,.jp-MarkdownOutput h5,.jp-MarkdownOutput h6 {
        font-weight: 800 !important;
        color: #2563eb;
        width: max-content;
        border-bottom: 4px solid #38bdf8;
    }
</style>

# IMPORTING LIBRARIES

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the dataset from the CSV file next to the notebook and display its (rows, columns) shape.
df = pd.read_csv('./dataset.csv')
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

# EDA

In [None]:
# Correlation heatmap of the numeric features (EmployeeCount and StandardHours excluded).
fig, ax = plt.subplots(figsize=(20, 10))
new_df = df.drop(['EmployeeCount', 'StandardHours'], axis=1)
# new_df still contains string columns; DataFrame.corr() raises on those in
# pandas >= 2.0, so compute the correlation on the numeric columns only.
numeric_corr = new_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, ax=ax, annot=True, cmap='GnBu')

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary (count / unique / top / freq) of the string-typed columns only.
df.describe(include='object')

`Most of the values in attrition column are No` <br>
`All the employees are Over 18`

In [None]:
df.isna().sum().sum() # total number of missing values across all columns

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum() 

In [None]:
# Only the last expression of a cell is displayed, so the mean has to be
# printed explicitly; otherwise its value is silently discarded.
print(f"Mean age: {df['Age'].mean():.2f}")
df['Age'].plot.hist(edgecolor='black', color='#60a5fa')

`More employees are between age 30-40`

`Average Age is around 37`

In [None]:
# Share of each Attrition value; the second wedge is offset from the centre.
attrition_counts = df['Attrition'].value_counts()
attrition_counts.plot(
    kind='pie',
    explode=[0, 0.1],
    wedgeprops={'edgecolor': 'black', 'linestyle':'dashed'},
    autopct="%1.2f%%",
    cmap='GnBu',
)

`Data is imbalanced`

In [None]:
# Share of each Gender value; the second wedge is offset from the centre.
gender_counts = df['Gender'].value_counts()
gender_counts.plot(
    kind='pie',
    explode=[0, 0.1],
    wedgeprops={'edgecolor': 'black', 'linestyle':'dashed'},
    figsize=(16, 6),
    autopct="%1.2f%%",
    cmap='GnBu',
)
plt.legend()
plt.show()

In [None]:
# Share of each MaritalStatus value.
marital_status_counts = df['MaritalStatus'].value_counts()
marital_status_counts.plot(
    kind='pie',
    wedgeprops={'edgecolor': 'black', 'linestyle':'dashed'},
    figsize=(16, 6),
    autopct="%1.2f%%",
    cmap='GnBu',
)
plt.legend()
plt.show()

In [None]:
# Share of each EducationField value.
education_field_counts = df['EducationField'].value_counts()
education_field_counts.plot(
    kind='pie',
    wedgeprops={'edgecolor': 'black', 'linestyle':'dashed'},
    figsize=(16, 6),
    autopct="%1.2f%%",
    cmap='GnBu',
)
plt.legend()
plt.show()

In [None]:
# Share of each Department value.
department_counts = df['Department'].value_counts()
department_counts.plot(
    kind='pie',
    wedgeprops={'edgecolor': 'black', 'linestyle':'dashed'},
    figsize=(16, 6),
    autopct="%1.2f%%",
    cmap='GnBu',
)
plt.legend()
plt.show()

## **How attrition is related to other columns**

In [None]:
# Attrition counts per BusinessTravel category, rows ordered by the 'No' count.
business_travel = (
    df.groupby(['BusinessTravel', 'Attrition'])
    .size()
    .unstack()
    .sort_values(by='No', ascending=True)
)
business_travel.plot(kind='bar', edgecolor='black', figsize=(16, 6), cmap='GnBu')
plt.show()

`Most employees who travel rarely don't leave the company. From the plot we can tell, sending employees on business travels or not doesn't really make much of a difference and doesn't have a significant effect on attrition.`

In [None]:
# Human-readable labels for the 1-4 EnvironmentSatisfaction codes.
mapping_env_satisfaction = dict(
    enumerate(['low', 'medium', 'high', 'very high'], start=1)
)

environment_satisfaction = df['EnvironmentSatisfaction']

# Adds a labelled copy of the column to df; the numeric column is left untouched.
df['EnvironmentSatisfaction_category'] = environment_satisfaction.replace(
    mapping_env_satisfaction
)

In [None]:
# Attrition counts per environment-satisfaction label, rows ordered by the 'No' count.
environment_satisfaction = (
    df.groupby(['EnvironmentSatisfaction_category', 'Attrition'])
    .size()
    .unstack()
    .sort_values(by='No', ascending=True)
)
environment_satisfaction.plot(
    kind='bar', edgecolor='black', figsize=(16, 6), ylim=(0, 800), cmap='GnBu'
)
plt.title('Environment Satisfaction')
plt.show()

`All the employees seem happy with their environment`

In [None]:
# Table of employee counts: one row per Department, one column per Attrition value.
df.groupby(['Department', 'Attrition']).size().unstack()

In [None]:
# Attrition counts per Department, rows ordered by the 'No' count.
department = (
    df.groupby(['Department', 'Attrition'])
    .size()
    .unstack()
    .sort_values(by='No', ascending=True)
)
department.plot(kind='bar', edgecolor='black', figsize=(16, 6), cmap='GnBu')
plt.title('Department')
plt.show()

`More employees from the Research & Development Department are leaving.`

`But if we differentiate on the percentage then Human Resources Department has most attrition`

In [None]:
# Attrition counts per Education level, rows ordered by the 'No' count.
education = (
    df.groupby(['Education', 'Attrition'])
    .size()
    .unstack()
    .sort_values(by='No', ascending=True)
)
education.plot(kind='bar', edgecolor='black', figsize=(16, 6), cmap='GnBu')
plt.title('Education')
plt.show()

`Education doesn't affect attrition`

In [None]:
# Employee counts: one row per EducationField, one column per Attrition value.
education_field = df.groupby(['EducationField', 'Attrition']).size().unstack()
education_field

In [None]:
# Order the rows by the 'No' count before plotting the grouped bars.
education_field = education_field.sort_values('No')
education_field.plot(kind='bar', edgecolor='black', figsize=(16, 6), cmap='GnBu')
plt.title('Education Field')
plt.show()

`Employees who have Human Resources as their education background have a 35% chance of leaving the organization`

`and the employee from Medical, Life Sciences or Other have less attrition.`

In [None]:
# Employee counts: one row per JobRole, one column per Attrition value.
jobrole_counts = df.groupby(['JobRole', 'Attrition']).size()
jobrole = jobrole_counts.unstack()
jobrole

In [None]:
# Order the rows by the 'No' count before plotting the grouped bars.
jobrole = jobrole.sort_values('No')
jobrole.plot(kind='bar', edgecolor='black', figsize=(16, 6), cmap='GnBu')
plt.title('Job Role')
plt.show()

`Employees who work as a Research Director have very little or no attrition.`

`and Laboratory Technicians have the highest attrition.`

`If we consider percentage wise then Sales Representative Job has the highest attrition followed by Human Resources Job.`

In [None]:
# Employee counts: one row per OverTime value, one column per Attrition value.
overtime_counts = df.groupby(['OverTime', 'Attrition']).size()
overtime = overtime_counts.unstack()
overtime

In [None]:
# Order the rows by the 'No' count before plotting the grouped bars.
overtime = overtime.sort_values('No')
overtime.plot(kind='bar', edgecolor='black', figsize=(16, 6), cmap='GnBu')
plt.title('Over Time')
plt.show()

`There is 44% chance of attrition if the employee works over time.`

`It's better to not ask employee to work over time.`

In [None]:
# for i in df.columns:
#     col = df.groupby([i, 'Attrition']).size()
#     col = col.unstack(fill_value=0).sort_values(by='No', ascending=True)
#     col.plot.bar(edgecolor='black', figsize=(16, 6))
#     plt.title(i)
#     plt.show()

# Model Building

We are using the following algorithms for model building
- [x] Random Forest Classifier
- [x] SVM 
- [x] Logistic Regression
- [x] AdaBoost
- [x] XgBoost
- [ ] Neural Networks

## Columns to drop
- EnvironmentSatisfaction_category:- We added this column for EDA
- Over18: All the employees are over 18 so doesn't matter whether to include it or not.
- StandardHours:- StandardHours doesn't really affect the attrition 
- EmployeeNumber:- It's a unique identifier for an employee 
- EmployeeCount:- EmployeeCount doesn't affect the attrition <br/>
**Neural Network DNN** <br />
**Feature Selection** <br />
**Pre Processing** <br />

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, mean_squared_error, r2_score, roc_curve
from sklearn.model_selection import GridSearchCV

In [None]:
# Shared encoder instance; the following cells call fit_transform on it, which refits it on every call.
label_encoder = LabelEncoder()

In [None]:
# Feature frame: drop the target and the columns listed under "Columns to drop".
model_df = df.drop(['Attrition', 'EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'], axis = 1)
# The EDA-only label column duplicates EnvironmentSatisfaction and must not
# leak into the features. errors='ignore' keeps this cell runnable even if
# the EDA cell that creates the column was not executed.
model_df = model_df.drop(columns=['EnvironmentSatisfaction_category'], errors='ignore')
# Target vector: Attrition labels encoded as integers.
Y = label_encoder.fit_transform(df.loc[:, 'Attrition'])

In [None]:
# Integer-encode every string column with its own encoder. Reusing the shared
# `label_encoder` here would refit it once per column and overwrite the fit
# that produced Y, so its classes_ would no longer describe the Attrition labels.
x_categorical = model_df.select_dtypes(include=['object']).apply(
    lambda column: LabelEncoder().fit_transform(column)
)
x_numerical = model_df.select_dtypes(exclude=['object'])
# Final feature matrix: numeric columns followed by the encoded categorical ones.
x = pd.concat([x_numerical, x_categorical], axis=1)

### **Train Test Split**

In [None]:
# `X_preprocessed` is never defined in this notebook (NameError on a fresh
# kernel); the encoded feature matrix built above is `x`.
# random_state makes the split repeatable; stratify keeps the imbalanced
# Attrition ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, Y, test_size=0.3, random_state=0, stratify=Y
)

In [None]:
def plot_confusion_matrix(y_train, y_train_pred, y_test, y_test_pred):
    """Draw the training and testing confusion matrices side by side.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the training / testing split.
    y_train_pred, y_test_pred : array-like
        Predicted labels for the corresponding split.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
    sns.heatmap(confusion_matrix(y_train, y_train_pred), annot=True, ax=ax1, cmap='GnBu', fmt='g')
    sns.heatmap(confusion_matrix(y_test, y_test_pred), annot=True, ax=ax2, cmap='GnBu', fmt='g')

    ax1.set_title('Training Confusion Matrix')
    ax2.set_title('Testing Confusion Matrix')

    # plt.xlabel / plt.ylabel only touch the current (last) axes, which left
    # the training matrix unlabeled; label both panels explicitly.
    for ax in (ax1, ax2):
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
    plt.show()

In [None]:
def get_accuracy_score(model, X_train, X_test, y_train, y_test):
    """Print a classification report and accuracy for both splits, then plot the confusion matrices.

    ``model`` must already be fitted and expose ``predict``. Nothing is returned.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    rule = "=" * 25
    sections = (
        ("Training Accuracy", y_train, y_train_pred),
        ("Testing Accuracy", y_test, y_test_pred),
    )
    for heading, y_true, y_pred in sections:
        print(rule + heading + rule)
        report = classification_report(y_true, y_pred, output_dict=True)
        print(pd.DataFrame(report))
        print(f"ACCURACY SCORE: {accuracy_score(y_true, y_pred):.4f}")
    print(rule * 2)

    plot_confusion_matrix(y_train, y_train_pred, y_test, y_test_pred)

## Random Forest Classifier

In [None]:
# Baseline random forest: 10 trees, fixed seed, out-of-bag scoring enabled.
random_clf = RandomForestClassifier(n_estimators=10, random_state=0, oob_score=True)
random_clf.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the baseline random forest.
get_accuracy_score(random_clf, X_train, X_test, y_train, y_test)

### Hyperparameter Tuning

In [None]:
# Candidate values for the random-forest grid search.
n_estimators = [20, 60, 100, 120]
max_features = [0.2, 0.6, 1.0]
max_depth = [2, 8, None] # None: no depth limit
max_samples = [0.5, 0.75, 1.0]
# `None` is not a valid value for bootstrap / oob_score, and bootstrap=False is
# rejected whenever max_samples is set (which it always is in this grid), so
# those candidates only ever produced failed fits. Keep the valid ones.
bootstrap = [True]
oob = [True, False]

In [None]:
# Grid handed to GridSearchCV: one entry per RandomForestClassifier argument.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
    bootstrap=bootstrap,
    oob_score=oob,
)

In [None]:
# 5-fold grid search over the random-forest grid, using all available cores.
# The estimator is seeded so that repeated runs rank the candidates identically.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split only.
rf_grid.fit(X_train, y_train)

In [None]:
# Refit a forest with the best parameters found by the search.
# random_state is fixed so the reported metrics are reproducible across runs.
rf = RandomForestClassifier(**rf_grid.best_params_, random_state=0)
rf.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the tuned random forest.
get_accuracy_score(rf, X_train, X_test, y_train, y_test)

## LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline logistic regression with default settings and a fixed seed.
lr = LogisticRegression(random_state=0)

In [None]:
# NOTE(review): no feature scaling is applied in the visible cells; the default solver may warn about convergence — confirm.
lr.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the baseline logistic regression.
get_accuracy_score(lr, X_train, X_test, y_train, y_test)

### Hyperparameter tuning

In [None]:
# Each penalty is paired with a solver that supports it. With the default
# lbfgs solver the 'l1' and 'elasticnet' candidates always failed to fit, and
# fit_intercept=None is not a valid value, so it is removed.
# elasticnet additionally requires an l1_ratio.
param_grid = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'fit_intercept': [True, False]},
    {'penalty': ['l1'], 'solver': ['liblinear'], 'fit_intercept': [True, False]},
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.5],
        'fit_intercept': [True, False],
    },
]

In [None]:
# Fresh, unfitted estimator for the search; this rebinds `lr`, replacing the baseline model.
lr = LogisticRegression()
# 5-fold grid search over param_grid, using all available cores.
lr_grid = GridSearchCV(estimator = lr, param_grid=param_grid, cv = 5, n_jobs=-1)

In [None]:
# Run the grid search on the training split only.
lr_grid.fit(X_train, y_train)

In [None]:
# Refit a logistic regression with the best parameters found by the search.
lr = LogisticRegression(**lr_grid.best_params_)
lr.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the tuned logistic regression.
get_accuracy_score(lr, X_train, X_test, y_train, y_test)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Baseline AdaBoost: 100 estimators, SAMME algorithm, fixed seed.
ada_clf = AdaBoostClassifier(n_estimators=100, algorithm='SAMME', random_state=0)
ada_clf.fit(X_train, y_train)

In [None]:
# Predictions of the baseline AdaBoost model on both splits.
# NOTE(review): no later cell visible here reads these two variables; get_accuracy_score computes its own predictions.
y_train_pred = ada_clf.predict(X_train)
y_test_pred = ada_clf.predict(X_test)

In [None]:
# Report train/test metrics for the baseline AdaBoost model.
get_accuracy_score(ada_clf, X_train, X_test, y_train, y_test)

### Hyperparameter tuning

In [None]:
# Search space for AdaBoost: learning rate and number of estimators.
param_grid = dict(
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
    n_estimators=[10, 50, 100, 500],
)

In [None]:
# Fresh, unfitted estimator for the search; this rebinds `ada_clf`, replacing the baseline model.
ada_clf = AdaBoostClassifier()
# 5-fold grid search over param_grid, using all available cores.
ada_grid = GridSearchCV(estimator=ada_clf, param_grid=param_grid, cv=5, n_jobs=-1)

In [None]:
# Run the grid search on the training split only.
ada_grid.fit(X_train, y_train)

In [None]:
# Refit AdaBoost with the best parameters found by the search.
ada_clf = AdaBoostClassifier(**ada_grid.best_params_)
ada_clf.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the tuned AdaBoost model.
get_accuracy_score(ada_clf, X_train, X_test, y_train, y_test)

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# RBF-kernel support vector classifier with a fixed seed.
# NOTE(review): features are not scaled in the visible cells; SVMs are usually scale-sensitive — consider adding a scaler.
svc_clf = SVC(kernel='rbf', random_state=42)

In [None]:
# Fit the SVM on the training split.
svc_clf.fit(X_train, y_train)

In [None]:
# Report train/test metrics for the SVM.
get_accuracy_score(svc_clf, X_train, X_test, y_train, y_test)

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost classifier with the binary logistic objective; all other settings are defaults.
xgb_clf = xgb.XGBClassifier(objective='binary:logistic')

In [None]:
# Fit XGBoost on the training split.
xgb_clf.fit(X_train, y_train)

In [None]:
# Report train/test metrics for XGBoost.
get_accuracy_score(xgb_clf, X_train, X_test, y_train, y_test)

## Comparing Each Algorithm Using ROC

In [None]:
# ROC curve and AUC of every fitted model on the held-out test split.
models = {'XGBoost':xgb_clf, 'AdaBoost':ada_clf, 'Logistic Regessor': lr, 'Random Forest': rf, 'Support Vector Machine': svc_clf}

fig, ax = plt.subplots()
for k, v in models.items():
    # A model without predict_proba (e.g. SVC built without probability=True)
    # is scored with its decision_function instead. The explicit capability
    # check replaces a bare `except:` that would also hide genuine errors
    # such as a shape mismatch or an unfitted model.
    if hasattr(v, 'predict_proba'):
        y_pred_prob = v.predict_proba(X_test)[:, 1]
    else:
        y_pred_prob = v.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    auc_score = roc_auc_score(y_test, y_pred_prob)
    ax.plot(fpr, tpr, label=f'{k} (AUC = {auc_score:.3f})')

# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curves on the test set')
ax.legend()
fig.savefig('comparision.png')

# Work In Progress

In [None]:
# Encode the Attrition labels as integers and overwrite the column in df.
# NOTE(review): this mutates df in place; the earlier EDA cells that sort by the 'No' column would presumably fail if re-run afterwards without reloading df — confirm.
label_encoder = LabelEncoder()
df["Attrition"] = label_encoder.fit_transform(df.Attrition)

In [None]:
# Columns (other than the target) with fewer than 20 distinct values are one-hot encoded.
feature_cardinality = df.drop(columns='Attrition').nunique()
dummy_col = feature_cardinality[feature_cardinality < 20].index.tolist()
data = pd.get_dummies(df, columns=dummy_col, drop_first=True, dtype='uint8')

In [None]:
# Remove duplicated columns (by de-duplicating the transposed frame), then duplicated rows.
data = data.T.drop_duplicates().T
data = data.drop_duplicates()

# Correlation of every remaining feature with the encoded Attrition target, sorted.
target_correlation = data.drop(columns='Attrition').corrwith(data['Attrition'])
target_correlation.sort_values().plot(kind='barh', figsize=(10, 30))

In [None]:
# Keep the features whose absolute correlation with Attrition exceeds 0.02.
features_selection = (
    data.drop(columns='Attrition')
    .corrwith(data['Attrition'])
    .sort_values()
)
# Note: this rebinds model_df to an Index of the selected column names.
model_df = features_selection[features_selection.abs() > 0.02].index
len(model_df)

In [None]:
# Split df by dtype, integer-encode the string columns and join the two parts side by side.
object_columns = df.select_dtypes(include=['object'])
x_categorical = object_columns.apply(label_encoder.fit_transform)
x_numerical = df.select_dtypes(exclude=['object'])
new_df_ = pd.concat([x_numerical, x_categorical], axis=1)

In [None]:
# One box plot per numeric column, laid out three to a row.
num_cols = len(x_numerical.columns)
num_rows = (num_cols + 2) // 3
num_cols_per_row = min(3, num_cols)
# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# row or column; without it plt.subplots returns a 1-D array (or a bare Axes)
# and 2-D indexing fails.
fig, axes = plt.subplots(
    nrows=num_rows,
    ncols=num_cols_per_row,
    figsize=(15, (num_rows * 5)),
    squeeze=False,
)
# Row-major flattening gives the same left-to-right, top-to-bottom order as nested loops.
flat_axes = axes.ravel()
for ax, col_name in zip(flat_axes, x_numerical.columns):
    x_numerical[col_name].plot.box(ax=ax)
    ax.set_title(col_name)
# Hide the unused panels of the last row.
for ax in flat_axes[num_cols:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
1470