In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split , GridSearchCV ,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler , OneHotEncoder

In [2]:
# Load the raw table from the CSV that sits next to the notebook.
d_f = pd.read_csv(filepath_or_buffer='Book-01.csv')

In [3]:
# List the integer-typed columns; these are the candidates for model features.
d_f.select_dtypes(include=int).columns

Index(['Employee Number', 'Training Times Last Year', 'Age',
       'CF_attrition count', 'CF_attrition rate', 'CF_current Employee',
       'Daily Rate', 'Distance From Home', 'Environment Satisfaction',
       'Hourly Rate', 'Job Involvement', 'Job Level', 'Job Satisfaction',
       'Monthly Income', 'Monthly Rate', 'Num Companies Worked',
       'Percent Salary Hike', 'Performance Rating',
       'Relationship Satisfaction', 'Standard Hours', 'Stock Option Level',
       'Total Working Years', 'Work Life Balance', 'Years At Company',
       'Years In Current Role', 'Years Since Last Promotion',
       'Years With Curr Manager'],
      dtype='object')

In [4]:
# Feature columns used as model inputs; the target is the attrition count column.
feature_columns = [
    'Age',
    'Distance From Home',
    'Job Level',
    'Num Companies Worked',
    'Total Working Years',
    'Performance Rating',
]
X = d_f[feature_columns]
y = d_f['CF_attrition count']

In [5]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,Age,Distance From Home,Job Level,Num Companies Worked,Total Working Years,Performance Rating
0,41,1,2,8,8,3
1,49,8,2,1,10,4
2,37,2,1,6,7,3
3,33,3,1,1,8,3
4,27,2,1,9,6,3


In [6]:
# Hold out 25% of the rows for final testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [7]:
# Candidate classifiers, each paired with the label used when reporting it.
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)
models = [
    ('R-ForestClassifier', random_forest),
    ('G-BoostClassifier', gradient_boosting),
]

In [8]:
# Running "best so far" state: no model chosen yet, accuracy floor of zero.
best_model, best_accuracy = None, 0.0

In [9]:
# Evaluate every candidate inside the loop.
#
# The loop used to only rebuild `pipeline`, so any cross-validation, fitting
# and scoring done after it saw just the last entry of `models`; earlier
# candidates were never scored and no real comparison took place.
#
# The best-so-far state is reset here so that re-running this cell (for
# example after editing `models`) cannot keep a stale winner.
best_model, best_accuracy = None, 0.0
model_results = {}

for name, model in models:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        # NOTE(review): all selected features are integer columns, so this
        # one-hot encodes every distinct value (e.g. each Age) and values not
        # seen during fit become all-zero rows. Confirm this is intended.
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # Cross-validated accuracy on the training split only (cv=5).
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    mean_accuracy = scores.mean()

    # Fit on the full training split, then score once on the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    model_results[name] = {
        'CV Accuracy': mean_accuracy,
        'Test accuracy': accuracy,
    }

    # Selection uses held-out test accuracy, the same criterion as the
    # `accuracy > best_accuracy` comparison used elsewhere in this notebook.
    # NOTE(review): choosing on the test split biases the reported test
    # score; consider selecting on `mean_accuracy` instead.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

# One row per candidate model, shown as the cell output.
pd.DataFrame(model_results).T

In [10]:
# Per-fold scores for the current pipeline, computed on the training split only.
scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=5)

In [11]:
# Average the per-fold scores into a single cross-validation figure.
mean_accuracy = np.mean(scores)

In [12]:
# Fit the current pipeline on the whole training split.
pipeline.fit(X=X_train, y=y_train)

In [13]:
# Predict labels for the held-out rows with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [14]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [15]:
# Report the current model's label with its cross-validation and test accuracy.
for label, value in (
    ('Model', name),
    ('CV Accuracy', mean_accuracy),
    ('Test accuracy', accuracy),
):
    print(label, value)

Model G-BoostClassifier
CV Accuracy 0.827573015220074
Test accuracy 0.8478260869565217


In [16]:
# Promote the most recently scored pipeline when it beats the running best.
if best_accuracy < accuracy:
    best_model, best_accuracy = pipeline, accuracy
print('Best model', best_model)

Best model Pipeline(steps=[('imputer', SimpleImputer()),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', GradientBoostingClassifier(random_state=42))])


In [21]:
# Search grid for the pipeline's final step; the `model__` prefix routes each
# key to the step named 'model'.
H_Params = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[10, 20, None],
)

In [22]:
# Tune the pipeline that was selected as best, not whichever pipeline was
# built last. `pipeline` only coincides with the winner when the last
# candidate in `models` is also the best one. Both candidate classifiers
# accept the `n_estimators` / `max_depth` keys in `H_Params`.
# Scoring is spelled out; accuracy is already the default for classifiers.
grid_search = GridSearchCV(
    estimator=best_model,
    param_grid=H_Params,
    cv=5,
    scoring='accuracy',
)

In [23]:
# Run the exhaustive search over the grid using the training split only.
grid_search.fit(X=X_train, y=y_train)

In [25]:
# Take the estimator chosen by the grid search and score it on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=X_test)
acc_scr_2 = accuracy_score(y_true=y_test, y_pred=y_pred)

In [26]:
# Held-out accuracy of the grid-searched estimator (acc_scr_2 from the cell above).
print(acc_scr_2)

0.8315217391304348


In [27]:
# Parameter combination that the grid search reported as best.
print("Best-HP : ",grid_search.best_params_)

Best-HP :  {'model__max_depth': 10, 'model__n_estimators': 100}
