In [34]:
import pandas as pd 
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from joblib import dump

In [35]:
DATA_DIR = 'data/'
MODEL_DIR = 'artifacts/'
df = pd.read_csv(DATA_DIR + 'HR_comma_sep.csv')
df.rename(columns={'Department':'department'}, inplace=True)


Unnamed: 0,0,1,2,3,4
satisfaction_level,0.38,0.8,0.11,0.72,0.37
last_evaluation,0.53,0.86,0.88,0.87,0.52
number_project,2,5,7,5,2
average_montly_hours,157,262,272,223,159
time_spend_company,3,6,4,5,3
Work_accident,0,0,0,0,0
left,1,1,1,1,1
promotion_last_5years,0,0,0,0,0
department,sales,sales,sales,sales,sales
salary,0,1,1,0,0


In [36]:
#encode categorical data
salary_enc = LabelEncoder()
df['salary'] = salary_enc.fit_transform(df.salary)
department_enc = LabelEncoder()
df['department'] = department_enc.fit_transform(df.department)

In [38]:
dfy = df['left']
df.drop('left', axis=1, inplace=True)
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.15)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,0,7,0
1,0.80,0.86,5,262,6,0,0,7,1
2,0.11,0.88,7,272,4,0,0,7,1
3,0.72,0.87,5,223,5,0,0,7,0
4,0.37,0.52,2,159,3,0,0,7,0
...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,8,0
14995,0.37,0.48,2,160,3,0,0,8,0
14996,0.37,0.53,2,143,3,0,0,8,0
14997,0.11,0.96,6,280,4,0,0,8,0


In [37]:
class Classifier (BaseEstimator):
    def __init__(self, estimator=None):
        self.estimator = estimator
    
    def fit(self, X, y):
        self.estimator.fit(X, y)
        return self
    
    def predict(self, X, y):
        return self.estimator.predict(X, y)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
    
    def score(self, X, y):
        return self.estimator.score(X, y)

In [28]:
pipe = Pipeline([('clf', Classifier())])

parameters = [
    {'clf' : [RandomForestClassifier()],
     'clf__n_estimators' : [75, 100, 125],
     'clf__min_samples_split':[2, 4, 6],
     'clf__max_depth': [5, 10, 15]
     }
]
grid = GridSearchCV(pipe, parameters, cv=5, scoring='roc_auc')
grid.fit(x_train,y_train)
model = grid.best_estimator_
score = grid.best_score_

In [29]:
y_pred = model.predict(x_test)
roc_auc = roc_auc_score(y_test, y_pred)
print(f'The ROC-AUC for test data is found to be {roc_auc}')

The ROC-AUC for test data is found to be 0.9753550507458619


In [33]:
dump(model, MODEL_DIR + 'clf_v1.joblib')
dump(dep_enc, MODEL_DIR + 'deoartment_enc.joblib')

['artifacts/clf_v1.joblib']