## Importing libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importing preprocessed data
data = pd.read_csv(r"C:\Users\Lenovo\Documents\jupyter notebook DATA SCIENCE\ML project\Project on HR Analytics data\eda_data")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,0,8,2,0,1,1,35,5.0,8,1,0,49,0
1,1,7,1,1,2,1,30,5.0,4,0,0,60,0
2,2,8,1,1,1,1,34,3.0,7,0,0,50,0
3,3,8,1,1,2,2,39,1.0,10,0,0,50,0
4,4,6,1,1,2,1,45,3.0,2,0,0,73,0


In [4]:
# Dropping Unnamed column
data = data.drop(r"Unnamed: 0",axis=1)

In [5]:
data.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,8,2,0,1,1,35,5.0,8,1,0,49,0
1,7,1,1,2,1,30,5.0,4,0,0,60,0
2,8,1,1,1,1,34,3.0,7,0,0,50,0
3,8,1,1,2,2,39,1.0,10,0,0,50,0
4,6,1,1,2,1,45,3.0,2,0,0,73,0


In [6]:
data.dtypes

department                int64
education                 int64
gender                    int64
recruitment_channel       int64
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [12]:
data['no_of_trainings'].unique()

array([ 1,  2,  3,  4,  7,  5,  6,  8, 10,  9], dtype=int64)

In [7]:
data['age'] = data['age'].astype(int)

In [13]:
data.isna().sum()

department              0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [14]:
# Splitting to  x and y 
X = data.drop('is_promoted',axis=1)
y = data['is_promoted']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.33,random_state=42)

## SMOTE

In [18]:
from imblearn.over_sampling import SMOTE

In [19]:
sm = SMOTE(random_state=42)
X_res,y_res = sm.fit_resample(X_train,y_train)

In [20]:
from sklearn.model_selection import GridSearchCV #GridSearchCV is for parameter tuning
from sklearn.ensemble import RandomForestClassifier
cls=RandomForestClassifier()
n_estimators=[50,75,100] 
criterion=['gini','entropy'] 
max_depth=[3,5,10] 
parameters={'n_estimators': n_estimators,'criterion':criterion,'max_depth':max_depth}
RFC_cls = GridSearchCV(cls, parameters)
RFC_cls.fit(X_res,y_res)


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 10],
                         'n_estimators': [50, 75, 100]})

In [21]:
# Getting the best parameters
RFC_cls.best_params_

{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}

In [22]:
final_cls = RandomForestClassifier(criterion='gini',max_depth=10,n_estimators=100)
final_cls.fit(X_res,y_res)

RandomForestClassifier(max_depth=10)

In [25]:
y_pred = final_cls.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.7998009620169182

In [30]:
#precision,Recall,F1 Score
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test,y_pred)

(array([0.96459734, 0.23848965]),
 array([0.81183411, 0.6641791 ]),
 array([0.88164733, 0.35095895]),
 array([16613,  1474], dtype=int64))

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.81      0.88     16613
           1       0.24      0.66      0.35      1474

    accuracy                           0.80     18087
   macro avg       0.60      0.74      0.62     18087
weighted avg       0.91      0.80      0.84     18087



In [50]:
# saving the model
import joblib

In [51]:
joblib.dump(final_cls,r"C:\Users\Lenovo\Documents\jupyter notebook DATA SCIENCE\ML project\rfmodel.pkl")

['C:\\Users\\Lenovo\\Documents\\jupyter notebook DATA SCIENCE\\ML project\\rfmodel.pkl']

## Importing test dataset

In [32]:
testdata = pd.read_csv(r"C:\Users\Lenovo\Documents\jupyter notebook DATA SCIENCE\ML project\test_2umaH9m.csv")

In [33]:
testdata.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [34]:
testdata = testdata.drop(['employee_id','region'],axis=1)

In [35]:
testdata.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,HR,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


## Changing the testdata according to the training data

In [36]:
map_education =  {"Master's & above":2, "Bachelor's":1, "Below Secondary":0}
testdata['education'] = testdata['education'].map(map_education)

In [37]:
map_department = {'Sales & Marketing':8, 'Operations':7, 'Technology':6, 'Analytics':4,'R&D':0,'Procurement':5, 'Finance':3, 'HR':2, 'Legal':1}
testdata['department'] = testdata['department'].map(map_department)

In [38]:
rec_map = {'sourcing':1, 'other':2, 'referred':0}
testdata['recruitment_channel']= testdata['recruitment_channel'].map(rec_map)

In [39]:
map_gender = {"m":1,"f":0}
testdata['gender'] = testdata['gender'].map(map_gender)

In [40]:
testdata.dtypes


department                int64
education               float64
gender                    int64
recruitment_channel       int64
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
dtype: object

In [47]:
testdata['previous_year_rating'].fillna(testdata['previous_year_rating'].mean(),inplace=True)

In [48]:
testdata['education'].fillna(testdata['education'].mode()[0],inplace=True)

In [49]:
testdata.isna().sum()

department              0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

Importing the model 

In [52]:
model = joblib.load(r"C:\Users\Lenovo\Documents\jupyter notebook DATA SCIENCE\ML project\rfmodel.pkl")

Predicting the target

In [54]:
test_pred=model.predict(testdata)

In [61]:
test_pred.shape

(23490,)

In [1]:
#END