In [1]:
# Start by connecting google drive into the google colab

from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fexperimentsandconfigs%20https%3a%2f%2fwww.googleapis.com%2fauth%2fphotos.native&response_type=code

Enter your authorization code:
4/1AX4XfWhMEDFi8kjdqVh25fgnfb_i80ztbQOUJYTHIgWQKu_e_8SVaFpprZ4
Mounted at /content/gdrive


<h1 align='center'>  HR ANALYTICS CHALLENGE </h1>
<h3 align='center'> <b>Predict Whether a Potential Promotee Will be Promoted or Not</b> </h3>

### **The Challenge**

HR analytics is revolutionising the way human resources departments operate, leading to higher efficiency and better results overall. Human resources has been using analytics for years. However, the collection, processing and analysis of data has been largely manual, and given the nature of human resources dynamics and HR KPIs, the approach has been constraining HR. Therefore, it is surprising that HR departments woke up to the utility of machine learning so late in the game. 

## 0. Import relevant Dependencies

Incase you are getting any error saying the package is not installed while running the below cell, then you can use two methods:
- pip install ________.
- google 'How to install ________'.

In [3]:
# Import Dependencies -To see the graphs in the notebook.
%matplotlib inline   

# Python Imports
import math,time,random,datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization -This is where the graphs come in.
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
plt.style.use('fivethirtyeight')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Performance Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

# Display all the columns/rows of the DataFrame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## 0. Loading the required Data

In [4]:
# Import the train data.
train = pd.read_csv('/content/gdrive/My Drive/Final_train.csv')

## 1. Model Building

### Algorithms
From here, we will be running the following algorithms.

- Logistic Regression
- KNN
- Naive Bayes
- Stochastic Gradient Decent
- Linear SVC
- Decision Tree
- Gradient Boosted Trees
- Random Forest
- CatBoost Algorithm

In [5]:
train.head()

Unnamed: 0,is_promoted,department,education,gender,recruitment_channel,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,0,7,2,0,2,-0.415276,1.385021,0.50046,1.356878,-0.154018,-1.075931
1,0,4,0,1,0,-0.415276,1.385021,-0.437395,-0.736986,-0.154018,-0.253282
2,0,7,0,1,2,-0.415276,-0.259125,0.265996,-0.736986,-0.154018,-1.001145
3,0,7,0,1,0,1.226063,-1.903271,0.969387,-0.736986,-0.154018,-1.001145
4,0,8,0,1,0,-0.415276,-0.259125,-0.906322,-0.736986,-0.154018,0.718939


In [6]:
X = train.drop(columns= 'is_promoted')
y = train['is_promoted']

### Overcoming Class Imbalance

In [7]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [10]:
os= SMOTETomek(random_state= 42)
X_train_ns,y_train_ns = os.fit_resample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 40086, 1: 3760})
The number of classes after fit Counter({0: 39518, 1: 39518})


In any model building, we mainly focus on 3 main steps:

- Fitting the model and finding the accuracy (accuracy score) of the fitted model.
- Perform K-Fold Cross Validation (K needs to be specified).
- Find the accuracy of the Cross Validation. 

**We will be running a whole bunch of models to figure out which model is best suited for our data.**

#### Model 1: Logistic Regression

In [11]:
start_time = time.time()
algorithm = LogisticRegression()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
log_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
log_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
log_acc_cv = round(metrics.accuracy_score(y_train_ns, log_train_pred)*100, 2)

log_pre_cv = precision_score(y_train_ns, log_train_pred)
log_rec_cv = recall_score(y_train_ns, log_train_pred)
log_f1_cv = f1_score(y_train_ns, log_train_pred)

log_time = (time.time()- start_time)

In [12]:
# Logistic Regression
print('Accuracy of the model is: ', log_acc)
print('Accuracy of 10-Fold CV is: ', log_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= log_time))

print('Precision: ', log_pre_cv)
print('Recall: ', log_rec_cv)
print('F1-Score: ', log_f1_cv)


Accuracy of the model is:  72.17
Accuracy of 10-Fold CV is:  72.17
Running time is:  0:00:03.494938
Precision:  0.7230299172501591
Recall:  0.7185839364340301
F1-Score:  0.7208000710723034


#### Model 2: K-Nearest Neighbours

In [13]:
start_time = time.time()
algorithm = KNeighborsClassifier()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
knn_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
knn_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
knn_acc_cv = round(metrics.accuracy_score(y_train_ns, knn_train_pred)*100, 2)

knn_pre_cv = precision_score(y_train_ns, knn_train_pred)
knn_rec_cv = recall_score(y_train_ns, knn_train_pred)
knn_f1_cv = f1_score(y_train_ns, knn_train_pred)

knn_time = (time.time()- start_time)

In [14]:
# K-Nearest Neighbours
print('Accuracy of the model is: ', knn_acc)
print('Accuracy of 10-Fold CV is: ', knn_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= knn_time))

print('Precision: ', knn_pre_cv)
print('Recall: ', knn_rec_cv)
print('F1-Score: ', knn_f1_cv)

Accuracy of the model is:  93.67
Accuracy of 10-Fold CV is:  90.41
Running time is:  0:01:19.105205
Precision:  0.875646937047144
Recall:  0.9418998937193178
F1-Score:  0.9075658937410088


#### Model 3: Gaussian Naive Bayes

In [15]:
start_time = time.time()
algorithm = GaussianNB()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
gnb_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
gnb_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
gnb_acc_cv = round(metrics.accuracy_score(y_train_ns, gnb_train_pred)*100, 2)

gnb_pre_cv = precision_score(y_train_ns, gnb_train_pred)
gnb_rec_cv = recall_score(y_train_ns, gnb_train_pred)
gnb_f1_cv = f1_score(y_train_ns, gnb_train_pred)

gnb_time = (time.time()- start_time)

In [16]:
# Gaussian Naive Bayes
print('Accuracy of the model is: ', gnb_acc)
print('Accuracy of 10-Fold CV is: ', gnb_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= gnb_time))

print('Precision: ', gnb_pre_cv)
print('Recall: ', gnb_rec_cv)
print('F1-Score: ', gnb_f1_cv)

Accuracy of the model is:  67.3
Accuracy of 10-Fold CV is:  67.32
Running time is:  0:00:00.571526
Precision:  0.7483494159471813
Recall:  0.5220152841743003
F1-Score:  0.6150199749567706


#### Model 4: Linear Support Vector Machines (SVC)

In [17]:
start_time = time.time()
algorithm = LinearSVC()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
svc_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
svc_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
svc_acc_cv = round(metrics.accuracy_score(y_train_ns, svc_train_pred)*100, 2)

svc_pre_cv = precision_score(y_train_ns, svc_train_pred)
svc_rec_cv = recall_score(y_train_ns, svc_train_pred)
svc_f1_cv = f1_score(y_train_ns, svc_train_pred)

svc_time = (time.time()- start_time)

In [18]:
# Linear Support Vector Machines
print('Accuracy of the model is: ', svc_acc)
print('Accuracy of 10-Fold CV is: ', svc_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= svc_time))

print('Precision: ', svc_pre_cv)
print('Recall: ', svc_rec_cv)
print('F1-Score: ', svc_f1_cv)

Accuracy of the model is:  72.53
Accuracy of 10-Fold CV is:  72.55
Running time is:  0:02:30.294694
Precision:  0.7242984774128602
Recall:  0.7282757224555898
F1-Score:  0.7262816549531247


#### Model 5: Stochastic Gradient Descent

In [19]:
start_time = time.time()
algorithm = SGDClassifier()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
SGD_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
SGD_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
SGD_acc_cv = round(metrics.accuracy_score(y_train_ns, SGD_train_pred)*100, 2)

SGD_pre_cv = precision_score(y_train_ns, SGD_train_pred)
SGD_rec_cv = recall_score(y_train_ns, SGD_train_pred)
SGD_f1_cv = f1_score(y_train_ns, SGD_train_pred)

SGD_time = (time.time()- start_time)

In [20]:
# Stochastic Gradient Descent
print('Accuracy of the model is: ', SGD_acc)
print('Accuracy of 10-Fold CV is: ', SGD_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= SGD_time))

print('Precision: ', SGD_pre_cv)
print('Recall: ', SGD_rec_cv)
print('F1-Score: ', SGD_f1_cv)

Accuracy of the model is:  73.68
Accuracy of 10-Fold CV is:  72.68
Running time is:  0:00:05.477207
Precision:  0.7120247924108727
Recall:  0.7616276127334379
F1-Score:  0.7359913924855421


#### Model 6: Decision Tree Classifier

In [21]:
start_time = time.time()
algorithm = DecisionTreeClassifier()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
dt_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
dt_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
dt_acc_cv = round(metrics.accuracy_score(y_train_ns, dt_train_pred)*100, 2)

dt_pre_cv = precision_score(y_train_ns, dt_train_pred)
dt_rec_cv = recall_score(y_train_ns, dt_train_pred)
dt_f1_cv = f1_score(y_train_ns, dt_train_pred)

dt_time = (time.time()- start_time)

In [22]:
#  Decision Tree Classifier
print('Accuracy of the model is: ', dt_acc)
print('Accuracy of 10-Fold CV is: ', dt_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= dt_time))

print('Precision: ', dt_pre_cv)
print('Recall: ', dt_rec_cv)
print('F1-Score: ', dt_f1_cv)

Accuracy of the model is:  98.76
Accuracy of 10-Fold CV is:  94.06
Running time is:  0:00:02.985321
Precision:  0.9485059247810407
Recall:  0.931752619059669
F1-Score:  0.9400546350430187


#### Model 7: Gradient Boost Trees

In [23]:
start_time = time.time()
algorithm = GradientBoostingClassifier()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
gbt_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
gbt_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
gbt_acc_cv = round(metrics.accuracy_score(y_train_ns, gbt_train_pred)*100, 2)

gbt_pre_cv = precision_score(y_train_ns, gbt_train_pred)
gbt_rec_cv = recall_score(y_train_ns, gbt_train_pred)
gbt_f1_cv = f1_score(y_train_ns, gbt_train_pred)

gbt_time = (time.time()- start_time)

In [24]:
# Gradient Boost Trees
print('Accuracy of the model is: ', gbt_acc)
print('Accuracy of 10-Fold CV is: ', gbt_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= gbt_time))

print('Precision: ', gbt_pre_cv)
print('Recall: ', gbt_rec_cv)
print('F1-Score: ', gbt_f1_cv)

Accuracy of the model is:  87.94
Accuracy of 10-Fold CV is:  87.12
Running time is:  0:01:25.181975
Precision:  0.844112886198888
Recall:  0.9105217875398552
F1-Score:  0.8760606245054477


#### Model 8: Random Forest


In [25]:
start_time = time.time()
algorithm = RandomForestClassifier()

## Step 1:
model = algorithm.fit(X_train_ns,y_train_ns)      # Creating the model. We will fit the algorithm to the training data.
rf_acc = round(model.score(X_train_ns,y_train_ns)*100, 2)

## Step 2:  --> This code performs Cross Validation automatically.
rf_train_pred = model_selection.cross_val_predict(algorithm, X_train_ns,y_train_ns, cv= 10, n_jobs= -1)

## Step 3:  --> Cross Validation accuracy metric.
rf_acc_cv = round(metrics.accuracy_score(y_train_ns, rf_train_pred)*100, 2)

rf_pre_cv = precision_score(y_train_ns, rf_train_pred)
rf_rec_cv = recall_score(y_train_ns, rf_train_pred)
rf_f1_cv = f1_score(y_train_ns, rf_train_pred)

rf_time = (time.time()- start_time)

In [26]:
print('Accuracy of the model is: ', rf_acc)
print('Accuracy of 10-Fold CV is: ', rf_acc_cv)
print('Running time is: ', datetime.timedelta(seconds= rf_time))

print('Precision: ', rf_pre_cv)
print('Recall: ', rf_rec_cv)
print('F1-Score: ', rf_f1_cv)

Accuracy of the model is:  98.76
Accuracy of 10-Fold CV is:  95.35
Running time is:  0:01:13.699100
Precision:  0.957189509133585
Recall:  0.9493901513234475
F1-Score:  0.9532738775821328


### Model Results

Now let's see which model has the best cross-validation accuracy.

- <b>NOTE:</b> We care more about the accuracy of cross validation, as the metrics we get from the model can randomly score higher than usual.

In [27]:
cv_models = pd.DataFrame({'Model':[' Logistic Regression', 'K-Nearest Neighbours', 'Gaussian Naive Bayes', 
                                'Linear Support Vector Machines (SVC)', 'Stochastic Gradient Descent', 
                                'Decision Tree Classifier', 'Gradient Boost Trees', 'Random Forest'],
                      'Score':[log_acc_cv, knn_acc_cv, gnb_acc_cv, svc_acc_cv, SGD_acc_cv, dt_acc_cv, gbt_acc_cv, rf_acc_cv]})

print('-----Cross-Validation Accuracy Scores-----')
cv_models.nlargest(9,'Score')

-----Cross-Validation Accuracy Scores-----


Unnamed: 0,Model,Score
7,Random Forest,95.35
5,Decision Tree Classifier,94.06
1,K-Nearest Neighbours,90.41
6,Gradient Boost Trees,87.12
4,Stochastic Gradient Descent,72.68
3,Linear Support Vector Machines (SVC),72.55
0,Logistic Regression,72.17
2,Gaussian Naive Bayes,67.32


### Precision and Recall

Precision and Recall are metrics that you use when you have an imbalanced classification problem.

- Recall - a metric which measures a models ability to find all relevant cases in a dataset.
- Precision - a metric which measures a models ability to correctly identify only relevant cases.

Combining  Precision and Recall gives us the **F1 score.**

They fall between 0 and 1, with 1 being better.

In [28]:
f1_cv_models = pd.DataFrame({'Model':[' Logistic Regression', 'K-Nearest Neighbours', 'Gaussian Naive Bayes', 
                                'Linear Support Vector Machines (SVC)', 'Stochastic Gradient Descent', 
                                'Decision Tree Classifier', 'Gradient Boost Trees', 'Random Forest'],
                      'F1-Score':[log_f1_cv, knn_f1_cv, gnb_f1_cv, svc_f1_cv, SGD_f1_cv, dt_f1_cv, gbt_f1_cv, rf_f1_cv]})

print('-----Cross-Validation Accuracy Scores-----')
f1_cv_models.nlargest(8,'F1-Score')

-----Cross-Validation Accuracy Scores-----


Unnamed: 0,Model,F1-Score
7,Random Forest,0.953274
5,Decision Tree Classifier,0.940055
1,K-Nearest Neighbours,0.907566
6,Gradient Boost Trees,0.876061
4,Stochastic Gradient Descent,0.735991
3,Linear Support Vector Machines (SVC),0.726282
0,Logistic Regression,0.7208
2,Gaussian Naive Bayes,0.61502


> **Recall = TP/(TP + FN)**
- Here the Recall is pretty high. This means that there is a lesser amount of False Negatives (predicting 'Did not launch' when it was actually 'Launched').

> **Pression = TP/(TP + FP)**
- Precision is high. Thus, we can say say that there is less False Positives (predicting 'Launched' when it actually 'Did not launch')

## HyperParameter Tuning - Random Forest

In [29]:
## Randomized Search CV - Faster than GridSearchCV

# The number of trees in the forest.
n_estimators = [int(x) for x in np.linspace(start= 100, stop= 1200, num= 12)]
# The number of features to consider when looking for the best split
max_features = ['auto', 'sqrt']
# The maximum depth/levels of the tree.
max_depth = [int(x) for x in np.linspace(5, 30, num= 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Method of selecting samples for training each tree
# bootstrap = [True, False]

In [30]:
## Hyper Parameter Optimization

## Key Dictionary
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [31]:
## Hyper-Parameter Optimization Using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

In [32]:
rf_search = RandomizedSearchCV(estimator= rf, param_distributions= random_grid, scoring= 'f1', n_iter = 50, cv = 5, verbose=2, random_state= 42, n_jobs = -1)

In [33]:
rf_search.fit(X_train_ns,y_train_ns)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 64.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 110.6min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [34]:
rf_search.best_params_

{'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 600}

In [35]:
rf_search.best_score_

0.9501187506565991

### Prediction

Let's use the model with the highest cross-validation accuracy score to make a prediction on the test dataset.

We want to make predictions on the same columnns our model is trained on.

So we have to select the subset of right columns of the test dateframe, encode them and make a prediciton with our model.

In [36]:
# Create a list of columns to be used for predictions.
wanted_columns = X_train.columns
wanted_columns

Index(['department', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score'],
      dtype='object')

In [37]:
# Make predictions using RandomForest model on wanted columns.
predictions = rf_search.predict(X_test[wanted_columns])

In [38]:
#  RandomForest Algorithm
print('Accuracy of the model is: ', accuracy_score(y_test, predictions))
print('Precision: ', precision_score(y_test, predictions))
print('Recall: ', recall_score(y_test, predictions))
print('F1: ', f1_score(y_test, predictions))

Accuracy of the model is:  0.9137931034482759
Precision:  0.47810650887573963
Recall:  0.44493392070484583
F1:  0.46092413006274957


## 2. Prediction on the Test dataset

Let's use the model with the highest cross-validation accuracy score to make a prediction on the test dataset.

We want to make predictions on the same columnns our model is trained on.

So we have to select the subset of right columns of the test dateframe, encode them and make a prediciton with our model.

In [39]:
test = pd.read_csv('/content/gdrive/My Drive/Final_test.csv')
test.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8,0,1,2,-0.423094,-0.266732,-1.1432,1.336715,-0.152665,1.024263
1,2,0,0,0,-0.423094,-0.266732,-0.19259,-0.748103,-0.152665,-0.914377
2,7,0,1,0,-0.423094,-1.907786,-0.430243,-0.748103,-0.152665,-1.212629
3,5,0,0,0,2.905264,-1.087259,0.758019,-0.748103,-0.152665,0.129506
4,1,0,1,2,-0.423094,0.553794,0.282714,-0.748103,-0.152665,-0.168746


In [41]:
# Make predictions using RandomForest model on wanted columns.
predictions = rf_search.predict(test[wanted_columns])

In [42]:
# Our predictions array is comprised of 0's and 1's.
predictions[:30]

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0])

In [44]:
pd.set_option('display.max_rows',100)
df = pd.read_csv('/content/gdrive/My Drive/test.csv')

# Create a dataframe and append the relevant colimns.
submission = pd.DataFrame()
submission['employee_id'] = df['employee_id']
submission['is_promoted'] = predictions
submission.head()

Unnamed: 0,employee_id,is_promoted
0,8724,1
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [45]:
submission['is_promoted'].value_counts()

0    13740
1     9750
Name: is_promoted, dtype: int64

In [46]:
# Are our test and submission the same length?
if len(submission) == len(test):
    print('The submission and the test dataframes are of the same length')
else:
    print('Dataframes mismatched')

The submission and the test dataframes are of the same length


In [47]:
# convert submission dataframe to csv.
submission.to_csv('/content/gdrive/My Drive/HR_Analytics.csv', index= False)
print('Submission csv is ready')

Submission csv is ready
