In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predicting-hiring-decisions-in-recruitment-data/recruitment_data.csv


In [2]:
import numpy as np
import pandas as pd

import pickle as pkl
 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [3]:
df = pd.read_csv('/kaggle/input/predicting-hiring-decisions-in-recruitment-data/recruitment_data.csv')


In [4]:
df.head()

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,26,1,2,0,3,26.783828,48,78,91,1,1
1,39,1,4,12,3,25.862694,35,68,80,2,1
2,48,0,2,3,2,9.920805,20,67,13,2,0
3,34,1,2,5,2,6.407751,36,27,70,3,0
4,30,0,1,6,1,43.105343,23,52,85,2,0


**Data exploration, data cleaning, and model preparation**

In [5]:
df.dtypes

Age                      int64
Gender                   int64
EducationLevel           int64
ExperienceYears          int64
PreviousCompanies        int64
DistanceFromCompany    float64
InterviewScore           int64
SkillScore               int64
PersonalityScore         int64
RecruitmentStrategy      int64
HiringDecision           int64
dtype: object

In [6]:
df.isnull().sum()


Age                    0
Gender                 0
EducationLevel         0
ExperienceYears        0
PreviousCompanies      0
DistanceFromCompany    0
InterviewScore         0
SkillScore             0
PersonalityScore       0
RecruitmentStrategy    0
HiringDecision         0
dtype: int64

In [7]:
df.shape


(1500, 11)

In [8]:
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1500.0,35.148667,9.252728,20.0,27.0,35.0,43.0,50.0
Gender,1500.0,0.492,0.500103,0.0,0.0,0.0,1.0,1.0
EducationLevel,1500.0,2.188,0.862449,1.0,2.0,2.0,3.0,4.0
ExperienceYears,1500.0,7.694,4.641414,0.0,4.0,8.0,12.0,15.0
PreviousCompanies,1500.0,3.002,1.41067,1.0,2.0,3.0,4.0,5.0
DistanceFromCompany,1500.0,25.505379,14.567151,1.031376,12.838851,25.502239,37.737996,50.992462
InterviewScore,1500.0,50.564,28.626215,0.0,25.0,52.0,75.0,100.0
SkillScore,1500.0,51.116,29.353563,0.0,25.75,53.0,76.0,100.0
PersonalityScore,1500.0,49.387333,29.353201,0.0,23.0,49.0,76.0,100.0
RecruitmentStrategy,1500.0,1.893333,0.689642,1.0,1.0,2.0,2.0,3.0


In [9]:
df['HiringDecision'].value_counts(normalize = True ,dropna = False)


HiringDecision
0    0.69
1    0.31
Name: proportion, dtype: float64

**Create the training and testing data**

In [10]:
y = df["HiringDecision"]

In [11]:
X = df.copy()
X = X.drop("HiringDecision", axis = 1)

In [12]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 0)


**Determine set of hyperparameters**

In [13]:
cv_params = {'n_estimators' : [50,100], 
              'max_depth' : [10,50],        
              'min_samples_leaf' : [0.5,1], 
              'min_samples_split' : [0.001, 0.01],
              'max_features' : ["sqrt"], 
              'max_samples' : [.5,.9]}

**Create list of split indices**

In [14]:

split_index = [0 if x in X_val.index else -1 for x in X_train.index]
custom_split = PredefinedSplit(split_index)

In [15]:
rf = RandomForestClassifier(random_state=0)

In [16]:
rf_val = GridSearchCV(rf, cv_params, cv=custom_split, refit='f1', n_jobs = -1, verbose = 1)

**Fit the model.**

In [17]:


%%time




rf_val.fit(X_train, y_train)

Fitting 1 folds for each of 32 candidates, totalling 32 fits
CPU times: user 241 ms, sys: 55 ms, total: 296 ms
Wall time: 3.94 s


In [18]:
rf_val.best_params_

{'max_depth': 50,
 'max_features': 'sqrt',
 'max_samples': 0.9,
 'min_samples_leaf': 1,
 'min_samples_split': 0.01,
 'n_estimators': 50}

In [19]:

rf_opt = RandomForestClassifier(n_estimators = 50, max_depth = 50, 
                                min_samples_leaf = 1, min_samples_split = 0.01,
                                max_features="sqrt", max_samples = 0.9, random_state = 0)

In [20]:
rf_opt.fit(X_train, y_train)

In [21]:
y_pred = rf_opt.predict(X_test)

In [22]:
pc_test = precision_score(y_test, y_pred,pos_label = 0)
print("The precision score is {pc:.3f}".format(pc = pc_test))

The precision score is 0.905


In [23]:
rc_test = recall_score(y_test, y_pred,pos_label = 0)
print("The recall score is {rc:.3f}".format(rc = rc_test))

The recall score is 0.954


In [24]:
ac_test = accuracy_score(y_test, y_pred)
print("The accuracy score is {ac:.3f}".format(ac = ac_test))

The accuracy score is 0.899


In [25]:
f1_test = f1_score(y_test, y_pred,pos_label = 0)
print("The F1 score is {f1:.3f}".format(f1 = f1_test))

The F1 score is 0.929
