In [351]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the path of every file below the Kaggle input directory, then print one per line.
kaggle_input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in kaggle_input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [352]:
# Load the two splits from the read-only Kaggle input directory.
train = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
test = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_test.csv")

# Flag the origin of every row so the combined frame can be split apart again later.
train['train_test'] = 1
test['train_test'] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the row
# labels of train and test overlap, so any label-based lookup or alignment on all_data
# would match one train row and one test row for the same label.
all_data = pd.concat([train, test], ignore_index=True)

%matplotlib inline
all_data.columns

In [None]:
Dataset taken from https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists

# Context and Content

A company active in Big Data and Data Science wants to hire data scientists from among the people who successfully pass courses conducted by the company. Many people sign up for its training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment, because this helps reduce cost and time and improves the quality of training, course planning, and the categorization of candidates. Information on demographics, education, and experience is available from candidates' signup and enrollment.

This dataset is also designed for HR research into the factors that lead a person to leave their current job. Using model(s) built on the current credentials, demographics, and experience data, you will predict the probability that a candidate will look for a new job or will work for the company, and interpret the factors that affect the employee's decision.

The data is divided into train and test sets. The target isn't included in the test set, but a file with the test target values is available for related tasks. A sample submission corresponding to the enrollee_id values of the test set is also provided, with columns: enrollee_id, target.

Features

* enrollee_id : Unique ID for candidate

* city: City code

* city_development_index: Development index of the city (scaled)

* gender: Gender of candidate

* relevent_experience: Relevant experience of candidate

* enrolled_university: Type of University course enrolled if any

* education_level: Education level of candidate

* major_discipline :Education major discipline of candidate

* experience: Candidate total experience in years

* company_size: No of employees in current employer's company

* company_type : Type of current employer

* last_new_job: Difference in years between previous job and current job

* training_hours: training hours completed

* target: 0 – Not looking for job change, 1 – Looking for a job change

# Overview

## 1) data exploration

## 2) Feature engineering

## 3) Data preprocessing for model

## 4) Basic model building

## 5) Model Tuning

## 6) Ensemble Model building

## 7) Results

### Data exploration

In [353]:
#Quick look at our data types & null counts
all_data.info()

In [354]:
#To further understand our dataset, we use the describe() method
all_data.describe()

In [355]:
# NOTE(review): repeats the describe() call of the previous cell; nothing has changed
# in all_data between the two cells, so this cell shows the same summary again.
all_data.describe()

In [356]:
# It's good practice to separate numeric and categorical values.
# The describe method helps us get the former: the columns of its result are listed here.
all_data.describe().columns

In [357]:
# Column groups used for the exploration below
numeric_columns = ['city_development_index', 'training_hours', 'target']
categorical_columns = [
    'city',
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]

data_num = all_data[numeric_columns]
data_cat = all_data[categorical_columns]

In [358]:
# Histogram of every numeric variable, one figure per column
for column_name, column_values in data_num.items():
    plt.hist(column_values)
    plt.title(column_name)
    plt.show()

In [359]:
# Pairwise correlation of the numeric columns, printed as a table and drawn as a heatmap
corr_matrix = data_num.corr()
print(corr_matrix)
sns.heatmap(corr_matrix)

In [360]:
# Mean of the numeric features for each target class.
# 'last_new_job' and 'experience' are left out on purpose: at this point they still hold
# strings such as "never" or "<1" (they are only converted to numbers in the feature
# engineering section), so a mean cannot be computed for them. Older pandas silently
# dropped such columns from the result; pandas 2.x raises instead.
print(pd.pivot_table(all_data, index="target", values=['city_development_index', 'training_hours']))

In [361]:
# Let's plot the categorical data in order to gain more insights:
# one bar chart per column showing how often each category occurs.
for column in data_cat.columns:
    # Count once and reuse the result for both axes
    counts = data_cat[column].value_counts()
    # seaborn >= 0.12 no longer accepts x / y as positional arguments, so pass them by keyword
    ax = sns.barplot(x=counts.index, y=counts.values)
    ax.set(title=column, xlabel=column, ylabel='count')
    plt.show()

In [362]:
# Row counts per target class, broken down by one categorical column at a time.
# A separator line is printed between consecutive tables.
separator = "###########################"
for position, breakdown_column in enumerate(['gender', 'company_size', 'education_level']):
    if position > 0:
        print(separator)
    counts_table = pd.pivot_table(
        all_data,
        index='target',
        columns=breakdown_column,
        values='train_test',
        aggfunc='count',
    )
    print(counts_table)

# Feature engineering

In [363]:
# Collapse major_discipline into STEM / Other, since participants with STEM majors are the majority.
def _stem_or_other(discipline):
    """Keep "STEM" and missing values as they are; relabel every other major as "Other"."""
    if discipline == "STEM" or pd.isna(discipline):
        return discipline
    return "Other"

print(all_data.major_discipline.value_counts())
all_data.major_discipline = all_data.major_discipline.apply(_stem_or_other)
print(all_data.major_discipline.value_counts())

In [364]:
# Collapse enrolled_university into enrolled vs. no enrollment.
def _enrolled_or_not(status):
    """Keep "no_enrollment" and missing values as they are; relabel every other value as "enrolled"."""
    if status == "no_enrollment" or pd.isna(status):
        return status
    return "enrolled"

print(all_data.enrolled_university.value_counts())
all_data.enrolled_university = all_data.enrolled_university.apply(_enrolled_or_not)
print(all_data.enrolled_university.value_counts())

In [365]:
# Let's change the last_new_job values from strings to numbers: "never" is assigned 0 and
# any other non-numeric label (the open-ended "more than 4 years" bucket) is assigned 5.
def cat_to_num(value):
    """Convert one raw ``last_new_job`` entry to a number.

    Parameters
    ----------
    value : object
        A raw cell value: a numeric string, a category label, or a missing value.

    Returns
    -------
    object
        ``int(value)`` when the conversion succeeds, ``0`` for ``"never"``, ``5`` for any
        other non-missing value, and the missing value itself (NaN / None) unchanged.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled here. A bare ``except:`` would also swallow
        # KeyboardInterrupt / SystemExit and hide unrelated bugs.
        if pd.notna(value):
            return 0 if value == "never" else 5
        return value
# Convert the column element by element and overwrite it on all_data.
all_data.last_new_job = all_data.last_new_job.apply(cat_to_num)

In [366]:
# Same for the years of experience each employee has: "<1" is assigned 0 and any other
# non-numeric label (the open-ended top bucket) is assigned 21.
def cat_to_num2(value):
    """Convert one raw ``experience`` entry to a number.

    Parameters
    ----------
    value : object
        A raw cell value: a numeric string, a category label, or a missing value.

    Returns
    -------
    object
        ``int(value)`` when the conversion succeeds, ``0`` for ``"<1"``, ``21`` for any
        other non-missing value, and the missing value itself (NaN / None) unchanged.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are handled here. A bare ``except:`` would also swallow
        # KeyboardInterrupt / SystemExit and hide unrelated bugs.
        if pd.notna(value):
            return 0 if value == "<1" else 21
        return value
# Convert the column element by element and overwrite it on all_data.
all_data.experience = all_data.experience.apply(cat_to_num2)

In [367]:
all_data.company_size.value_counts()

In [368]:
def cat_to_num3(value):
    """Convert one raw ``company_size`` entry to the lower bound of its size bucket.

    Parameters
    ----------
    value : object
        A raw cell value: a range label such as ``"50-99"``, one of the special labels
        ``"<10"``, ``"10/49"`` or ``"10000+"``, or anything else (e.g. a missing value).

    Returns
    -------
    object
        The integer before the dash for ``"low-high"`` labels, ``0`` / ``10`` / ``10000``
        for the three special labels, and the input unchanged in every other case
        (non-string values, unrecognised labels).
    """
    # Missing values and already-converted numbers are not strings: pass them through.
    # This replaces the previous bare ``except:``, which relied on catching the error
    # raised by slicing a non-string and also hid every unrelated exception.
    if not isinstance(value, str):
        return value

    special_labels = {"<10": 0, "10/49": 10, "10000+": 10000}
    if value in special_labels:
        return special_labels[value]

    lower_bound, dash, _ = value.partition("-")
    if dash:
        try:
            return int(lower_bound)
        except ValueError:
            # Text before the dash is not a number: leave the label untouched
            return value
    return value
# Convert the column element by element and overwrite it on all_data.
all_data.company_size = all_data.company_size.apply(cat_to_num3)

In [369]:
all_data.company_size.value_counts()

### Data preprocessing for model

In [370]:
# Columns fed to the models; train_test is carried along so the frame can be split again below
model_columns = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
    'training_hours',
    'train_test',
]

# Remove every row (train and test alike) that has a missing value in a modelling column
all_data.dropna(subset=model_columns, inplace=True)

# One-hot encode the modelling columns
all_dummies = pd.get_dummies(all_data[model_columns])

# Split back into the two original sets and drop the helper flag
train_rows = all_dummies.loc[all_dummies.train_test == 1]
test_rows = all_dummies.loc[all_dummies.train_test == 0]
X_train = train_rows.drop(columns=['train_test'])
X_test = test_rows.drop(columns=['train_test'])

Y_train = all_data.loc[all_data.train_test == 1].target
Y_train.shape

In [371]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise. company_size is included: after cat_to_num3 it holds
# values from 0 up to 10000 and would otherwise dominate the distance-based models
# (KNN, SVC) that are trained on the scaled frame.
numeric_features = ['experience', 'company_size', 'last_new_job', 'training_hours']

train_mask = all_dummies.train_test == 1

scale = StandardScaler()
# Learn mean / variance from the training rows only, so no test-set statistics leak
# into the features used for cross-validation.
scale.fit(all_dummies.loc[train_mask, numeric_features])

all_dummies_scaled = all_dummies.copy()
all_dummies_scaled[numeric_features] = scale.transform(all_dummies[numeric_features])

X_train_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 1].drop(['train_test'], axis=1)
X_test_scaled = all_dummies_scaled[all_dummies_scaled.train_test == 0].drop(['train_test'], axis=1)

Y_train = all_data[all_data.train_test == 1].target

### Model Building (Baseline Validation Performance)

* Naive Bayes
* Logistic Regression
* Decision Tree
* K Nearest Neighbor
* Random Forest
* Support Vector Classifier
* Xtreme Gradient Boosting
* Soft Voting Classifier - All Models

In [372]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [373]:
# Baseline: Gaussian naive Bayes, 5-fold CV on the scaled features
gnb = GaussianNB()
cv = cross_val_score(estimator=gnb, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [374]:
# Baseline: logistic regression, 5-fold CV on the unscaled features
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(estimator=lr, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [375]:
# Baseline: logistic regression, 5-fold CV on the scaled features
lr = LogisticRegression(max_iter=2000)
cv = cross_val_score(estimator=lr, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [376]:
# Baseline: decision tree, 5-fold CV on the unscaled features
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(estimator=dt, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [377]:
# Baseline: decision tree, 5-fold CV on the scaled features
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(estimator=dt, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [378]:
# Baseline: k-nearest neighbours, 5-fold CV on the unscaled features
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [379]:
# Baseline: k-nearest neighbours, 5-fold CV on the scaled features
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [380]:
# Baseline: random forest, 5-fold CV on the unscaled features
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(estimator=rf, X=X_train, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [381]:
# Baseline: random forest, 5-fold CV on the scaled features
rf = RandomForestClassifier(random_state=1)
cv = cross_val_score(estimator=rf, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [382]:
# Baseline: support vector classifier (probability estimates enabled), 5-fold CV on the scaled features
svc = SVC(probability=True)
cv = cross_val_score(estimator=svc, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [383]:
from xgboost import XGBClassifier
# Baseline: XGBoost classifier, 5-fold CV on the scaled features
xgb = XGBClassifier(random_state=1)
cv = cross_val_score(estimator=xgb, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [384]:
# A voting classifier takes all of the inputs and averages the results.
# "hard" voting: each classifier gets one vote ("yes" or "no") and the result is just a
# popular vote. For this, you generally want an odd number of classifiers.
# "soft" voting: averages the confidence of each of the models. If the average confidence
# that a sample is a 1 is > 50%, it is counted as such.
from sklearn.ensemble import VotingClassifier
base_estimators = [
    ('lr', lr),
    ('knn', knn),
    ('rf', rf),
    ('gnb', gnb),
    ('svc', svc),
    ('xgb', xgb),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')

In [385]:
# Baseline: soft-voting ensemble of all models, 5-fold CV on the scaled features
cv = cross_val_score(estimator=voting_clf, X=X_train_scaled, y=Y_train, cv=5)
print(cv, cv.mean(), sep="\n")

In [405]:
# Fit the baseline soft-voting ensemble on all training rows and predict the test rows
voting_clf.fit(X_train_scaled, Y_train)
y_hat_base_vc = voting_clf.predict(X_test_scaled).astype(int)

# X_test_scaled only contains the test rows that survived the earlier dropna, so the ids
# must come from those same rows of all_data (same filter, same order). Using
# test.enrollee_id would supply the ids of every test row and the lengths would not match.
# NOTE(review): test enrollees removed by dropna therefore get no prediction in this file.
test_enrollee_ids = all_data.loc[all_data.train_test == 0, 'enrollee_id'].to_numpy()

basic_submission = {'enrollee_id': test_enrollee_ids, 'target': y_hat_base_vc}
base_submission = pd.DataFrame(data=basic_submission)
base_submission.to_csv('base_submission.csv', index=False)

In [404]:
X_test.head()

### Model Tuned Performance

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

In [None]:
# Simple performance reporting helper
def clf_performance(classifier, model_name):
    """Print the model name, then the best score and best parameters of a fitted search.

    ``classifier`` must expose ``best_score_`` and ``best_params_``.
    """
    report_lines = (
        model_name,
        f'Best Score: {classifier.best_score_}',
        f'Best Parameters: {classifier.best_params_}',
    )
    for report_line in report_lines:
        print(report_line)

In [None]:
# Exhaustive search over penalty type and regularisation strength for logistic regression
lr = LogisticRegression()
param_grid = dict(
    max_iter=[2000],
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)

clf_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_lr = clf_lr.fit(X_train_scaled, Y_train)
clf_performance(best_clf_lr, 'Logistic Regression')

In [None]:
# Exhaustive search over neighbourhood size, weighting, search algorithm and distance power
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    p=[1, 2],
)
clf_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_knn = clf_knn.fit(X_train_scaled, Y_train)
clf_performance(best_clf_knn, 'KNN')

In [None]:
# Exhaustive search over three kernel families; each kernel gets its own sub-grid
svc = SVC(probability=True)
rbf_grid = {'kernel': ['rbf'], 'gamma': [.1, .5, 1, 2, 5, 10], 'C': [.1, 1, 10, 100, 1000]}
linear_grid = {'kernel': ['linear'], 'C': [.1, 1, 10, 100, 1000]}
poly_grid = {'kernel': ['poly'], 'degree': [2, 3, 4, 5], 'C': [.1, 1, 10, 100, 1000]}
tuned_parameters = [rbf_grid, linear_grid, poly_grid]
param_grid = tuned_parameters
clf_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_svc = clf_svc.fit(X_train_scaled, Y_train)
clf_performance(best_clf_svc, 'SVC')

In [None]:
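#Because the total parameter space is so large, I used a randomized search to narrow down the parameters for the model. I took the best model from this and did a more granular search (next cell). The randomized search is disabled here: it is kept only as a string literal and does not run.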
"""
rf = RandomForestClassifier(random_state = 1)
param_grid =  {'n_estimators': [100,500,1000], 
                                  'bootstrap': [True,False],
                                  'max_depth': [3,5,10,20,50,75,100,None],
                                  'max_features': ['auto','sqrt'],
                                  'min_samples_leaf': [1,2,4,10],
                                  'min_samples_split': [2,5,10]}
                                  
clf_rf_rnd = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 100, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf_rnd = clf_rf_rnd.fit(X_train_scaled,y_train)
clf_performance(best_clf_rf_rnd,'Random Forest')"""

In [None]:
# Granular grid search around the region found by the earlier randomized search
rf = RandomForestClassifier(random_state = 1)
param_grid = {
    'n_estimators': [400, 450, 500, 550],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True],
    'max_depth': [15, 20, 25],
    # 'auto' was removed from this list: for classifiers it was only an alias of 'sqrt'
    # (so it duplicated those grid points) and scikit-learn 1.3 no longer accepts it.
    'max_features': ['sqrt', 10],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [2, 3],
}

clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train_scaled,Y_train)
clf_performance(best_clf_rf,'Random Forest')

In [None]:
# Refit the best forest from the grid search and chart its 20 most important features
best_rf = best_clf_rf.best_estimator_
best_rf.fit(X_train_scaled, Y_train)
feat_importances = pd.Series(data=best_rf.feature_importances_, index=X_train_scaled.columns)
top_importances = feat_importances.nlargest(20)
top_importances.plot(kind='barh')

In [None]:
"""xgb = XGBClassifier(random_state = 1)

param_grid = {
    'n_estimators': [20, 50, 100, 250, 500,1000],
    'colsample_bytree': [0.2, 0.5, 0.7, 0.8, 1],
    'max_depth': [2, 5, 10, 15, 20, 25, None],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2],
    'subsample': [0.5,0.6,0.7, 0.8, 0.9],
    'learning_rate':[.01,0.1,0.2,0.3,0.5, 0.7, 0.9],
    'gamma':[0,.01,.1,1,10,100],
    'min_child_weight':[0,.01,0.1,1,10,100],
    'sampling_method': ['uniform', 'gradient_based']
}

#clf_xgb = GridSearchCV(xgb, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
#best_clf_xgb = clf_xgb.fit(X_train_scaled,y_train)
#clf_performance(best_clf_xgb,'XGB')
clf_xgb_rnd = RandomizedSearchCV(xgb, param_distributions = param_grid, n_iter = 1000, cv = 5, verbose = True, n_jobs = -1)
best_clf_xgb_rnd = clf_xgb_rnd.fit(X_train_scaled,y_train)
clf_performance(best_clf_xgb_rnd,'XGB')"""

In [None]:
# Granular grid search for XGBoost; parameters with a single candidate are held fixed
xgb = XGBClassifier(random_state=1)

param_grid = dict(
    n_estimators=[450, 500, 550],
    colsample_bytree=[0.75, 0.8, 0.85],
    max_depth=[None],
    reg_alpha=[1],
    reg_lambda=[2, 5, 10],
    subsample=[0.55, 0.6, .65],
    learning_rate=[0.5],
    gamma=[.5, 1, 2],
    min_child_weight=[0.01],
    sampling_method=['uniform'],
)

clf_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_xgb = clf_xgb.fit(X_train_scaled, Y_train)
clf_performance(best_clf_xgb, 'XGB')

In [None]:
# Predict the test rows with the tuned XGBoost model
y_hat_xgb = best_clf_xgb.best_estimator_.predict(X_test_scaled).astype(int)

# This dataset identifies rows by enrollee_id and the label column is target; the test
# frame has no PassengerId column. The ids are taken from the test rows of all_data that
# survived the earlier dropna, i.e. the same rows (in the same order) as X_test_scaled.
test_enrollee_ids = all_data.loc[all_data.train_test == 0, 'enrollee_id'].to_numpy()

xgb_submission = {'enrollee_id': test_enrollee_ids, 'target': y_hat_xgb}
submission_xgb = pd.DataFrame(data=xgb_submission)
submission_xgb.to_csv('xgb_submission3.csv', index=False)

### Model Additional Ensemble Approaches

In [None]:
# Best estimator found by each grid search
best_lr = best_clf_lr.best_estimator_
best_knn = best_clf_knn.best_estimator_
best_svc = best_clf_svc.best_estimator_
best_rf = best_clf_rf.best_estimator_
best_xgb = best_clf_xgb.best_estimator_

# Ensembles built from the tuned models
voting_clf_hard = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc)], voting = 'hard')
voting_clf_soft = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc)], voting = 'soft')
voting_clf_all = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc), ('lr', best_lr)], voting = 'soft')
voting_clf_xgb = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc), ('xgb', best_xgb),('lr', best_lr)], voting = 'soft')

# Cross-validate every ensemble exactly once and derive the mean from those same fold
# scores (previously each CV was run a second time just to print the mean).
# The scaled features are used because every member was tuned on X_train_scaled.
ensembles = {
    'voting_clf_hard': voting_clf_hard,
    'voting_clf_soft': voting_clf_soft,
    'voting_clf_all': voting_clf_all,
    'voting_clf_xgb': voting_clf_xgb,
}
for ensemble_name, ensemble_model in ensembles.items():
    fold_scores = cross_val_score(ensemble_model, X_train_scaled, Y_train, cv=5)
    print(ensemble_name + ' :', fold_scores)
    print(ensemble_name + ' mean :', fold_scores.mean())
    # Blank line between ensembles, except after the last one
    if ensemble_name != 'voting_clf_xgb':
        print()

In [None]:
# In a soft voting classifier some models can be weighted more than others.
# A grid search explores different weightings (no new results here).
candidate_weights = [
    [1, 1, 1],
    [1, 2, 1],
    [1, 1, 2],
    [2, 1, 1],
    [2, 2, 1],
    [1, 2, 2],
    [2, 1, 2],
]
params = {'weights': candidate_weights}

vote_weight = GridSearchCV(estimator=voting_clf_soft, param_grid=params, cv=5, verbose=True, n_jobs=-1)
best_clf_weight = vote_weight.fit(X_train_scaled, Y_train)
clf_performance(best_clf_weight, 'VC Weights')
voting_clf_sub = best_clf_weight.best_estimator_.predict(X_test_scaled)

In [None]:
# Make predictions: fit every ensemble and the tuned forest on the full training set ...
for fitted_model in (voting_clf_hard, voting_clf_soft, voting_clf_all, voting_clf_xgb, best_rf):
    fitted_model.fit(X_train_scaled, Y_train)

# ... then predict the test rows with each of them (integer class labels)
hard_votes = voting_clf_hard.predict(X_test_scaled)
y_hat_vc_hard = hard_votes.astype(int)
forest_labels = best_rf.predict(X_test_scaled)
y_hat_rf = forest_labels.astype(int)
soft_votes = voting_clf_soft.predict(X_test_scaled)
y_hat_vc_soft = soft_votes.astype(int)
all_votes = voting_clf_all.predict(X_test_scaled)
y_hat_vc_all = all_votes.astype(int)
xgb_votes = voting_clf_xgb.predict(X_test_scaled)
y_hat_vc_xgb = xgb_votes.astype(int)

In [None]:
# Convert the predictions to submission dataframes.
# This dataset identifies rows by enrollee_id and the label column is target; the test
# frame has no PassengerId column. The ids are taken from the test rows of all_data that
# survived the earlier dropna, i.e. the same rows (in the same order) as X_test_scaled.
test_enrollee_ids = all_data.loc[all_data.train_test == 0, 'enrollee_id'].to_numpy()

final_data = {'enrollee_id': test_enrollee_ids, 'target': y_hat_rf}
submission = pd.DataFrame(data=final_data)

final_data_2 = {'enrollee_id': test_enrollee_ids, 'target': y_hat_vc_hard}
submission_2 = pd.DataFrame(data=final_data_2)

final_data_3 = {'enrollee_id': test_enrollee_ids, 'target': y_hat_vc_soft}
submission_3 = pd.DataFrame(data=final_data_3)

final_data_4 = {'enrollee_id': test_enrollee_ids, 'target': y_hat_vc_all}
submission_4 = pd.DataFrame(data=final_data_4)

final_data_5 = {'enrollee_id': test_enrollee_ids, 'target': y_hat_vc_xgb}
submission_5 = pd.DataFrame(data=final_data_5)

# Side-by-side view of every model's predictions. The Survived_* column labels are kept
# as they are because the difference-tracking cell reads them by these names.
final_data_comp = {'enrollee_id': test_enrollee_ids, 'Survived_vc_hard': y_hat_vc_hard, 'Survived_rf': y_hat_rf, 'Survived_vc_soft' : y_hat_vc_soft, 'Survived_vc_all' : y_hat_vc_all,  'Survived_vc_xgb' : y_hat_vc_xgb}
comparison = pd.DataFrame(data=final_data_comp)

In [None]:
# Track differences between outputs: 1 where two models disagree on a row, 0 where they agree
hard_votes_column = comparison['Survived_vc_hard']
comparison['difference_rf_vc_hard'] = hard_votes_column.ne(comparison['Survived_rf']).astype(int)
comparison['difference_soft_hard'] = hard_votes_column.ne(comparison['Survived_vc_soft']).astype(int)
comparison['difference_hard_all'] = comparison['Survived_vc_all'].ne(hard_votes_column).astype(int)

In [None]:
comparison.difference_hard_all.value_counts()

In [None]:
# Prepare submission files: one CSV per model, written without the index column
submission_files = [
    (submission, 'submission_rf.csv'),
    (submission_2, 'submission_vc_hard.csv'),
    (submission_3, 'submission_vc_soft.csv'),
    (submission_4, 'submission_vc_all.csv'),
    (submission_5, 'submission_vc_xgb2.csv'),
]
for submission_frame, csv_name in submission_files:
    submission_frame.to_csv(csv_name, index=False)