In [19]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import  make_scorer
from sklearn import datasets
from numpy.testing import assert_almost_equal

from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

One important function of HR is to retain talent. High employee attrition in a company can result to financial cost, lower morale, decrease in producvity  and more. Therefore it's imperative for HR to be able to maintain a healthy employee attrition rate. With this, HR wants you to build a model that will predict if an employee will attrite (leave the company) or stay. By having this kind of model in the company, HR would be able to proactively retain talent and have a healthier attrition rate. 

They have provided you with a dataset which contains the following columns. Source from <a href="https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists">Kaggle</a>: 

- enrollee_id : Unique ID for candidate
- city: City code
- city_ development _index : Developement index of the city (scaled)
- gender: Gender of candidate
- relevent_experience: Relevant experience of candidate
- enrolled_university: Type of University course enrolled if any
- education_level: Education level of candidate
- major_discipline :Education major discipline of candidate
- experience: Candidate total experience in years
- company_size: No of employees in current employer's company
- company_type : Type of current employer
- lastnewjob: Difference in years between previous job and current job
- training_hours: training hours completed
- target: 0 – Not looking for job change, 1 – Looking for a job change

In [4]:
hr_train_df = pd.read_csv("hr_job_datasci_train.csv")
hr_train_df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


The cell below shows you the columns with missing values, # of missing values and the unique values for each column.

In [5]:
# columns_with_null = hr_train_df.columns[hr_train_df.isna().any()]
# display(hr_train_df[columns_with_null].isnull().sum()/hr_train_df.shape[0])
# display(hr_train_df[columns_with_null])
# for c in columns_with_null:
#     print(f"{c} {hr_train_df[c].unique()}")

## Fill the nan values with an 'unknown' tag (2pts)

In [6]:
hr_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [7]:
model_data = hr_train_df.copy()
model_data['gender'] = model_data['gender'].fillna('unknown')
model_data['enrolled_university'] = model_data['enrolled_university'].fillna('unknown')
model_data['major_discipline'] = model_data['major_discipline'].fillna('unknown')
model_data['education_level'] = model_data['education_level'].fillna('unknown')
model_data['experience'] = model_data['experience'].fillna('unknown')
model_data['company_size'] = model_data['company_size'].fillna('unknown')
model_data['company_type'] = model_data['company_type'].fillna('unknown')
model_data['last_new_job'] = model_data['last_new_job'].fillna('unknown')
model_data.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,unknown,unknown,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,unknown,No relevent experience,Full time course,Graduate,STEM,5,unknown,unknown,never,83,0.0
3,33241,city_115,0.789,unknown,No relevent experience,unknown,Graduate,Business Degree,<1,unknown,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


## Prepare data for modeling by OneHotEncoding the categorical variables and removing the enrolee_id as part of the data (3pts)

In [8]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  19158 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     19158 non-null  object 
 6   education_level         19158 non-null  object 
 7   major_discipline        19158 non-null  object 
 8   experience              19158 non-null  object 
 9   company_size            19158 non-null  object 
 10  company_type            19158 non-null  object 
 11  last_new_job            19158 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [9]:
model_data.drop("enrollee_id", inplace=True, axis=1)

In [10]:
multi_categorical_features = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline',
                              'experience', 'company_size', 'company_type', 'last_new_job']

enc = OneHotEncoder(handle_unknown='ignore')
res = enc.fit_transform(model_data[multi_categorical_features])
res_df = pd.DataFrame(res.todense(), columns=enc.get_feature_names_out())

model_data.drop(multi_categorical_features, axis=1, inplace=True)
model_data = pd.concat([model_data, res_df], axis=1)

model_data.head()

Unnamed: 0,city_development_index,training_hours,target,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,...,company_type_Public Sector,company_type_Pvt Ltd,company_type_unknown,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never,last_new_job_unknown
0,0.92,36,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.776,47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.624,83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.789,52,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.767,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Build an SVM classifier that uses gridsearch (3pts)


In [11]:
X = model_data.drop(labels=['target'], axis=1)
y = model_data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train ", X_train.shape)
print("Test ", X_test.shape)

Train  (15326, 194)
Test  (3832, 194)


In [12]:
param_grid = {
    'C': [1, 10, 100],
    'gamma': [1, 10, 100],
    'kernel': ['rbf']
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
}

clf_grid = GridSearchCV(SVC(), param_grid, scoring=scorers, cv=5, refit='precision_score', verbose = 3)
clf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=0.750) f1: (test=0.005) precision_score: (test=0.500) recall_score: (test=0.003) total time= 1.4min
[CV 2/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=0.752) f1: (test=0.010) precision_score: (test=0.800) recall_score: (test=0.005) total time= 1.3min
[CV 3/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=0.751) f1: (test=0.003) precision_score: (test=1.000) recall_score: (test=0.001) total time= 1.3min
[CV 4/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=0.751) f1: (test=0.010) precision_score: (test=0.667) recall_score: (test=0.005) total time= 1.3min
[CV 5/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=0.750) f1: (test=0.003) precision_score: (test=0.333) recall_score: (test=0.001) total time= 1.3min
[CV 1/5] END C=1, gamma=10, kernel=rbf; accuracy_score: (test=0.750) f1: (test=0.005) precision_score: (test=0.500) recall_score: (test

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 10, 100], 'gamma': [1, 10, 100],
                         'kernel': ['rbf']},
             refit='precision_score',
             scoring={'accuracy_score': make_scorer(accuracy_score),
                      'f1': make_scorer(f1_score),
                      'precision_score': make_scorer(precision_score),
                      'recall_score': make_scorer(recall_score)},
             verbose=3)

## Calculate accuracy, precision, recall, f1-score of the best model based on gridsearch (2pts)

In [13]:
y_pred = clf_grid.best_estimator_.predict(X_test)
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.45454545454545453
0.005235602094240838
0.010351966873706004
0.7505219206680585


## Use the Recursive Feature Elimination Method with Cross Validation for feature selection (3pts)

In [None]:
rfecv = RFECV(estimator=estimator, scoring='accuracy', min_features_to_select=3)
rfecv.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
rfecv.grid_scores_

## Plot the Accuracy VS # of Features (2pts)

In [None]:
list(range(1,len(rfecv.grid_scores_)+1))

In [None]:
ax = sns.lineplot(x=list(range(1,len(rfecv.grid_scores_)+1)), y=list(rfecv.grid_scores_))
ax.set_title("Accuracy vs # of Features")
ax.set_xlabel("# of Features")
ax.set_xlabel("Accuracy")
ax.set_ylim(.85, .88)

## From the plot, what is the optimal # of features, what are these features? (2pts)

## Build an SVM classifier that uses gridsearch and the selected features from RFE (3pts)

## Compare Results of Baseline and using the Feature Selection Method, why do you think one performed better over the other? What would be your recommendation to improve the model performance? (5pts)