In [71]:
import pandas as pd 
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  make_scorer
from sklearn.svm import SVC

from sklearn import datasets
from numpy.testing import assert_almost_equal

One important function of HR is to retain talent. High employee attrition in a company can result to financial cost, lower morale, decrease in producvity  and more. Therefore it's imperative for HR to be able to maintain a healthy employee attrition rate. With this, HR wants you to build a model that will predict if an employee will attrite (leave the company) or stay. By having this kind of model in the company, HR would be able to proactively retain talent and have a healthier attrition rate. 

They have provided you with a dataset which contains the following columns. Source from <a href="https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists">Kaggle</a>: 

- enrollee_id : Unique ID for candidate
- city: City code
- city_ development _index : Developement index of the city (scaled)
- gender: Gender of candidate
- relevent_experience: Relevant experience of candidate
- enrolled_university: Type of University course enrolled if any
- education_level: Education level of candidate
- major_discipline :Education major discipline of candidate
- experience: Candidate total experience in years
- company_size: No of employees in current employer's company
- company_type : Type of current employer
- lastnewjob: Difference in years between previous job and current job
- training_hours: training hours completed
- target: 0 – Not looking for job change, 1 – Looking for a job change

In [72]:
hr_train_df = pd.read_csv("hr_job_datasci_train.csv")
hr_train_df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


The cell below shows you the columns with missing values, # of missing values and the unique values for each column.

In [73]:
columns_with_null = hr_train_df.columns[hr_train_df.isna().any()]
display(hr_train_df[columns_with_null].isnull().sum()/hr_train_df.shape[0])
display(hr_train_df[columns_with_null])
for c in columns_with_null:
    print(f"{c} {hr_train_df[c].unique()}")

gender                 0.235306
enrolled_university    0.020148
education_level        0.024011
major_discipline       0.146832
experience             0.003393
company_size           0.309949
company_type           0.320493
last_new_job           0.022080
dtype: float64

Unnamed: 0,gender,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,Male,no_enrollment,Graduate,STEM,>20,,,1
1,Male,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4
2,,Full time course,Graduate,STEM,5,,,never
3,,,Graduate,Business Degree,<1,,Pvt Ltd,never
4,Male,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4
...,...,...,...,...,...,...,...,...
19153,Male,no_enrollment,Graduate,Humanities,14,,,1
19154,Male,no_enrollment,Graduate,STEM,14,,,4
19155,Male,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4
19156,Male,no_enrollment,High School,,<1,500-999,Pvt Ltd,2


gender ['Male' nan 'Female' 'Other']
enrolled_university ['no_enrollment' 'Full time course' nan 'Part time course']
education_level ['Graduate' 'Masters' 'High School' nan 'Phd' 'Primary School']
major_discipline ['STEM' 'Business Degree' nan 'Arts' 'Humanities' 'No Major' 'Other']
experience ['>20' '15' '5' '<1' '11' '13' '7' '17' '2' '16' '1' '4' '10' '14' '18'
 '19' '12' '3' '6' '9' '8' '20' nan]
company_size [nan '50-99' '<10' '10000+' '5000-9999' '1000-4999' '10/49' '100-500'
 '500-999']
company_type [nan 'Pvt Ltd' 'Funded Startup' 'Early Stage Startup' 'Other'
 'Public Sector' 'NGO']
last_new_job ['1' '>4' 'never' '4' '3' '2' nan]


## Fill the nan values with an 'unknown' tag (2pts)

In [74]:
model_data = hr_train_df.dropna(axis=0)
model_data

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
7,402,city_46,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,<10,Pvt Ltd,>4,18,1.0
8,27107,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,50-99,Pvt Ltd,1,46,1.0
11,23853,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,5,5000-9999,Pvt Ltd,1,108,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19147,21319,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,1,100-500,Pvt Ltd,1,52,1.0
19149,251,city_103,0.920,Male,Has relevent experience,no_enrollment,Masters,STEM,9,50-99,Pvt Ltd,1,36,1.0
19150,32313,city_160,0.920,Female,Has relevent experience,no_enrollment,Graduate,STEM,10,100-500,Public Sector,3,23,0.0
19152,29754,city_103,0.920,Female,Has relevent experience,no_enrollment,Graduate,Humanities,7,10/49,Funded Startup,1,25,0.0


In [75]:
model_data2 = hr_train_df[columns_with_null].copy()
model_data2['gender'] = hr_train_df[columns_with_null]['gender'].fillna('Unknown')
model_data2['enrolled_university'] = hr_train_df[columns_with_null]['enrolled_university'].fillna('Unknown')
model_data2['education_level'] = hr_train_df[columns_with_null]['education_level'].fillna('Unknown')
model_data2['major_discipline'] = hr_train_df[columns_with_null]['major_discipline'].fillna('Unknown')
model_data2['experience'] = hr_train_df[columns_with_null]['experience'].fillna('Unknown')
model_data2['company_size'] = hr_train_df[columns_with_null]['company_size'].fillna('Unknown')
model_data2['company_type'] = hr_train_df[columns_with_null]['company_type'].fillna('Unknown')
model_data2['last_new_job'] = hr_train_df[columns_with_null]['last_new_job'].fillna('Unknown')
model_data2

Unnamed: 0,gender,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,Male,no_enrollment,Graduate,STEM,>20,Unknown,Unknown,1
1,Male,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4
2,Unknown,Full time course,Graduate,STEM,5,Unknown,Unknown,never
3,Unknown,Unknown,Graduate,Business Degree,<1,Unknown,Pvt Ltd,never
4,Male,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4
...,...,...,...,...,...,...,...,...
19153,Male,no_enrollment,Graduate,Humanities,14,Unknown,Unknown,1
19154,Male,no_enrollment,Graduate,STEM,14,Unknown,Unknown,4
19155,Male,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4
19156,Male,no_enrollment,High School,Unknown,<1,500-999,Pvt Ltd,2


## Prepare data for modeling by OneHotEncoding the categorical variables and removing the enrolee_id as part of the data (3pts)

In [76]:
model_data.drop("enrollee_id", inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.drop("enrollee_id", inplace=True, axis=1)


In [95]:
multi_categorical_features = ['gender', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']
enc = OneHotEncoder(handle_unknown='ignore')
res = enc.fit_transform(model_data2[multi_categorical_features])
res_df = pd.DataFrame(res.todense(), columns=enc.get_feature_names())

model_data2.drop(multi_categorical_features, axis=1, inplace=True)
model_data2 = pd.concat([model_data, res_df], axis=1)

model_data2.head()



Unnamed: 0,city,city_development_index,relevent_experience,training_hours,target,x0_Female,x0_Male,x0_Other,x1_Full time course,x1_Part time course,...,x6_Public Sector,x6_Pvt Ltd,x6_nan,x7_1,x7_2,x7_3,x7_4,x7_>4,x7_never,x7_nan
1,city_40,0.776,No relevent experience,47.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,city_162,0.767,Has relevent experience,8.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,city_46,0.762,Has relevent experience,18.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,city_103,0.92,Has relevent experience,46.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11,city_103,0.92,Has relevent experience,108.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Build an SVM classifier that uses gridsearch (3pts)


In [83]:
X = model_data.drop(labels=['target'], axis=1)
y = model_data['target']

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
print("Train ",X_train.shape)
print("Test ",X_test.shape)

Train  (6268, 12)
Test  (2687, 12)


In [100]:
param_grid = {
    'C': [1, 10, 100, 1000],
    'gamma': [1, 10, 100, 1000],
    'kernel': ['rbf']
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
}

clf_grid = GridSearchCV(SVC(), param_grid, scoring=scorers, cv=5, refit='precision_score', verbose = 3)
clf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (test=nan) total time=   0.0s
[CV 2/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (test=nan) total time=   0.0s
[CV 3/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (test=nan) total time=   0.0s
[CV 4/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (test=nan) total time=   0.0s
[CV 5/5] END C=1, gamma=1, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (test=nan) total time=   0.0s
[CV 1/5] END C=1, gamma=10, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (test=nan) total time=   0.0s
[CV 2/5] END C=1, gamma=10, kernel=rbf; accuracy_score: (test=nan) precision_score: (test=nan) recall_score: (

80 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ajlargonza/anaconda3/envs/data100/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ajlargonza/anaconda3/envs/data100/lib/python3.10/site-packages/sklearn/svm/_base.py", line 190, in fit
    X, y = self._validate_data(
  File "/Users/ajlargonza/anaconda3/envs/data100/lib/python3.10/site-packages/sklearn/base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/ajlargonza/anaconda3/envs/data100/lib/python3.10/site-pac

ValueError: could not convert string to float: 'city_23'

## Calculate accuracy, precision, recall, f1-score of the best model based on gridsearch (2pts)

In [90]:
y_pred = clf_grid.best_estimator_.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

ValueError: could not convert string to float: 'city_16'

## Use the Recursive Feature Elimination Method with Cross Validation for feature selection (3pts)

In [91]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

## Plot the Accuracy VS # of Features (2pts)

## From the plot, what is the optimal # of features, what are these features? (2pts)

## Build an SVM classifier that uses gridsearch and the selected features from RFE (3pts)

## Compare Results of Baseline and using the Feature Selection Method, why do you think one performed better over the other? What would be your recommendation to improve the model performance? (5pts)