In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report


from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Link to data

[IBM HR Data](https://www.kaggle.com/datasets/rushikeshghate/capstone-projectibm-employee-attrition-prediction?select=IBM+HR+Data+new.csv)

This dataset has 23436 rows and 37 columns describing different employee profiles (department, pay rate, whether they are still employed at the company, ...). It contains both numeric and categorical variables. It is important to note that this is a hypothetical dataset created by IBM data scientists.

# Graph formatting

In [2]:
# Font sizes (in points) used for every figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 15, 20, 25

# Each element gets its own size so that, for example, a figure title can be
# larger than the axis labels.
plt.rcParams.update({
    'font.size': SMALL_SIZE,          # default text size
    'axes.titlesize': SMALL_SIZE,     # axes title
    'axes.labelsize': MEDIUM_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,    # x tick labels
    'ytick.labelsize': SMALL_SIZE,    # y tick labels
    # 'legend.fontsize': SMALL_SIZE,  # legend size intentionally left at its default
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})


# Data loading / cleaning

In [3]:
# Location of the raw CSV on the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/Project 3 : Math 3439/data/IBM HR Data new.csv'

df = pd.read_csv(DATA_PATH)
df.head()


  df = pd.read_csv('/content/drive/MyDrive/Project 3 : Math 3439/data/IBM HR Data new.csv')


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Employee Source
0,41.0,Voluntary Resignation,Travel_Rarely,1102.0,Sales,1,2.0,Life Sciences,1,1,...,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,Referral
1,41.0,Voluntary Resignation,Travel_Rarely,1102.0,Sales,1,2.0,Life Sciences,1,1,...,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,Referral
2,41.0,Voluntary Resignation,Travel_Rarely,1102.0,Sales,1,2.0,Life Sciences,1,7,...,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,Referral
3,41.0,Voluntary Resignation,Travel_Rarely,1102.0,Sales,1,2.0,Life Sciences,1,8,...,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,Referral
4,41.0,Voluntary Resignation,Travel_Rarely,1102.0,Sales,1,2.0,Life Sciences,1,9,...,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,Referral


In [4]:
df['Attrition'].unique()

array(['Voluntary Resignation', 'Current employee', nan], dtype=object)

In [5]:
df['EnvironmentSatisfaction'].unique()

array([2.00000e+00, 4.00000e+00, 1.00000e+00, 3.00000e+00,         nan,
       1.27249e+05, 1.29588e+05])

In [6]:
df.isna().sum()

Age                          3
Attrition                   13
BusinessTravel               8
DailyRate                   12
Department                  11
DistanceFromHome             9
Education                   12
EducationField               9
EmployeeCount                5
EmployeeNumber               1
Application ID               3
EnvironmentSatisfaction      9
Gender                      10
HourlyRate                   9
JobInvolvement               9
JobLevel                     7
JobRole                      9
JobSatisfaction              9
MaritalStatus               11
MonthlyIncome               13
MonthlyRate                 11
NumCompaniesWorked           9
Over18                      10
OverTime                    12
PercentSalaryHike           14
PerformanceRating           10
RelationshipSatisfaction     8
StandardHours               10
StockOptionLevel             9
TotalWorkingYears            8
TrainingTimesLastYear       11
WorkLifeBalance             10
YearsAtC

## Dropping NAs and duplicates

In [7]:
df = df.dropna()

In [8]:
df.shape[0]

23204

In [9]:
df.drop_duplicates().shape[0]

23190

In [10]:
(df.shape[0] - df.drop_duplicates().shape[0] ) 

14

In [11]:
df = df.drop_duplicates()

In [12]:
df.dtypes

Age                         float64
Attrition                    object
BusinessTravel               object
DailyRate                   float64
Department                   object
DistanceFromHome             object
Education                   float64
EducationField               object
EmployeeCount                object
EmployeeNumber               object
Application ID               object
EnvironmentSatisfaction     float64
Gender                       object
HourlyRate                   object
JobInvolvement              float64
JobLevel                    float64
JobRole                      object
JobSatisfaction              object
MaritalStatus                object
MonthlyIncome                object
MonthlyRate                 float64
NumCompaniesWorked          float64
Over18                       object
OverTime                     object
PercentSalaryHike            object
PerformanceRating           float64
RelationshipSatisfaction    float64
StandardHours               

## Drop columns

In [13]:
df = df.drop(columns=['Application ID', 'EmployeeNumber', 'Over18', 'EmployeeCount'])

## CleanUp / Encoding

In [14]:
# Remove the rows whose Gender is the string '1' or '2'
# (presumably data-entry errors -- TODO confirm against the raw file).
df = df[~df['Gender'].isin(['1', '2'])]

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# Detach from the filtered frame produced by the previous cell so the column
# assignments below write into an independent object.
df = df.copy()

# Columns that were read as text (dtype object) but hold numbers.
# errors='coerce' turns every unparseable entry into NaN.
numeric_as_text_cols = ['DistanceFromHome', 'HourlyRate', 'JobSatisfaction',
                        'MonthlyIncome', 'PercentSalaryHike']
for col in numeric_as_text_cols:
  df[col] = pd.to_numeric(df[col], errors='coerce')

# Rows whose value could not be parsed are dropped here, because the
# scikit-learn estimators fitted later in the notebook reject NaN inputs.
df = df.dropna(subset=numeric_as_text_cols)

# Label-encode the binary columns. Each column gets its OWN encoder so that
# inverse_transform() can decode it later. (Gender was previously fitted with
# OverTime_le, which left Gender_le unfitted and overwrote the OverTime mapping.)
attrition_le = LabelEncoder()
df['Attrition'] = attrition_le.fit_transform(df['Attrition'])
OverTime_le = LabelEncoder()
df['OverTime'] = OverTime_le.fit_transform(df['OverTime'])
Gender_le = LabelEncoder()
df['Gender'] = Gender_le.fit_transform(df['Gender'])

# One-hot encode the multi-category columns. get_dummies(columns=...) removes
# each source column and appends '<column>_<value>' indicator columns in the
# order given here.
one_hot_cols = ['Department', 'EducationField', 'Employee Source',
                'MaritalStatus', 'JobRole', 'BusinessTravel']
df = pd.get_dummies(df, columns=one_hot_cols)

In [17]:
print(list(attrition_le.inverse_transform(df['Attrition'].tail())))
print(list(df['Attrition'].tail()))


['Current employee', 'Current employee', 'Current employee', 'Current employee', 'Current employee']
[0, 0, 0, 0, 0]


In [18]:
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely
0,41.0,1,1102.0,1.0,2.0,2.0,0,94,3.0,2.0,...,0,0,0,0,0,1,0,0,0,1
1,41.0,1,1102.0,1.0,2.0,2.0,0,94,3.0,2.0,...,0,0,0,0,0,1,0,0,0,1
2,41.0,1,1102.0,1.0,2.0,2.0,0,94,3.0,2.0,...,0,0,0,0,0,1,0,0,0,1
3,41.0,1,1102.0,1.0,2.0,2.0,0,94,3.0,2.0,...,0,0,0,0,0,1,0,0,0,1
4,41.0,1,1102.0,1.0,2.0,2.0,0,94,3.0,2.0,...,0,0,0,0,0,1,0,0,0,1


In [19]:
df.select_dtypes('object')

0
1
2
3
4
...
23428
23429
23430
23431
23432


# Split data

In [20]:
from sklearn.model_selection import train_test_split

# Full feature matrix / target; X.columns is reused later for plotting.
X = df.drop(columns='Attrition')
y = df['Attrition']

# 70/30 split. The fixed seed makes the split, and every metric computed from
# it, reproducible after Restart & Run All.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

X_train = train_df.drop(columns='Attrition')
y_train = train_df['Attrition']
X_test = test_df.drop(columns='Attrition')
y_test = test_df['Attrition']

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Baseline model fitted on the (unbalanced) training split.
logr = LogisticRegression(max_iter=30000)
logr.fit(X_train, y_train)

preds = logr.predict(X_test)
# Predicted probability of class 1 (resigned); computed once and reused below.
logr_proba_preds = logr.predict_proba(X_test)[:, 1]
# assign() returns a new frame, so nothing is written into the split frame in
# place (an in-place write can trigger pandas' SettingWithCopyWarning).
test_df = test_df.assign(logr_proba_preds=logr_proba_preds)

# normalize='true' divides each row by the number of samples of that true class.
cm = confusion_matrix(y_test, preds, normalize='true')
print('Confusion Matrix:')
print(cm)
dfacc = (y_test == preds).mean()
print(f'Accuracy = {dfacc}')
auroc = roc_auc_score(y_test, logr_proba_preds)
print(f'AUROC = {auroc}')

# Row 0 = true class 0 (current employee), row 1 = true class 1 (resigned).
(TN, FP), (FN, TP) = cm
tpr = TP/(TP+FN)
tnr = TN/(TN+FP)
fpr = FP/(FP+TN)
fnr = FN/(TP+FN)
print(f'TNR: {tnr}')
print(f'TPR: {tpr}')
print(f'FPR: {fpr}')
print(f'FNR: {fnr}')

# Distribution of the predicted probabilities, one histogram per true class.
plt.figure(figsize=(12, 8))
for attrition_value, colour, label in [(0, 'red', 'Current employees'), (1, 'blue', 'resigned')]:
  plt.hist(test_df.loc[test_df['Attrition'] == attrition_value, 'logr_proba_preds'],
           color=colour, label=label, alpha=0.75)
plt.xlabel('Predicted probability of resignation')
plt.ylabel('Number of employees')
plt.legend()
plt.show()

NOTE: 
- 0 = Current employee
- 1 = Resigned

% of resignations in our train dataset

In [None]:
resigned = (train_df[train_df['Attrition'] == 1].shape[0] / train_df.shape[0]) * 100
print(resigned)

In [None]:
train_df[train_df['Attrition'] == 0].shape[0]

## Balancing attrition

In [None]:
#take df and balance by attrition
def balanceDf(dfToBalance, random_state=None):
  """Return a frame with the same number of rows for Attrition == 1 and == 0.

  The larger class is randomly down-sampled to the size of the smaller one.
  The Attrition == 1 rows come first in the result, followed by the
  Attrition == 0 rows.

  Parameters
  ----------
  dfToBalance : pandas.DataFrame
      Frame with an 'Attrition' column holding 0 / 1.
  random_state : int or None, optional
      Seed forwarded to DataFrame.sample. The default (None) keeps the
      sampling unseeded.

  Returns
  -------
  pandas.DataFrame
      The balanced rows; dfToBalance itself is not modified.
  """
  dfResigned = dfToBalance[dfToBalance['Attrition'] == 1]
  dfCurrent = dfToBalance[dfToBalance['Attrition'] == 0]

  # Size of the smaller class. Sampling more rows than a class holds would
  # raise, which is what happened whenever class 0 was the minority.
  rows = min(dfResigned.shape[0], dfCurrent.shape[0])

  if dfResigned.shape[0] > rows:
    dfResigned = dfResigned.sample(n=rows, random_state=random_state)
  dfCurrent = dfCurrent.sample(n=rows, random_state=random_state)

  assert dfResigned.shape[0] == dfCurrent.shape[0]

  dfBalanced = pd.concat([dfResigned, dfCurrent], axis=0)
  return dfBalanced

In [None]:
#balance df
train_df_blanced = balanceDf(train_df)
# errors='ignore': the probability column only exists on test_df when the
# unbalanced logistic-regression cell was run before this one.
test_df_blanced = balanceDf(test_df).drop(columns='logr_proba_preds', errors='ignore')

#Set X y
X_train = train_df_blanced.drop(columns='Attrition')
y_train = train_df_blanced['Attrition']
X_test = test_df_blanced.drop(columns='Attrition')
y_test = test_df_blanced['Attrition']

#Logistic regression
logr = LogisticRegression(max_iter=30000)
logr.fit(X_train, y_train)
preds = logr.predict(X_test)
# Predicted probability of class 1 (resigned); computed once and reused below.
logr_proba_preds = logr.predict_proba(X_test)[:, 1]
test_df_blanced = test_df_blanced.assign(logr_proba_preds=logr_proba_preds)

# Model Analysis
# normalize='true' divides each row by the number of samples of that true class.
cm = confusion_matrix(y_test, preds, normalize='true')
print('Confusion Matrix:')
print(cm)
dfacc = (y_test == preds).mean()
print(f'Accuracy = {dfacc}')
auroc = roc_auc_score(y_test, logr_proba_preds)
print(f'AUROC = {auroc}')

# Row 0 = true class 0 (current employee), row 1 = true class 1 (resigned).
(TN, FP), (FN, TP) = cm
tpr = TP/(TP+FN)
tnr = TN/(TN+FP)
fpr = FP/(FP+TN)
fnr = FN/(TP+FN)
print(f'TNR: {tnr}')
print(f'TPR: {tpr}')
print(f'FPR: {fpr}')
print(f'FNR: {fnr}')

# Distribution of the predicted probabilities, one histogram per true class.
plt.figure(figsize=(12, 8))
for attrition_value, colour, label in [(0, 'red', 'Current employees'), (1, 'blue', 'resigned')]:
  plt.hist(test_df_blanced.loc[test_df_blanced['Attrition'] == attrition_value, 'logr_proba_preds'],
           color=colour, label=label, alpha=0.75)
plt.xlabel('Predicted probability of resignation')
plt.ylabel('Number of employees')
plt.legend()
plt.show()

print(classification_report(y_test, preds))

In [None]:
# Bar chart of the logistic-regression coefficients, one bar per feature.
# The labels come from X_train (the matrix the model was fitted on) rather
# than the global X, which a later cell rebuilds from a modified df and could
# then disagree with logr.coef_ in length.
plt.figure(figsize=(48, 48))
plt.bar(x=X_train.columns, height=logr.coef_[0])  # coef_ is 2-D; row 0 holds the coefficients
plt.xticks(rotation=45)  # Rotate the labels so you can read them
plt.ylabel('Coefficient')
plt.show()

# Simple analysis of attrition vs continuous variables

In [None]:
df.select_dtypes('float64')

In [None]:
# Every float64 column is plotted.
dfplots = df.select_dtypes('float64')
columnsall = dfplots.columns.tolist()

# Split the employees by outcome: 1 = resigned, 0 = current employee.
dfLeft = df[df['Attrition'] == 1]
dfStayed = df[df['Attrition'] == 0]

# One figure per column, with the two groups side by side.
for col in columnsall:
  fig, ax = plt.subplots()
  ax.boxplot([dfLeft[col], dfStayed[col]])
  ax.set_title(col)
  plt.xticks([1, 2], ['left', 'stayed'])
  plt.show()

## Investigating job roles

In [None]:
filter_col = [col for col in df if col.startswith('JobRole')]
filter_col

for col in filter_col:
  print(col)

In [None]:

dfLeft = df[df['Attrition'] == 1]
dfStayed = df[df['Attrition'] == 0]

# One-hot job-role columns all share the 'JobRole' prefix.
filter_col = [name for name in df.columns if name.startswith('JobRole')]

resigned_mask = df['Attrition'] == 1
for col in filter_col:
  in_role = df[col] == 1
  # Share of the employees in this role who resigned.
  left = int((in_role & resigned_mask).sum()) / int(in_role.sum())
  print(f'% of {col} who left: {left*100:.2f}')

## Investigating Job level

In [None]:

filter_col = df['JobLevel'].unique()

resigned_mask = df['Attrition'] == 1
for col in filter_col:
  at_level = df['JobLevel'] == col
  # Share of the employees at this job level who resigned.
  left = int((at_level & resigned_mask).sum()) / int(at_level.sum())
  print(f'% of {col} who left: {left*100:.2f}')

## Combining sales representative and joblevel

In [None]:
filter_col = df['JobLevel'].unique()

is_sales_rep = df['JobRole_Sales Representative'] == 1
for col in filter_col:
  # Share of all sales representatives who sit at this job level.
  left = int(((df['JobLevel'] == col) & is_sales_rep).sum()) / int(is_sales_rep.sum())
  print(f'% of Job level {col} in sales rep: {left*100:.2f}')

In [None]:
filter_col = df['JobLevel'].unique()

# Sales representatives who resigned; the count does not depend on the job
# level, so it is computed once before the loop.
quit_sales_rep = (df['JobRole_Sales Representative'] == 1) & (df['Attrition'] == 1)
left_total = int(quit_sales_rep.sum())

for col in filter_col:
  if left_total > 0 :
    left = int(((df['JobLevel'] == col) & quit_sales_rep).sum()) / left_total
    print(f'Amongst all the sales rep who quit, {left*100:.2f}% had a job level of {col} ')

## Combining high attrition job role with job level and looking at attrition for these micro-categories

In [None]:
def combiningJLandJR(data=None, min_role_attrition=0.12, min_level_share=0.5):
  """Print the dominant job level among leavers of each high-attrition job role.

  For every one-hot 'JobRole...' column whose attrition rate is strictly above
  ``min_role_attrition``, print each JobLevel that accounts for at least
  ``min_level_share`` of the employees in that role who resigned.

  Parameters
  ----------
  data : pandas.DataFrame or None, optional
      Frame with 'Attrition' (0/1), 'JobLevel' and one-hot 'JobRole...'
      columns. Defaults to the notebook-level ``df``.
  min_role_attrition : float, optional
      A role is considered only if (leavers / members) exceeds this value.
  min_level_share : float, optional
      A job level is reported only if its share of the role's leavers is at
      least this value.

  Returns
  -------
  None
      The findings are printed.
  """
  if data is None:
    data = df  # notebook-level cleaned frame

  resigned = data['Attrition'] == 1
  role_cols = [col for col in data if col.startswith('JobRole')]
  job_levels = data['JobLevel'].unique()

  for jobrole in role_cols:
    in_role = data[jobrole] == 1
    role_size = int(in_role.sum())
    # Leavers of this role; independent of the job level, so computed once per role.
    left_total = int((in_role & resigned).sum())
    # An empty role or a role without leavers can never pass the thresholds;
    # skipping also avoids dividing by zero.
    if role_size == 0 or left_total == 0:
      continue
    if left_total / role_size <= min_role_attrition:
      continue

    for jl in job_levels:
      left_jr_jl = int(((data['JobLevel'] == jl) & in_role & resigned).sum()) / left_total
      if left_jr_jl >= min_level_share:
        print(f'Amongst all the {jobrole} who quit, {left_jr_jl*100:.2f}% had a job level of {jl}')

In [None]:
combiningJLandJR()

## Combining high attrition role and job level

In [None]:
# Work on a copy. A plain `dfCombinedBalanced = df` only creates a second name
# for the same object, so every column added here would also show up in df and
# in all later analyses built from df.
dfCombinedBalanced = df.copy()

# (JobLevel, one-hot JobRole column) combinations to flag.
# Presumably taken from the combiningJLandJR() output -- TODO confirm.
high_attrition_pairs = [
  (2.0, 'JobRole_Healthcare Representative'),
  (1.0, 'JobRole_Human Resources'),
  (1.0, 'JobRole_Laboratory Technician'),
  (2.0, 'JobRole_Manufacturing Director'),
  (1.0, 'JobRole_Research Scientist'),
  (2.0, 'JobRole_Sales Executive'),
  (1.0, 'JobRole_Sales Representative'),
]
conditions = [(dfCombinedBalanced['JobLevel'] == level) & (dfCombinedBalanced[role] == 1)
              for level, role in high_attrition_pairs]
returnval = [1] * len(conditions)
# np.select yields 1 where any condition holds and its default (0) elsewhere.
dfCombinedBalanced['hasHighAttri_JR&JL'] = np.select(conditions, returnval)

# NOTE(review): this is (all resignations) / (all flagged rows), not the
# attrition rate inside the flagged group -- confirm this is the intended ratio.
dfCombinedBalanced[dfCombinedBalanced['Attrition'] == 1].shape[0] / dfCombinedBalanced[dfCombinedBalanced['hasHighAttri_JR&JL'] == 1].shape[0]

## Combining factors

In [None]:

# Raw features that are replaced by the combined risk score.
raw_factor_cols = ['JobLevel', 'Age', 'StockOptionLevel', 'YearsInCurrentRole', 'DistanceFromHome']

# Binary risk indicators (1 = risk factor present). They are read from df,
# which has the same rows and index as dfCombinedBalanced and always keeps the
# raw columns, so this cell can be re-run after the raw columns were dropped.
risk_flags = pd.DataFrame({
  'hasLowJobLevel': df['JobLevel'] < 3,
  'isYoung': df['Age'] < 39,
  'hasLowStockOptionLevel': df['StockOptionLevel'] < 0.5,
  'hasLowYearsInCurrRole': df['YearsInCurrentRole'] < 2,
  'hasHighDistFromHome': df['DistanceFromHome'] > 10,
}).astype(int)

# DataFrame.drop returns a NEW frame, so its result has to be assigned: the
# previous bare `dfCombinedBalanced.drop(...)` calls discarded their result and
# removed nothing. Only the summed score is kept; the individual flags are
# never added to the frame.
dfCombinedBalanced = (
  dfCombinedBalanced
  .assign(sumOfBool=risk_flags.sum(axis=1))
  .drop(columns=raw_factor_cols, errors='ignore')
)

dfCombinedBalanced.head()

### Trying logistic regression with: split & balanced data, and new boolean column

In [None]:
# 70/30 split of the engineered frame; the fixed seed makes the results
# reproducible after Restart & Run All.
train_df_blanced_bool, test_df_blanced_bool = train_test_split(dfCombinedBalanced, test_size=0.3, random_state=42)

#balance df
train_df_blanced_bool = balanceDf(train_df_blanced_bool)
test_df_blanced_bool = balanceDf(test_df_blanced_bool)

#Set X y
X_train_blanced_bool = train_df_blanced_bool.drop(columns='Attrition')
y_train_blanced_bool = train_df_blanced_bool['Attrition']
X_test_blanced_bool = test_df_blanced_bool.drop(columns='Attrition')
y_test_blanced_bool = test_df_blanced_bool['Attrition']

#Logistic regression
logr2 = LogisticRegression(max_iter=30000)
logr2.fit(X_train_blanced_bool, y_train_blanced_bool)
preds = logr2.predict(X_test_blanced_bool)
# Predicted probability of class 1 (resigned); computed once and reused below.
logr_proba_preds = logr2.predict_proba(X_test_blanced_bool)[:, 1]
test_df_blanced_bool = test_df_blanced_bool.assign(logr_proba_preds=logr_proba_preds)

# Model Analysis
# normalize='true' divides each row by the number of samples of that true class.
cm = confusion_matrix(y_test_blanced_bool, preds, normalize='true')
print('Confusion Matrix:')
print(cm)
dfacc = (y_test_blanced_bool == preds).mean()
print(f'Accuracy = {dfacc}')
auroc = roc_auc_score(y_test_blanced_bool, logr_proba_preds)
print(f'AUROC = {auroc}')

# Row 0 = true class 0 (current employee), row 1 = true class 1 (resigned).
(TN, FP), (FN, TP) = cm
tpr = TP/(TP+FN)
tnr = TN/(TN+FP)
fpr = FP/(FP+TN)
fnr = FN/(TP+FN)
print(f'TNR: {tnr}')
print(f'TPR: {tpr}')
print(f'FPR: {fpr}')
print(f'FNR: {fnr}')

# Distribution of the predicted probabilities, one histogram per true class.
plt.figure(figsize=(12, 8))
for attrition_value, colour, label in [(0, 'red', 'Current employees'), (1, 'blue', 'resigned')]:
  plt.hist(test_df_blanced_bool.loc[test_df_blanced_bool['Attrition'] == attrition_value, 'logr_proba_preds'],
           color=colour, label=label, alpha=0.75)
plt.xlabel('Predicted probability of resignation')
plt.ylabel('Number of employees')
plt.legend()
plt.show()

print(classification_report(y_test_blanced_bool, preds))

# Nearest neighbour analysis

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay

def knnAnalysis(X_train, y_train, X_test, y_test, n, metric, dfKnn):
  """Fit a k-nearest-neighbours classifier and report how it does on the test set.

  Prints the row-normalised confusion matrix, accuracy, AUROC and the
  classification report, then shows a histogram of the predicted class-1
  probabilities per true class and a confusion-matrix plot.

  Parameters
  ----------
  X_train, y_train : training features / labels.
  X_test, y_test : test features / labels.
  n : int
      Number of neighbours.
  metric : str
      Distance metric passed to KNeighborsClassifier.
  dfKnn : pandas.DataFrame
      Frame holding the test rows, in the same order as X_test, with an
      'Attrition' column. A 'knn_preds' column with the predicted class-1
      probability is written into it.

  Raises
  ------
  ValueError
      If dfKnn and X_test do not have the same number of rows.
  """
  # Fail before the (slow) fit if the probabilities could not be attached.
  if len(dfKnn) != len(X_test):
    raise ValueError(
      f'dfKnn has {len(dfKnn)} rows but X_test has {len(X_test)}; they must describe the same rows')

  knn = KNeighborsClassifier(n_neighbors=n, metric=metric)
  knn.fit(X_train, y_train)
  preds = knn.predict(X_test)
  y_pred_proba = knn.predict_proba(X_test)[:,1]
  knn_acc = knn.score(X_test, y_test)
  knn_auroc = roc_auc_score(y_test, y_pred_proba)
  cm = confusion_matrix(y_test, preds, normalize='true')
  dfKnn['knn_preds'] = y_pred_proba

  print('Confusion Matrix:')
  print(cm)
  print(f'Accuracy = {knn_acc}')
  print(f'AUROC = {knn_auroc}')
  print(classification_report(y_test, preds))

  plt.figure(figsize=(12, 8))
  plt.hist(dfKnn[dfKnn['Attrition'] == 0]['knn_preds'], color='red', label='Current employees', alpha=0.75)
  plt.hist(dfKnn[dfKnn['Attrition'] == 1]['knn_preds'], color='blue', label='resigned', alpha=0.75)
  plt.legend()
  plt.show()

  # The display needs predicted class labels, not probabilities.
  ConfusionMatrixDisplay.from_predictions(y_test, preds, normalize='true')
  plt.show()
  # No coefficient plot: KNeighborsClassifier has no coef_ attribute, so the
  # former bar chart of knn.coef_ could only raise AttributeError.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index the training features, then look up the single closest indexed row for
# each of the first five training rows.
Nn = NearestNeighbors(n_neighbors=1).fit(X_train)
Nn.kneighbors(X_train.head(5), n_neighbors=1)

### Using GridSearchCV

In [None]:
def gridSearchCVKnn(X_train, y_train, X_test, neighbors=None,
                    metrics=('hamming', 'euclidean', 'jaccard')):
  """Grid-search a KNeighborsClassifier over neighbour counts and metrics.

  Parameters
  ----------
  X_train, y_train : training features / labels used for the cross-validation.
  X_test : unused; kept so existing calls keep working.
  neighbors : iterable of int or None, optional
      Values tried for n_neighbors. None keeps the previous behaviour of
      trying only k = 1.
  metrics : iterable of str, optional
      Distance metrics tried.

  Returns
  -------
  GridSearchCV
      The fitted search object.
  """
  if neighbors is None:
    neighbors = range(1, 2)  # previous hard-coded grid: k = 1 only
  hyperparam_grid = {'n_neighbors': list(neighbors), 'metric': list(metrics)}

  # n_jobs=-1 runs the candidate fits in parallel.
  grid = GridSearchCV(KNeighborsClassifier(), hyperparam_grid, n_jobs=-1)
  grid.fit(X_train, y_train)
  # The former grid.predict(X_test) call was removed: its result was discarded.

  print(f'Best number of neighbours: {grid.best_params_}')
  # best_score_ is the mean cross-validated score of the best parameter set.
  print(f'Accuracy: {grid.best_score_ *100}')
  return grid

## Stratified data

### no combination of factors

In [None]:
X = df.drop(columns='Attrition')
y = df['Attrition']

# Stratified 70/30 split: both parts keep the class proportions of y.
# The fixed seed makes the split reproducible after Restart & Run All.
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
  X, y, stratify=y, test_size=0.3, random_state=42)

# Note: `knn` holds the fitted grid-search object returned by gridSearchCVKnn.
knn = gridSearchCVKnn(X_train_strat, y_train_strat, X_test_strat)

In [None]:
# 70/30 split of df; the fixed seed makes the split reproducible after
# Restart & Run All.
train_df_blanced_bool, test_df_blanced_bool = train_test_split(df, test_size=0.3, random_state=42)

#balance df
train_df_blanced_bool = balanceDf(train_df_blanced_bool)
test_df_blanced_bool = balanceDf(test_df_blanced_bool)

#Set X y
X_train_blanced_bool = train_df_blanced_bool.drop(columns='Attrition')
y_train_blanced_bool = train_df_blanced_bool['Attrition']
X_test_blanced_bool = test_df_blanced_bool.drop(columns='Attrition')
y_test_blanced_bool = test_df_blanced_bool['Attrition']

#knnAnalysis(X_train_strat, y_train_strat, X_test_strat, y_test_strat, 1, 'hamming', test_df_blanced_bool)

### Different metrics and different # of neighbours

## Balanced data

### Different metrics and different # of neighbours