In [75]:
#%pip install matplotlib

#%pip install scikit-learn
#%pip install catboost 


import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

from catboost import CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestRegressor

### EA 9 Classification Exercise (40 points + 3 points extra)

There are 2 files training and test

This dataset is designed to understand the factors that lead a person to leave their current job for HR research. By model(s) that uses the current credentials, demographics, and experience data you will predict the probability of a candidate looking for a new job or will work for the company, as well as interpreting affected factors on employee decision.

In [76]:
testURL= "https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/classification/data/Assigment/aug_test.csv"
#"https://github.com/WHPAN0108/BHT-DataScience-S23/blob/main/classification/data/Assigment/aug_test.csv"

trainURL= "https://raw.githubusercontent.com/WHPAN0108/BHT-DataScience-S23/main/classification/data/Assigment/aug_train.csv"
#"https://github.com/WHPAN0108/BHT-DataScience-S23/blob/main/classification/data/Assigment/aug_train.csv"

Note:
- The dataset is imbalanced.
- Most features are categorical (Nominal, Ordinal, Binary), some with high cardinality.
- Missing imputation can be a part of your pipeline as well.

#### Features:

- **city_ development _index** : Developement index of the city (scaled)
- **gender** : Gender of candidate
- **relevent_experience** : Relevant experience of candidate
- **enrolled_university** : Type of University course enrolled if any
- **education_level** : Education level of candidate
- **major_discipline** :Education major discipline of candidate
- **experience** : Candidate total experience in years,
- **company_type** : Type of current employer
- **last_new_job** : Difference in years between previous job and current job
- **training_hours** : training hours completed
- **target** : 0 – Not looking for job change, 1 – Looking for a job change

### Task1 Data clean, imputation

In [77]:
trainData = pd.read_csv(trainURL, delimiter=',')
testData = pd.read_csv(testURL, delimiter=',')
trainData.describe(include='all')

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
count,2100.0,1585,2100,2051,2049,1768,2090,1415,2048.0,2100.0,2100.0
unique,,3,2,3,5,6,22,6,6.0,,
top,,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Pvt Ltd,1.0,,
freq,,1422,1507,1484,1246,1556,369,1068,857.0,,
mean,0.826898,,,,,,,,,65.89619,0.254762
std,0.124464,,,,,,,,,58.432483,0.435831
min,0.448,,,,,,,,,1.0,0.0
25%,0.72925,,,,,,,,,24.0,0.0
50%,0.899,,,,,,,,,49.0,0.0
75%,0.92,,,,,,,,,89.25,1.0


In [78]:
testData.describe(include='all')

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
count,100.0,72,100,96,98,88,100,64,100.0,100.0,100.0
unique,,3,2,3,3,5,20,5,6.0,,
top,,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Pvt Ltd,1.0,,
freq,,68,66,67,63,82,17,50,40.0,,
mean,0.84422,,,,,,,,,72.28,0.22
std,0.117719,,,,,,,,,64.813498,0.416333
min,0.479,,,,,,,,,4.0,0.0
25%,0.79475,,,,,,,,,22.0,0.0
50%,0.91,,,,,,,,,51.5,0.0
75%,0.92,,,,,,,,,93.0,0.0


1. in experience, replace >20 to 21; <1 to 1, and convert this as a numerical column

In [79]:
trainData['experience'] = pd.to_numeric(trainData['experience'].replace({'>20': 21, '<1': 1}))
testData['experience'] = pd.to_numeric(testData['experience'].replace({'>20': 21, '<1': 1}))
testData.describe( include='all')

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
count,100.0,72,100,96,98,88,100.0,64,100.0,100.0,100.0
unique,,3,2,3,3,5,,5,6.0,,
top,,Male,Has relevent experience,no_enrollment,Graduate,STEM,,Pvt Ltd,1.0,,
freq,,68,66,67,63,82,,50,40.0,,
mean,0.84422,,,,,,10.23,,,72.28,0.22
std,0.117719,,,,,,6.637596,,,64.813498,0.416333
min,0.479,,,,,,1.0,,,4.0,0.0
25%,0.79475,,,,,,4.0,,,22.0,0.0
50%,0.91,,,,,,9.5,,,51.5,0.0
75%,0.92,,,,,,15.0,,,93.0,0.0



2. in last_new_job, replace >4 to 5; never to 0, and convert this as a numerical column


In [80]:
trainData['last_new_job'] = pd.to_numeric(trainData['last_new_job'].replace({'>4': 5, 'never': 0}))
testData['last_new_job'] = pd.to_numeric(testData['last_new_job'].replace({'>4': 5, 'never': 0}))


3. If the column is categorical, impute the missing value as its mode. If the column is numerical, impute the missing value as its median



In [81]:
def clean_columnData(df):   
    for column in df.select_dtypes(include=[np.number]).columns:
        median_value = df[column].median()
        replaced = df[column].fillna(median_value)
        df[column] = replaced

    for column in df.select_dtypes(exclude=[np.number]).columns:
        if (df[column].isnull().sum() > 0):
            replaced = df[column].fillna(df[column].mode()[0])
            df[column] = replaced

clean_columnData(trainData)
clean_columnData(testData)


## Task2 Classification

1. Build a classification model from the training set ( you can use any algorithms)
> CatBoost is an algorithm for gradient boosting on decision trees. It is developed by Yandex researchers and engineers, and is used for search, recommendation systems, personal assistant, self-driving cars, weather prediction and many other tasks at Yandex and in other companies, including CERN, Cloudflare, Careem taxi. It is in open-source and can be used by anyone. Reference : https://catboost.ai/

In [86]:
# we need to change the categorical data to numerical data so the model can understand it
trainDataDummies = pd.get_dummies(trainData)
testDataDummies = pd.get_dummies(testData)

# add missing columns to test data
missing_columns = set(trainDataDummies.columns) - set(testDataDummies.columns)
for column in missing_columns:
    testDataDummies[column] = False

# we now convert the binary columns into numerical ones.


In [90]:
def convert_bool_columns(df):
    for column in df.select_dtypes(include=['bool']).columns:
        df[column] = df[column].astype(int)

convert_bool_columns(trainDataDummies)
convert_bool_columns(testDataDummies)

testDataDummies = testDataDummies.reindex(columns=trainDataDummies.columns)


In [91]:
model = CatBoostRegressor(objective='RMSE')


Y_train = trainDataDummies['target']
# there are no identifying columns that could confuse the model, so we only remove target.
X_train = trainDataDummies.drop(columns=['target'])
model.fit(X_train, Y_train)

Learning rate set to 0.046035
0:	learn: 0.4325687	total: 1.42ms	remaining: 1.42s
1:	learn: 0.4297794	total: 2.63ms	remaining: 1.31s
2:	learn: 0.4268301	total: 3.9ms	remaining: 1.29s
3:	learn: 0.4242078	total: 5.5ms	remaining: 1.37s
4:	learn: 0.4217844	total: 7.07ms	remaining: 1.41s
5:	learn: 0.4196387	total: 8.34ms	remaining: 1.38s
6:	learn: 0.4175933	total: 9.63ms	remaining: 1.36s
7:	learn: 0.4157511	total: 10.8ms	remaining: 1.33s
8:	learn: 0.4143082	total: 11.9ms	remaining: 1.31s
9:	learn: 0.4124610	total: 13ms	remaining: 1.29s
10:	learn: 0.4107593	total: 14.3ms	remaining: 1.29s
11:	learn: 0.4093374	total: 15.5ms	remaining: 1.27s
12:	learn: 0.4081060	total: 16.7ms	remaining: 1.27s
13:	learn: 0.4068158	total: 18ms	remaining: 1.27s
14:	learn: 0.4057677	total: 19.1ms	remaining: 1.25s
15:	learn: 0.4046333	total: 20.3ms	remaining: 1.25s
16:	learn: 0.4036598	total: 21.4ms	remaining: 1.24s
17:	learn: 0.4026937	total: 22.5ms	remaining: 1.23s
18:	learn: 0.4019230	total: 23.2ms	remaining: 1.2s

<catboost.core.CatBoostRegressor at 0x1fefbea0950>


2. generate the confusion matrix and calculate the accuracy, precision, recall, and F1-score on training set. 

In [104]:
Y_pred = model.predict(X_train)
# we need to convert the predictions to binary values
Y_predBinary = [1 if x > 0.5 else 0 for x in Y_pred]

cm = confusion_matrix(Y_train, Y_predBinary)
# Optionally, convert the confusion matrix to a DataFrame for better readability
#cm = cm.astype('string')
cm2 = [
    [f"{cm[0][0]} TP", f"{cm[0][1]} FP"],
    [f"{cm[1][0]} FN", f"{cm[1][1]} TN"]
]
cm_df = pd.DataFrame(cm2, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative'])
display(cm_df)

scores = pd.DataFrame([
    accuracy_score(Y_train, Y_predBinary),
    precision_score(Y_train, Y_predBinary),
    recall_score(Y_train, Y_predBinary),
    f1_score(Y_train, Y_predBinary)
]
    , index=['Accuracy', 'Precision', 'Recall', 'F1'], columns=['Score (train)'])
display(scores)

Unnamed: 0,Predicted Positive,Predicted Negative
Actual Positive,1529 TP,36 FP
Actual Negative,159 FN,376 TN


Unnamed: 0,Score (train)
Accuracy,0.907143
Precision,0.912621
Recall,0.702804
F1,0.794087


3. Applying the model in the test set and generating the prediction


In [None]:
X_test = testDataDummies.drop(columns=['target'])
Y_test = testDataDummies['target']

Y_pred_test = model.predict(X_test)

4. generate the confusion matrix from the test set and calculate the accuracy, precision, recall, and F1-score


In [107]:
Y_pred_test = model.predict(X_test)
# we need to convert the predictions to binary values
Y_pred_testBinary = [1 if x > 0.5 else 0 for x in Y_pred_test]

cm = confusion_matrix(Y_test, Y_pred_testBinary)
# Optionally, convert the confusion matrix to a DataFrame for better readability
#cm = cm.astype('string')
cm2 = [
    [f"{cm[0][0]} TP", f"{cm[0][1]} FP"],
    [f"{cm[1][0]} FN", f"{cm[1][1]} TN"]
]
display(pd.DataFrame(cm2, index=['Actual Positive', 'Actual Negative'], columns=['Predicted Positive', 'Predicted Negative']))

display(pd.DataFrame([
    accuracy_score(Y_test, Y_pred_testBinary),
    precision_score(Y_test, Y_pred_testBinary),
    recall_score(Y_test, Y_pred_testBinary),
    f1_score(Y_test, Y_pred_testBinary)
]
    , index=['Accuracy', 'Precision', 'Recall', 'F1'], columns=['Score (test)']))

Unnamed: 0,Predicted Positive,Predicted Negative
Actual Positive,70 TP,8 FP
Actual Negative,16 FN,6 TN


Unnamed: 0,Score (test)
Accuracy,0.76
Precision,0.428571
Recall,0.272727
F1,0.333333


5. compare the results between the training and test set
It appears the model is working much worse on the test data than on the training data - all the scores are much lower, with precision going down from over 90% to just 42%. this might be due to a bad selection of the test data, or because something with the columns having to be mapped to bools might overextend a certain value's importance.

Maybe i picked a bad algorithm with CatBoost, too, but since you made it our choice, and it was very easy to find a reference to it, i chose that one.



The data is modifed , but you can get some help from [(https://www.kaggle.com/datasets/arashnic/hr-analytics-job-change-of-data-scientists?select=sample_submission.csv)Kaggle]

## Extra point
Think about what kind of the method can increase the performance (does not need to run )


Maybe using dummy-values (additional bool columns) for nonnumeric fields is bad for performance, when i executed the model, everything was quite fast, so i have no problem.

if the raw data files were any kind of big, the repetition for nonnumeric strings of course is a hindrance and they could be replaced by number-encodings like:
    no_enrollment=1
    Part time course=2
    Full time course=3
but this might be dangerous, since the person training the model might overlook the fact that those do not represent continuous values and so the model might learn wrong.

of course, choosing a different algorithm is probably significant for the performance, too.