Dependencies installation:

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# If you don't have mglearn installed on your computer, uncomment the next line
#%pip install mglearn
import mglearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score

# regression models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

#Dependencies used in Logistic Regression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer




#Dependencies used in KNN
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# Random Forest
from sklearn.ensemble import RandomForestClassifier


Now we are going to read our dataset. Notice that the datasets being loaded are Train.csv and Test.csv, not JobApplicants.csv. Because we have been working on different computers, we needed a fixed, stable split of the data to use as the training set.

For more information on how the training set was created please visit traintestspilit.ipynb

In [21]:
# Read the pre-split training and test sets from the working directory.
df_train = pd.read_csv('Train.csv')
df_test = pd.read_csv('Test.csv')

# Check headers. A notebook cell only renders its *last* bare expression, so
# both previews are passed to display() explicitly; otherwise the training
# preview would be computed and silently discarded.
display(df_train.head(), df_test.head())

Unnamed: 0,w,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed
0,31363,<35,No,Undergraduate,1,Man,No,Dev,9,1,India,7372.0,Bash/Shell;C++;Go;HTML/CSS;JavaScript;Matlab;N...,26,1
1,50414,>35,No,Undergraduate,1,Man,No,Dev,20,14,Canada,77303.0,C#;HTML/CSS;Java;JavaScript;SQL;TypeScript;Doc...,18,1
2,55640,>35,No,NoHigherEd,0,Man,No,Dev,35,31,Germany,89580.0,C;C++;JavaScript;LISP;Managed Hosting,5,0
3,52376,>35,No,Undergraduate,1,Man,No,Dev,30,32,Chile,200000.0,Assembly;Bash/Shell;C;COBOL;Groovy;HTML/CSS;Ja...,27,1
4,12724,>35,No,Master,1,Man,No,Dev,30,21,Russian Federation,28308.0,C#;Java;SQL;ASP.NET;Spring;Microsoft SQL Serve...,7,1


In [22]:
# Remove the 'w' column from both frames.
# Reassigning the result (instead of inplace=True) and passing
# errors='ignore' keeps this cell idempotent: running it a second time is a
# no-op rather than a KeyError because 'w' is already gone.
df_train = df_train.drop(columns=['w'], errors='ignore')
df_test = df_test.drop(columns=['w'], errors='ignore')

In [23]:
# What correlates the most with the "Employed" column?
# df_train still holds string columns (Age, EdLevel, Gender, ...), so the
# correlation is restricted to numeric columns explicitly. Without
# numeric_only=True pandas 1.5 emits a FutureWarning and pandas >= 2.0 raises
# because the strings cannot be converted to float.
df_train.corr(numeric_only=True)['Employed'].sort_values()

  df_train.corr()['Employed'].sort_values()


Employment       -0.010255
YearsCode        -0.008982
YearsCodePro     -0.001812
PreviousSalary    0.002171
ComputerSkills    0.586196
Employed          1.000000
Name: Employed, dtype: float64

## Arranging the data

In [24]:
# Feature groups used by the models; the preprocessing pipelines below select
# columns by these names.
categorical_cols = ['Age', 'EdLevel', 'Gender', 'MainBranch']
numerical_cols = ['YearsCode', 'YearsCodePro', 'PreviousSalary', 'ComputerSkills']

# Feature matrix (categorical columns first, then numerical) and the
# 'Employed' target for each split.
X_train, y_train = df_train[[*categorical_cols, *numerical_cols]], df_train['Employed']
X_test, y_test = df_test[[*categorical_cols, *numerical_cols]], df_test['Employed']

Checking whether the train and test sets have the same amount of data and contain different rows

In [25]:
# Preview the training feature matrix (pandas truncates the middle rows of a long frame).
X_train

Unnamed: 0,Age,EdLevel,Gender,MainBranch,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills
0,>35,Other,Man,NotDev,35,15,77556.0,16
1,>35,PhD,Man,Dev,35,30,124092.0,7
2,<35,Master,NonBinary,Dev,18,2,78672.0,12
3,<35,Undergraduate,Man,Dev,10,1,44790.0,7
4,>35,Master,Man,Dev,21,17,155112.0,16
...,...,...,...,...,...,...,...,...
51418,>35,Undergraduate,Man,Dev,30,21,117229.0,7
51419,>35,Undergraduate,Man,Dev,30,22,58368.0,17
51420,<35,Undergraduate,Man,Dev,8,1,51663.0,15
51421,<35,Undergraduate,Man,Dev,9,8,62697.0,11


In [26]:
# Preview the test feature matrix for comparison with the training one.
X_test

Unnamed: 0,Age,EdLevel,Gender,MainBranch,YearsCode,YearsCodePro,PreviousSalary,ComputerSkills
0,<35,Undergraduate,Man,Dev,9,1,7372.0,26
1,>35,Undergraduate,Man,Dev,20,14,77303.0,18
2,>35,NoHigherEd,Man,Dev,35,31,89580.0,5
3,>35,Undergraduate,Man,Dev,30,32,200000.0,27
4,>35,Master,Man,Dev,30,21,28308.0,7
...,...,...,...,...,...,...,...,...
22034,<35,Master,Man,Dev,10,1,51887.0,21
22035,<35,Master,Man,Dev,16,9,90647.0,17
22036,<35,Undergraduate,Man,Dev,12,8,54049.0,22
22037,<35,Undergraduate,Man,Dev,5,2,21408.0,21


In [27]:
# Preview the training target ('Employed'); the repr also reports its length.
y_train

0        0
1        1
2        1
3        0
4        1
        ..
51418    0
51419    0
51420    1
51421    1
51422    1
Name: Employed, Length: 51423, dtype: int64

In [28]:
# Preview the test target ('Employed'); the repr also reports its length.
y_test

0        1
1        1
2        0
3        1
4        1
        ..
22034    1
22035    1
22036    1
22037    1
22038    1
Name: Employed, Length: 22039, dtype: int64

# Logistic Regression

In [31]:
# Preprocessing for the categorical features, applied in order:
#   1. 'imputer' - fill missing entries with the column's most frequent value
#   2. 'onehot'  - expand each category into indicator columns; with
#                  handle_unknown='ignore', categories not seen during fit
#                  do not raise at transform time
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [32]:
# Preprocessing for the numerical features, applied in order:
#   1. 'imputer' - fill missing entries with the column median
#   2. 'scaler'  - standardize each column with StandardScaler
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [33]:
# Route each group of columns through its own preprocessing pipeline:
# numerical columns go through 'num', categorical columns through 'cat'.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

In [34]:
# Full logistic-regression model: preprocessing followed by the classifier
# (LogisticRegression with its default settings), bundled into one estimator
# so the same transformations are applied on fit and on predict.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [56]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [62]:
# Score the fitted pipeline on each split (for a classifier, score() is the
# mean accuracy), then report both numbers.
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print("Training set score: {:.3f}".format(train_score))
print("Test score: {:.5f}".format(test_score))

Training set score: 0.785
Test score: 0.78284


In [60]:
# Predict on the held-out test split. The predictions must come from X_test so
# that they line up row-for-row with y_test; scoring training-set predictions
# against y_test fails because the two splits have different lengths.
y_pred = pipeline.predict(X_test)

# Evaluation metrics on the test split. average='binary' reports the score
# for the positive class only, which fits the 0/1 'Employed' target.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')

# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Accuracy: 0.7828395117745814
F1 Score: 0.7968073363335314
Precision: 0.7978234994048631
Recall: 0.7957937584803256
