# Resume Screener Classification Model on Synthetic Resume Dataset

In [61]:
import pandas as pd
import numpy as np

resume = pd.read_csv('../datasets/ai_resume_screening.csv')

resume

Unnamed: 0,years_experience,skills_match_score,education_level,project_count,resume_length,github_activity,shortlisted
0,6,84.7,Bachelors,7,234,158,No
1,3,59.1,Masters,5,502,77,No
2,12,100.0,Masters,12,753,381,Yes
3,14,66.8,High School,8,529,407,Yes
4,10,99.6,Bachelors,10,754,331,Yes
...,...,...,...,...,...,...,...
29995,9,77.4,Bachelors,13,691,434,Yes
29996,5,77.8,Bachelors,9,473,149,No
29997,1,64.6,Bachelors,7,247,82,No
29998,7,94.8,Bachelors,12,584,409,Yes


# To-do List

[] convert shortlisted to binary

[] columns already properly formatted

[] encode education_level to either a dummy or one-hot

[x] standardize the numeric cols.

[] visualize distributions of features


In [40]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numeric_cols = [
    'years_experience', 'skills_match_score', 'project_count', 'resume_length', 'github_activity'
]

resume[numeric_cols] = scaler.fit_transform(resume[numeric_cols])

In [41]:
resume['shortlisted'] = resume['shortlisted'].apply(lambda x: 1 if x=='Yes' else 0 )

resume

Unnamed: 0,years_experience,skills_match_score,education_level,project_count,resume_length,github_activity,shortlisted
0,-0.325813,0.657139,Bachelors,-0.786856,-1.894637,-1.045712,0
1,-0.974598,-0.869795,Masters,-1.218451,-0.394975,-1.552123,0
2,0.971758,1.569720,Masters,0.292133,1.009560,0.348482,1
3,1.404281,-0.410522,High School,-0.571058,-0.243889,0.511033,1
4,0.539234,1.545862,Bachelors,-0.139463,1.015155,0.035882,1
...,...,...,...,...,...,...,...
29995,0.322972,0.221724,Bachelors,0.507930,0.662623,0.679837,1
29996,-0.542074,0.245583,Bachelors,-0.355261,-0.557251,-1.101979,0
29997,-1.407121,-0.541742,Bachelors,-0.786856,-1.821892,-1.520863,0
29998,-0.109551,1.259562,Bachelors,0.292133,0.063877,0.523537,1


In [42]:
# quick look at class dispersion

resume['shortlisted'].value_counts()

shortlisted
1    20966
0     9034
Name: count, dtype: int64

In [43]:
resume['education_level'].value_counts()

education_level
Bachelors      13461
Masters        10524
PhD             3024
High School     2991
Name: count, dtype: int64

**Two ways to approach encoding education_level**

1. Dummy variable each of the four education levels tracked

2. Turn it into ordinal variable (1-4) w/ 4 being PhD, 1 being High School

In [44]:
resume['education_level'] = (resume['education_level']
                             .astype(str)
                             .str.strip()
                             .str.lower())

resume['education_level'].value_counts()

education_level
bachelors      13461
masters        10524
phd             3024
high school     2991
Name: count, dtype: int64

In [45]:
education_dict = {
    'high school' : 1,
    'bachelors' : 2,
    'masters' : 3,
    'phd' : 4
}

resume['education_level'] = resume['education_level'].map(education_dict)
resume['education_level']

0        2
1        3
2        3
3        1
4        2
        ..
29995    2
29996    2
29997    2
29998    2
29999    3
Name: education_level, Length: 30000, dtype: int64

In [47]:
resume.head()

Unnamed: 0,years_experience,skills_match_score,education_level,project_count,resume_length,github_activity,shortlisted
0,-0.325813,0.657139,2,-0.786856,-1.894637,-1.045712,0
1,-0.974598,-0.869795,3,-1.218451,-0.394975,-1.552123,0
2,0.971758,1.56972,3,0.292133,1.00956,0.348482,1
3,1.404281,-0.410522,1,-0.571058,-0.243889,0.511033,1
4,0.539234,1.545862,2,-0.139463,1.015155,0.035882,1


In [50]:
resume.dtypes

years_experience      float64
skills_match_score    float64
education_level         int64
project_count         float64
resume_length         float64
github_activity       float64
shortlisted             int64
dtype: object

In [51]:
from sklearn.model_selection import train_test_split

X = resume.drop(columns=['shortlisted'])
y = resume['shortlisted']

X


Unnamed: 0,years_experience,skills_match_score,education_level,project_count,resume_length,github_activity
0,-0.325813,0.657139,2,-0.786856,-1.894637,-1.045712
1,-0.974598,-0.869795,3,-1.218451,-0.394975,-1.552123
2,0.971758,1.569720,3,0.292133,1.009560,0.348482
3,1.404281,-0.410522,1,-0.571058,-0.243889,0.511033
4,0.539234,1.545862,2,-0.139463,1.015155,0.035882
...,...,...,...,...,...,...
29995,0.322972,0.221724,2,0.507930,0.662623,0.679837
29996,-0.542074,0.245583,2,-0.355261,-0.557251,-1.101979
29997,-1.407121,-0.541742,2,-0.786856,-1.821892,-1.520863
29998,-0.109551,1.259562,2,0.292133,0.063877,0.523537


In [52]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=617
)

In [53]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(X_train, y_train)

print('Testing Accuracy: ', np.mean(rf.predict(X_train) == y_train) * 100)
print('Testing Accuracy: ', np.mean(rf.predict(X_test) == y_test) * 100)

Testing Accuracy:  100.0
Testing Accuracy:  90.05


In [54]:
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef


# Get basic evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# Compute Confusion Matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


# Retrieve Classification Report
print('Classification Report')
print(classification_report(y_test, y_pred))

Accuracy: 0.9005
Precision: 0.9234573716439002
Recall: 0.9351299785356547
F1 Score: 0.929257020974049
ROC AUC: 0.9633554012795194
Confusion Matrix:
[[1482  325]
 [ 272 3921]]
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1807
           1       0.92      0.94      0.93      4193

    accuracy                           0.90      6000
   macro avg       0.88      0.88      0.88      6000
weighted avg       0.90      0.90      0.90      6000



In [58]:
# check feature importances, but need to conduct Logit Reg as RF doesn't carry feature importance

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000, class_weight='balanced')

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1] 


# Get basic evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

# Compute Confusion Matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))


# Retrieve Classification Report
print('Classification Report')
print(classification_report(y_test, y_pred))



Accuracy: 0.8973333333333333
Precision: 0.9594143334189571
Recall: 0.890770331504889
F1 Score: 0.9238189463269849
ROC AUC: 0.9666741060911201
Confusion Matrix:
[[1649  158]
 [ 458 3735]]
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1807
           1       0.96      0.89      0.92      4193

    accuracy                           0.90      6000
   macro avg       0.87      0.90      0.88      6000
weighted avg       0.91      0.90      0.90      6000



In [59]:
feature_importances = pd.Series(lr.coef_[0], index=X.columns)
feature_importances

years_experience      2.300484
skills_match_score    1.080953
education_level       0.880396
project_count         1.542807
resume_length        -0.030156
github_activity       0.257905
dtype: float64

In [62]:
resume.describe()

Unnamed: 0,years_experience,skills_match_score,project_count,resume_length,github_activity
count,30000.0,30000.0,30000.0,30000.0,30000.0
mean,7.506567,73.682653,10.646267,572.5847,325.260667
std,4.624104,16.765909,4.634047,178.709918,159.951803
min,0.0,0.5,0.0,150.0,0.0
25%,3.75,62.1,7.0,441.0,202.0
50%,7.0,74.3,10.0,574.0,321.0
75%,12.0,86.5,14.0,709.0,443.0
max,15.0,100.0,25.0,900.0,842.0
