In [1]:
# importing modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, make_scorer, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
# reading in the data
df = pd.read_csv("fake_jobs_processed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,has_company_logo,has_questions,telecommuting,fraudulent,10,30,ability,able,about,...,required_education_Some College Coursework Completed,required_education_Some High School Coursework,required_education_Unspecified,required_education_Vocational,required_education_Vocational - Degree,required_education_Vocational - HS Diploma,has_location,has_company_profile,has_salary_range,has_benefits
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
1,1,1,0,0,0,0,2,1,1,2,...,0,0,0,0,0,0,1,1,0,1
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,3,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,1
4,4,1,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1


In [3]:
# dropping duplicated index column, and separating into X and Y
df.drop(inplace=True, columns="Unnamed: 0", axis=1)
X = df.drop(columns="fraudulent", axis=1)
y = df["fraudulent"]

# getting train test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Base Model
Here I'm using the base SVM model with a linear kernel to fine results. However, a linear kernel is probably not the best here, as we have no idea of the shape of the idea, and its not easily seperable.

In [None]:
# testing a basic linear SVM model
base_model = SVC(kernel='linear').fit(X_train, y_train)
y_pred = base_model.predict(X_test)

# getting the classification report
print(classification_report(y_test, y_pred))

## Radial Basis Function Kernel

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
base_model = SVC(kernel='rbf').fit(X_train, y_train)
y_pred = base_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4265
           1       1.00      0.25      0.40       205

    accuracy                           0.97      4470
   macro avg       0.98      0.62      0.69      4470
weighted avg       0.97      0.97      0.96      4470



In [8]:
# performing validation using GridSearch

# Define the hyperparameter grid
param_grid = {
    'C': [1, 10, 100, 500, 1000], # adjusts level of overfitting. higher overfits, lower, under
    'gamma': ['scale', 'auto'], # adjusts complexity of boundaries, high overfits, lower underfits
}

# making a scorer to target recall of the 1
recall_scorer = make_scorer(recall_score, pos_label=1)

# Perform Grid Search
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring=recall_scorer)
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'C': 500, 'gamma': 'scale'}


In [None]:
# building a model with the best parameters graded on recall
X_train, X_test, y_train, y_test = train_test_split(X, y)
validated_rbf_model = SVC(kernel='rbf', C=500, gamma='scale').fit(X_train, y_train)
y_pred = validated_rbf_model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4239
           1       0.73      0.70      0.72       231

    accuracy                           0.97      4470
   macro avg       0.86      0.84      0.85      4470
weighted avg       0.97      0.97      0.97      4470



In [11]:
print(classification_report(y_train, validated_rbf_model.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12775
           1       1.00      1.00      1.00       635

    accuracy                           1.00     13410
   macro avg       1.00      1.00      1.00     13410
weighted avg       1.00      1.00      1.00     13410



## Polynomial Kernel

In [None]:
# basic model training
X_train, X_test, y_train, y_test = train_test_split(X, y)
base_model = SVC(kernel='poly').fit(X_train, y_train)
y_pred = base_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      4246
           1       1.00      0.14      0.25       224

    accuracy                           0.96      4470
   macro avg       0.98      0.57      0.61      4470
weighted avg       0.96      0.96      0.94      4470



In [12]:
# performing validation using GridSearch

# define hyperparameter options
param_grid = {
    'coef0': [0, 1],
    'degree': [2, 3, 5],
    'C': [1, 10, 100, 500, 1000], # adjusts level of overfitting. higher overfits, lower, under
    'gamma': ['scale', 'auto'], # adjusts complexity of boundaries, high overfits, lower underfits
}

# performing grid search with poly
grid_search = GridSearchCV(SVC(kernel='poly'), param_grid, cv=5, scoring=recall_scorer)
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 1000, 'coef0': 1, 'degree': 2, 'gamma': 'scale'}
