In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

# Quick Summary on Model Performance

In [182]:
# F1-scores in Cross Validation test fold
performance_table.style.format("{:.2%}")

Unnamed: 0,NaiveBayes,LogisticRegression,SVM,RandomForest,GBDT,GBDT+LR,XGB
Fold 0,89.63%,96.49%,96.16%,91.49%,96.27%,97.33%,96.24%
Fold 1,91.87%,99.15%,98.59%,93.76%,98.39%,99.54%,98.57%
Fold 2,92.37%,98.86%,98.85%,93.91%,98.11%,99.29%,98.20%
Fold 3,92.51%,98.47%,98.49%,93.35%,98.00%,99.35%,98.04%
Fold 4,92.66%,98.76%,98.70%,93.51%,98.08%,99.27%,98.10%


# 1.Data Preprocessing

## 1.1 File Read

In [2]:
jobs = pd.read_csv('./fake_job_postings.csv')
jobs.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
# check the null values
round(np.sum(jobs.isnull(), axis=0) / jobs.shape[0] * 100, 2)

job_id                  0.00
title                   0.00
location                1.94
department             64.58
salary_range           83.96
company_profile        18.50
description             0.01
requirements           15.07
benefits               40.32
telecommuting           0.00
has_company_logo        0.00
has_questions           0.00
employment_type        19.41
required_experience    39.43
required_education     45.33
industry               27.42
function               36.10
fraudulent              0.00
dtype: float64

In [4]:
selected_columns = ['title', 'location', 'department', 'company_profile', 
                    'description', 'requirements', 'benefits','fraudulent']
jobs = jobs[selected_columns]
jobs.shape

(17880, 8)

In [5]:
def str_concat(data):
    res = ''
    for col in selected_columns:
        if col == 'fraudulent':
            continue
        res += str(data[col]) + '\n'
    return res

In [6]:
jobs['text'] = jobs.apply(str_concat, axis=1)
jobs.head()

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,fraudulent,text
0,Marketing Intern,"US, NY, New York",Marketing,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,"Marketing Intern\nUS, NY, New York\nMarketing\..."
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,"Customer Service - Cloud Video Production\nNZ,..."
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,"Commissioning Machinery Assistant (CMA)\nUS, I..."
3,Account Executive - Washington DC,"US, DC, Washington",Sales,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,"Account Executive - Washington DC\nUS, DC, Was..."
4,Bill Review Manager,"US, FL, Fort Worth",,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,"Bill Review Manager\nUS, FL, Fort Worth\nnan\n..."


In [10]:
text_data = jobs[['text', 'fraudulent']].copy()

## 1.2 Text Processing

In [7]:
import regex as re

In [8]:
def text_preprocessing(text):
    text = re.sub(r'([a-z])([A-Z]|[0-9])', r'\1 \2', text) # deal with concated words such as "MyJob"
    text = re.sub(r'https?:?[\S]+', ' ', text) # deal with URL
    text = re.sub(r'#?URL[\S]+', ' ', text) # deal with "#URL_abcdefg"
    text = re.sub(r'[_]+', ' ', text) # deal with "_"
    
    text = re.sub(r'\W', ' ', text) # just remain the alphanumeric characters
    
    word_list = []
    for word in text.split():
        word = word.lower()
        if len(word) <= 2 or \
            re.findall(r'^[0-9]+', word) or \
            word == 'nan':
                continue
        
        word_list.append(word)
    
    return ' '.join(word_list)

In [11]:
for i in range(5):
    print(f"**************************************************\n",
          f"Example{i}:\nOriginalText:{text_data['text'][i]}\n",
          f"ProcessedText:{text_preprocessing(text_data['text'][i])}\n")

**************************************************
 Example0:
OriginalText:Marketing Intern
US, NY, New York
Marketing
We're Food52, and we've created a groundbreaking and award-winning cooking site. We support, connect, and celebrate home cooks, and give them everything they need in one place.We have a top editorial, business, and engineering team. We're focused on using technology to find new and better ways to connect people around their specific food interests, and to offer them superb, highly curated information about food and cooking. We attract the most talented home cooks and contributors in the country; we also publish well-known professionals like Mario Batali, Gwyneth Paltrow, and Danny Meyer. And we have partnerships with Whole Foods Market and Random House.Food52 has been named the best food website by the James Beard Foundation and IACP, and has been featured in the New York Times, NPR, Pando Daily, TechCrunch, and on the Today Show.We're located in Chelsea, in New York C

In [15]:
text_data['cleaned_text'] = text_data['text'].apply(text_preprocessing)
text_data.head()

Unnamed: 0,text,fraudulent,cleaned_text
0,"Marketing Intern\nUS, NY, New York\nMarketing\...",0,marketing intern new york marketing food and c...
1,"Customer Service - Cloud Video Production\nNZ,...",0,customer service cloud video production auckla...
2,"Commissioning Machinery Assistant (CMA)\nUS, I...",0,commissioning machinery assistant cma wever va...
3,"Account Executive - Washington DC\nUS, DC, Was...",0,account executive washington washington sales ...
4,"Bill Review Manager\nUS, FL, Fort Worth\nnan\n...",0,bill review manager fort worth spot source sol...


## 1.3 Lemmamization

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

In [13]:
def get_pos_tag(pos):
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatization(text):
    tokens = text.split()
    pos = pos_tag(tokens)
    pos = [get_pos_tag(p[1]) for p in pos]
    
    lemmatized_token = []
    for i in range(len(tokens)):
        lemmatized_token.append(lemmatizer.lemmatize(tokens[i], pos=pos[i]))
    
    return " ".join(lemmatized_token)

In [16]:
text_data['lemmatized_text'] = text_data['cleaned_text'].apply(lemmatization)
text_data.head()

Unnamed: 0,text,fraudulent,cleaned_text,lemmatized_text
0,"Marketing Intern\nUS, NY, New York\nMarketing\...",0,marketing intern new york marketing food and c...,market intern new york marketing food and crea...
1,"Customer Service - Cloud Video Production\nNZ,...",0,customer service cloud video production auckla...,customer service cloud video production auckla...
2,"Commissioning Machinery Assistant (CMA)\nUS, I...",0,commissioning machinery assistant cma wever va...,commission machinery assistant cma wever valor...
3,"Account Executive - Washington DC\nUS, DC, Was...",0,account executive washington washington sales ...,account executive washington washington sale o...
4,"Bill Review Manager\nUS, FL, Fort Worth\nnan\n...",0,bill review manager fort worth spot source sol...,bill review manager fort worth spot source sol...


In [146]:
for i in range(5):
    print(f"**************************************************\n",
          f"Example{i}:\nCleanedText:{text_data['cleaned_text'][i]}\n",
          f"ProcessedText:{text_data['lemmatized_text'][i]}\n")

**************************************************
 Example0:
CleanedText:marketing intern new york marketing food and created groundbreaking and award winning cooking site support connect and celebrate home cooks and give them everything they need one place have top editorial business and engineering team focused using technology find new and better ways connect people around their specific food interests and offer them superb highly curated information about food and cooking attract the most talented home cooks and contributors the country also publish well known professionals like mario batali gwyneth paltrow and danny meyer and have partnerships with whole foods market and random house food has been named the best food website the james beard foundation and iacp and has been featured the new york times npr pando daily tech crunch and the today show located chelsea new york city food fast growing james beard award winning online food community and crowd sourced and curated recipe hu

## 1.4 Train/Test Data Preparation

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_data['lemmatized_text'], 
                                                    text_data['fraudulent'], 
                                                    test_size=0.3, 
                                                    random_state=666)

print(f"Train X shape: {X_train.shape}")
print(f"{y_train.value_counts()}")
print(f"positive rate: {y_train.value_counts()[1] / y_train.shape[0]:.4f}\n")
print(f"Test X shape: {X_test.shape}")
print(f"{y_test.value_counts()}")
print(f"positive rate: {y_test.value_counts()[1] / y_test.shape[0]:.4f}\n")

Train X shape: (12516,)
0    11915
1      601
Name: fraudulent, dtype: int64
positive rate: 0.0480

Test X shape: (5364,)
0    5099
1     265
Name: fraudulent, dtype: int64
positive rate: 0.0494



In [49]:
# tf representation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1,1), max_df=0.5, min_df=0.001)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

print(f"Train X vector shape: {X_train_vectorized.shape}")
print(f"Test X vector shape: {X_test_vectorized.shape}")

Train X vector shape: (12516, 7523)
Test X vector shape: (5364, 7523)


In [50]:
# tf-idf representation
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), max_df=0.5, min_df=0.001)

X_train_vectorized_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Train X vector shape: {X_train_vectorized_tfidf.shape}")
print(f"Test X vector shape: {X_test_vectorized_tfidf.shape}")

Train X vector shape: (12516, 7523)
Test X vector shape: (5364, 7523)


In [54]:
X_train_tf = pd.DataFrame(X_train_vectorized.toarray(), 
                       columns=list(vectorizer.get_feature_names_out()))
X_test_tf = pd.DataFrame(X_test_vectorized.toarray(), 
                       columns=list(vectorizer.get_feature_names_out()))
print(X_test.shape)
print(X_train.shape)
X_train_tf.head()

(5364,)
(12516,)


Unnamed: 0,aaa,aan,abap,abc,abe,aberdeen,abilities,ability,able,aboard,...,στην,στο,στον,την,της,τις,τον,του,τους,των
0,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
X_train_tfidf = pd.DataFrame(X_train_vectorized_tfidf.toarray(), 
                       columns=list(tfidf_vectorizer.get_feature_names_out()))
X_test_tfidf = pd.DataFrame(X_test_vectorized_tfidf.toarray(), 
                       columns=list(tfidf_vectorizer.get_feature_names_out()))
print(X_test_tfidf.shape)
print(X_train_tfidf.shape)
X_train_tfidf.head()

(5364, 7523)
(12516, 7523)


Unnamed: 0,aaa,aan,abap,abc,abe,aberdeen,abilities,ability,able,aboard,...,στην,στο,στον,την,της,τις,τον,του,τους,των
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022798,0.029961,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027632,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066151,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053164,0.034934,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
y_train = y_train.reset_index()['fraudulent']
y_test = y_test.reset_index()['fraudulent']
print(y_train.shape)
print(y_test.shape)

(12516,)
(5364,)


## 1.5 Oversampling

In [21]:
# smote
from imblearn.over_sampling import SMOTE

In [62]:
sm = SMOTE()
X_train_tf_oversample, y_train_tf_oversample = sm.fit_resample(X_train_tf, y_train)

print(f"before oversampling, X_train shape is {X_train_tf.shape}\n"
      f"class distribution is {y_train.value_counts()}\n\n")
print(f"after oversampling, X_train shape is {X_train_tf_oversample.shape}\n"
      f"class distribution is {y_train_tf_oversample.value_counts()}")

before oversampling, X_train shape is (12516, 7523)
class distribution is 0    11915
1      601
Name: fraudulent, dtype: int64


after oversampling, X_train shape is (23830, 7523)
class distribution is 0    11915
1    11915
Name: fraudulent, dtype: int64


In [180]:
# sm = SMOTE()
# X_train_tfidf_oversample, y_train_tfidf_oversample = sm.fit_resample(X_train_tfidf, y_train)

# print(f"before oversampling, X_train shape is {X_train_tfidf.shape}\n"
#       f"class distribution is {y_train.value_counts()}\n\n")
# print(f"after oversampling, X_train shape is {X_train_tfidf_oversample.shape}\n"
#       f"class distribution is {y_train_tfidf_oversample.value_counts()}")

# 2.Machine Learning

## 2.1 Naive Bayes

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [79]:
param_grid = {}
clf = GridSearchCV(estimator=MultinomialNB(), 
                   cv=5, 
                   verbose=2, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf.fit(X_train_tf_oversample, y_train_tf_oversample)
prediction = clf.predict(X_test_tf)

print(f"best hyperparameter combination is {clf.best_params_}")
print(classification_report(y_true=y_test, y_pred=prediction))
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .................................................... total time=   2.4s
[CV] END .................................................... total time=   1.8s
[CV] END .................................................... total time=   1.8s
[CV] END .................................................... total time=   1.7s
[CV] END .................................................... total time=   1.8s
best hyperparameter combination is {}
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5099
           1       0.64      0.66      0.65       265

    accuracy                           0.97      5364
   macro avg       0.81      0.82      0.82      5364
weighted avg       0.97      0.97      0.97      5364

f1 score: 0.9653243847874721


In [78]:
# param_grid = {}
# clf = GridSearchCV(estimator=MultinomialNB(), 
#                    cv=5, 
#                    verbose=2, 
#                    return_train_score=True,
#                    param_grid = param_grid, 
#                    scoring='f1')

# clf.fit(X_train_tfidf, y_train)
# prediction = clf.predict(X_test_tfidf)

# print(f"best hyperparameter combination is {clf.best_params_}")
# print(classification_report(y_true=y_test, y_pred=prediction))
# print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .................................................... total time=   0.9s
[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.4s
best hyperparameter combination is {}
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      5099
           1       0.83      0.15      0.26       265

    accuracy                           0.96      5364
   macro avg       0.90      0.57      0.62      5364
weighted avg       0.95      0.96      0.94      5364

f1 score: 0.9565622669649515


In [80]:
baseline_score = clf.best_score_
naive_bayers_results = clf.cv_results_
baseline_score

0.918087652095257

In [81]:
naive_bayers_results

{'mean_fit_time': array([1.47557802]),
 'std_fit_time': array([0.25193304]),
 'mean_score_time': array([0.39532304]),
 'std_score_time': array([0.04155957]),
 'params': [{}],
 'split0_test_score': array([0.89626934]),
 'split1_test_score': array([0.91873741]),
 'split2_test_score': array([0.92369122]),
 'split3_test_score': array([0.92514904]),
 'split4_test_score': array([0.92659126]),
 'mean_test_score': array([0.91808765]),
 'std_test_score': array([0.01122529]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([0.92384993]),
 'split1_train_score': array([0.92041599]),
 'split2_train_score': array([0.92003564]),
 'split3_train_score': array([0.91945822]),
 'split4_train_score': array([0.92199212]),
 'mean_train_score': array([0.92115038]),
 'std_train_score': array([0.00159006])}

## 2.2 Logistic Regression

In [82]:
from sklearn.linear_model import LogisticRegression

In [86]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10]
}
clf = GridSearchCV(estimator=LogisticRegression(solver='liblinear'), 
                   cv=5, 
                   verbose=1, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf.fit(X_train_tf_oversample, y_train_tf_oversample)
prediction = clf.predict(X_test_tf)

print(classification_report(y_true=y_test, y_pred=prediction))
print(f"The best hyperparameter combination is {clf.best_params_}")
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      5099
           1       0.68      0.78      0.73       265

    accuracy                           0.97      5364
   macro avg       0.84      0.88      0.86      5364
weighted avg       0.97      0.97      0.97      5364

The best hyperparameter combination is {'C': 10, 'penalty': 'l1'}
f1 score: 0.97110365398956


In [87]:
lr_score = clf.best_score_
lr_results = clf.cv_results_
lr_score

0.9834385118781779

In [88]:
lr_results

{'mean_fit_time': array([4.03911037, 4.06896043, 3.87456856, 4.40937567, 3.94769773,
        4.39651937]),
 'std_fit_time': array([0.351865  , 0.05386685, 0.04524047, 0.09162839, 0.02736872,
        0.08329065]),
 'mean_score_time': array([0.26515698, 0.27542801, 0.26673293, 0.28063264, 0.26906457,
        0.27727313]),
 'std_score_time': array([0.00131104, 0.00165131, 0.00154419, 0.00999361, 0.00270336,
        0.00248358]),
 'param_C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1, 'penalty': 'l1'},
  {'C': 0.1, 'penalty': 'l2'},
  {'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'}],
 'split0_test_score

## 2.3 Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [89]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'criterion' : ["gini", "entropy"],
    'min_samples_split': [1, 3, 5],
    'max_depth': [3, 5]
}
clf = GridSearchCV(estimator=RandomForestClassifier(), 
                   cv=5, 
                   verbose=1, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf.fit(X_train_tf_oversample, y_train_tf_oversample)
prediction = clf.predict(X_test_tf)

print(f"best hyperparameter combination is {clf.best_params_}")
print(classification_report(y_true=y_test, y_pred=prediction))
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
best hyperparameter combination is {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 150}
              precision    recall  f1-score   support

           0       0.99      0.89      0.93      5099
           1       0.26      0.75      0.38       265

    accuracy                           0.88      5364
   macro avg       0.62      0.82      0.66      5364
weighted avg       0.95      0.88      0.91      5364

f1 score: 0.8797539149888143


In [90]:
rf_score = clf.best_score_
rf_results = clf.cv_results_
rf_score

0.9320661316355722

In [91]:
rf_results

{'mean_fit_time': array([0.6522902 , 0.49605579, 0.47188945, 1.88071504, 2.5033586 ,
        2.97038479, 1.82189198, 2.53180318, 2.91318698, 0.51320276,
        0.47473893, 0.48715892, 2.27243485, 3.19655962, 4.14696946,
        2.22776718, 3.1752089 , 4.13401713, 0.51286807, 0.46840405,
        0.4755939 , 1.78475156, 2.36041121, 2.97123108, 1.83189487,
        2.47026386, 3.18540959, 0.51248655, 0.47438698, 0.47407846,
        2.26245923, 3.33308969, 4.20951967, 2.27479815, 3.32456784,
        4.06133156]),
 'std_fit_time': array([0.3232905 , 0.00483761, 0.03276395, 0.05188049, 0.04691884,
        0.0509158 , 0.01187769, 0.18296531, 0.03496793, 0.0882728 ,
        0.01129006, 0.00558516, 0.0804941 , 0.03670326, 0.0789029 ,
        0.02905864, 0.02389559, 0.06495627, 0.09712839, 0.0112052 ,
        0.00337426, 0.03043663, 0.02873452, 0.0371392 , 0.02133699,
        0.05482426, 0.14374361, 0.09041033, 0.00991381, 0.02501446,
        0.06480905, 0.12230178, 0.21549596, 0.19908161, 0.381

## 2.4 SVM

In [31]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import SGDClassifier

In [33]:
std_scaler = StandardScaler(with_mean=False)
X_train_std = std_scaler.fit_transform(X_train_oversample)
X_test_std = std_scaler.transform(X_test)

In [34]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.1, 1, 10]
}

clf = GridSearchCV(estimator=svm.LinearSVC(), 
                   cv=5, 
                   verbose=1, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf.fit(X_train_std, y_train_oversample)
prediction = clf.predict(X_test_std)

print(f"best hyperparameter combination is {clf.best_params_}")
print(classification_report(y_true=y_test, y_pred=prediction))
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
best hyperparameter combination is {'C': 0.1, 'loss': 'squared_hinge', 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      5099
           1       0.64      0.76      0.69       265

    accuracy                           0.97      5364
   macro avg       0.81      0.87      0.84      5364
weighted avg       0.97      0.97      0.97      5364

f1 score: 0.9670022371364653


In [35]:
svm_score = clf.best_score_
svm_results = clf.cv_results_
svm_score

0.9815778430549728

In [36]:
svm_results

{'mean_fit_time': array([1.72225175, 3.54538708, 1.60157623, 3.27456918, 1.469417  ,
        3.16446538, 1.40662799, 3.74395399, 1.77651587, 3.16947885,
        1.48675423, 3.17754641]),
 'std_fit_time': array([0.58744554, 0.45161658, 0.03262212, 0.19224978, 0.18739926,
        0.19033937, 0.05642211, 0.51617773, 0.18504499, 0.09306771,
        0.2100981 , 0.14031444]),
 'mean_score_time': array([0.        , 0.19428387, 0.        , 0.15729094, 0.        ,
        0.13770781, 0.        , 0.19705043, 0.        , 0.1796556 ,
        0.        , 0.17318387]),
 'std_score_time': array([0.        , 0.00877338, 0.        , 0.0601462 , 0.        ,
        0.04803651, 0.        , 0.04557831, 0.        , 0.03090336,
        0.        , 0.02312606]),
 'param_C': masked_array(data=[0.1, 0.1, 0.1, 0.1, 1, 1, 1, 1, 10, 10, 10, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=ob

## 2.5 Gradient Boosting

In [92]:
from sklearn.ensemble import GradientBoostingClassifier

In [93]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5]
}

clf = GridSearchCV(estimator=GradientBoostingClassifier(), 
                   cv=5, 
                   verbose=1, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf.fit(X_train_tf_oversample, y_train_tf_oversample)
prediction = clf.predict(X_test_tf)

print(f"best hyperparameter combination is {clf.best_params_}")
print(classification_report(y_true=y_test, y_pred=prediction))
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
best hyperparameter combination is {'max_depth': 5, 'n_estimators': 150}
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      5099
           1       0.54      0.75      0.63       265

    accuracy                           0.96      5364
   macro avg       0.76      0.86      0.80      5364
weighted avg       0.97      0.96      0.96      5364

f1 score: 0.9563758389261745


In [94]:
gdbt_score = clf.best_score_
gdbt_results = clf.cv_results_
gdbt_score

0.9777073441987966

In [95]:
gdbt_results

{'mean_fit_time': array([ 93.31091456, 181.03842497, 263.37414088, 155.56636825,
        293.78203979, 430.09398513]),
 'std_fit_time': array([ 1.90579851,  6.99096036, 10.34753195,  2.20113414,  2.7492553 ,
         0.8508994 ]),
 'mean_score_time': array([0.45119538, 0.48705516, 0.4758235 , 0.50608449, 0.47834415,
        0.48675141]),
 'std_score_time': array([0.01861023, 0.05017416, 0.01204725, 0.1186551 , 0.01056945,
        0.01913926]),
 'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[50, 100, 150, 50, 100, 150],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 3, 'n_estimators': 50},
  {'max_depth': 3, 'n_estimators': 100},
  {'max_depth': 3, 'n_estimators': 150},
  {'max_depth': 5, 'n_estimators': 50},
  {'max_depth': 5, 'n

## 2.6 Gradient Boosting + LR

In [126]:
from sklearn.preprocessing import OneHotEncoder

In [97]:
# refit the train data with the best hyperparameter combination
clf = GradientBoostingClassifier(**clf.best_params_)
clf.fit(X_train_tf_oversample, y_train_tf_oversample)

In [132]:
# using one-hot encode for each leaf node
encoder = OneHotEncoder(sparse_output=False)


X_train_gbdt_lr = clf.apply(X_train_tf_oversample)
X_train_gbdt_lr = np.resize(X_train_gbdt_lr, new_shape=(X_train_gbdt_lr.shape[0], X_train_gbdt_lr.shape[1]))
X_test_gbdt_lr = clf.apply(X_test_tf)
X_test_gbdt_lr = np.resize(X_test_gbdt_lr, new_shape=(X_test_gbdt_lr.shape[0], X_test_gbdt_lr.shape[1]))
print(f"before encoding, X_train shape is {X_train_gbdt_lr.shape}, X_test shape is {X_test_gbdt_lr.shape}\n")

encoder.fit(X_train_gbdt_lr)
X_train_gbdt_lr = encoder.transform(X_train_gbdt_lr)
X_test_gbdt_lr = encoder.transform(X_test_gbdt_lr)
print(f"after encoding, X_train shape is {X_train_gbdt_lr.shape}, X_test shape is {X_test_gbdt_lr.shape}\n")
display(X_train_gbdt_lr)

before encoding, X_train shape is (23830, 150), X_test shape is (5364, 150)

after encoding, X_train shape is (23830, 3661), X_test shape is (5364, 3661)



array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [133]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10]
}
clf_gbdtlr = GridSearchCV(estimator=LogisticRegression(solver='liblinear'), 
                   cv=5, 
                   verbose=1, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf_gbdtlr.fit(X_train_gbdt_lr, y_train_tf_oversample)
prediction = clf_gbdtlr.predict(X_test_gbdt_lr)

print(classification_report(y_true=y_test, y_pred=prediction))
print(f"The best hyperparameter combination is {clf_gbdtlr.best_params_}")
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5099
           1       0.72      0.80      0.76       265

    accuracy                           0.97      5364
   macro avg       0.85      0.89      0.87      5364
weighted avg       0.98      0.97      0.98      5364

The best hyperparameter combination is {'C': 1, 'penalty': 'l2'}
f1 score: 0.9748322147651006


In [134]:
gdbtlr_score = clf_gbdtlr.best_score_
gdbtlr_results = clf_gbdtlr.cv_results_
gdbtlr_score

0.9895640014895666

In [135]:
gdbtlr_results

{'mean_fit_time': array([1.66338835, 1.60718136, 2.3946672 , 1.52333679, 2.19900556,
        2.06813631]),
 'std_fit_time': array([0.64160391, 0.57439602, 0.70339418, 0.05231434, 0.82123685,
        0.24720832]),
 'mean_score_time': array([0.05520043, 0.0464838 , 0.10018473, 0.02916121, 0.03062458,
        0.0398097 ]),
 'std_score_time': array([0.0354689 , 0.03464485, 0.06539136, 0.00149895, 0.00632173,
        0.01751285]),
 'param_C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1, 'penalty': 'l1'},
  {'C': 0.1, 'penalty': 'l2'},
  {'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'}],
 'split0_test_score

## 2.7 XGBoost

In [37]:
import xgboost as xgb

In [137]:
param_grid = {
    'n_estimators': [100, 150],
    'max_depth': [3, 5],
    'reg_lambda': [1, 5, 10]
}

clf = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, 
                                               objective="binary:logistic",
                                               booster="gbtree",
                                               eval_metric="logloss"), 
                   cv=5, 
                   verbose=1, 
                   return_train_score=True,
                   param_grid = param_grid, 
                   scoring='f1')

clf.fit(X_train_tf_oversample, y_train_tf_oversample)
prediction = clf.predict(X_test_tf)

print(f"best hyperparameter combination is {clf.best_params_}")
print(classification_report(y_true=y_test, y_pred=prediction))
print(f"f1 score: {f1_score(y_true=y_test, y_pred=prediction, average='micro')}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
best hyperparameter combination is {'max_depth': 5, 'n_estimators': 150, 'reg_lambda': 1}
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      5099
           1       0.54      0.76      0.64       265

    accuracy                           0.96      5364
   macro avg       0.77      0.86      0.81      5364
weighted avg       0.97      0.96      0.96      5364

f1 score: 0.9567486950037286


In [175]:
xgb_score = clf.best_score_
xgb_results = clf.cv_results_
xgb_score

0.9783376321304704

In [176]:
xgb_results

{'mean_fit_time': array([187.17223434, 186.21531672, 186.7766057 , 223.25526872,
        107.92029252, 113.4825582 , 116.79839497, 267.76583524,
        296.29438653, 435.71321864, 385.10359864, 884.64427557]),
 'std_fit_time': array([3.63642034e+00, 3.97524259e+00, 1.32898152e+00, 7.14689018e+01,
        5.74602190e-01, 2.29742252e+00, 6.66567264e-01, 7.71175629e+01,
        6.51866786e-01, 4.10799727e+00, 1.03019195e+02, 1.03520386e+03]),
 'mean_score_time': array([0.71578207, 0.77206082, 0.7564858 , 0.66263413, 0.45653415,
        0.47511182, 0.4615685 , 0.74335141, 0.84384332, 0.75731463,
        0.67215424, 0.68723798]),
 'std_score_time': array([0.02440968, 0.07818638, 0.09236215, 0.20281622, 0.00645172,
        0.03069898, 0.02439921, 0.17030588, 0.10954355, 0.11481576,
        0.10533283, 0.12558737]),
 'param_max_depth': masked_array(data=[3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False,

# 3.Summary

In [154]:
def mdoel_performance_in_5folds(fold, cv_results):
    test_scores = []
    idx = cv_results['rank_test_score'].argmin()
    
    for i in range(fold):
        test_scores.append(cv_results[f"split{i}_test_score"][idx])
    
    return np.array(test_scores)

In [177]:
results = [naive_bayers_results, lr_results, svm_results, rf_results, gdbt_results, gdbtlr_results, xgb_results]


In [178]:
results = [naive_bayers_results, lr_results, svm_results, rf_results, gdbt_results, gdbtlr_results, xgb_results]
performance = []

for r in results:
    performance.append(mdoel_performance_in_5folds(5, r))

performance = np.array(performance)
performance = performance.transpose()

performance_table = pd.DataFrame(data=performance,
                                columns=['NaiveBayes', 'LogisticRegression','SVM','RandomForest','GBDT','GBDT+LR','XGB'],
                                index=[f"Fold {i}" for i in range(5)])

performance_table

Unnamed: 0,NaiveBayes,LogisticRegression,SVM,RandomForest,GBDT,GBDT+LR,XGB
Fold 0,0.896269,0.964852,0.961603,0.914907,0.962728,0.973273,0.962428
Fold 1,0.918737,0.991471,0.985942,0.937639,0.983898,0.995403,0.985729
Fold 2,0.923691,0.988592,0.98846,0.939137,0.981062,0.992914,0.982043
Fold 3,0.925149,0.984711,0.984893,0.93352,0.980021,0.993535,0.980449
Fold 4,0.926591,0.987567,0.986991,0.935128,0.980829,0.992695,0.981039


In [179]:
performance_table.style.format("{:.2%}")

Unnamed: 0,NaiveBayes,LogisticRegression,SVM,RandomForest,GBDT,GBDT+LR,XGB
Fold 0,89.63%,96.49%,96.16%,91.49%,96.27%,97.33%,96.24%
Fold 1,91.87%,99.15%,98.59%,93.76%,98.39%,99.54%,98.57%
Fold 2,92.37%,98.86%,98.85%,93.91%,98.11%,99.29%,98.20%
Fold 3,92.51%,98.47%,98.49%,93.35%,98.00%,99.35%,98.04%
Fold 4,92.66%,98.76%,98.70%,93.51%,98.08%,99.27%,98.10%
