### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import html
import joblib
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.pipeline import make_pipeline
from gensim.parsing.porter import PorterStemmer

# modelling
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuhao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Helper class

In [2]:
def evaluate_model(model, X, y, label):
    """
    :param model: model to evaluate
    :param X: features
    :param y: target
    :param label: label for the model 

    """
    y_pred = model.predict(X)

    print(label + ' Set')
    print("Accuracy:", accuracy_score(y, y_pred))
    print("F1 Score:", f1_score(y, y_pred, average='macro'))
    print()

    print("Classification Report")


    print(classification_report(y, y_pred, digits=4))
    


def get_score(model, X, y):
    """
    :param model: model to evaluate
    :param X: features
    :param y: target

    """
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    print('Accuracy: ', cross_val_score(model, X, y, cv=cv, scoring='accuracy').mean())
    print('Precision Macro: ', cross_val_score(model, X, y, cv=cv, scoring='precision_macro').mean())
    print('Recall Macro: ', cross_val_score(model, X, y, cv=cv, scoring='recall_macro').mean())
    print('F1 Macro: ', cross_val_score(model, X, y, cv=cv, scoring='f1_macro').mean())
    
def compress_file(input_file, output_tar_gz):
    shutil.make_archive(output_tar_gz, 'xztar', '.', input_file)

In [3]:
df = pd.read_csv('../datasets/emscad_v1.csv')

In [4]:
df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'in_balanced_dataset'],
      dtype='object')

### Data Cleaning

In [5]:
df = df[['description', 'requirements', 'benefits', 'fraudulent']].fillna('')

In [6]:
df["feature"] = df['description'] + " "+ df['requirements'] + " " + df['benefits']

In [7]:
df = df[['feature', 'fraudulent']]

In [8]:
df.head(5)

Unnamed: 0,feature,fraudulent
0,"<p>Food52, a fast-growing, James Beard Award-w...",f
1,<p>Organised - Focused - Vibrant - Awesome!<br...,f
2,"<p>Our client, located in Houston, is actively...",f
3,<p><b>THE COMPANY: ESRI – Environmental System...,f
4,<p><b>JOB TITLE:</b> Itemization Review Manage...,f


In [9]:
df['feature'] = df['feature'].str.lower()
df.head(5)

Unnamed: 0,feature,fraudulent
0,"<p>food52, a fast-growing, james beard award-w...",f
1,<p>organised - focused - vibrant - awesome!<br...,f
2,"<p>our client, located in houston, is actively...",f
3,<p><b>the company: esri – environmental system...,f
4,<p><b>job title:</b> itemization review manage...,f


In [10]:
def remove_html_tags_and_escape_chars(input_text):
    # Remove HTML tags
    text_without_html = BeautifulSoup(input_text, 'html.parser').get_text()

    # Unescape HTML characters
    text_without_escape_chars = html.unescape(text_without_html)

    return text_without_escape_chars

In [11]:
df['feature'] = df['feature'].apply(remove_html_tags_and_escape_chars)
df.head(5)

Unnamed: 0,feature,fraudulent
0,"food52, a fast-growing, james beard award-winn...",f
1,organised - focused - vibrant - awesome!do you...,f
2,"our client, located in houston, is actively se...",f
3,the company: esri – environmental systems rese...,f
4,job title: itemization review manager\nlocatio...,f


In [13]:
def remove_non_alpha(input_text):
    return ''.join(char if char.isalpha() or char.isspace() else ' ' for char in input_text)

In [14]:
df['feature'] = df['feature'].apply(remove_non_alpha)
df.head(5)

Unnamed: 0,feature,fraudulent
0,food a fast growing james beard award winn...,f
1,organised focused vibrant awesome do you...,f
2,our client located in houston is actively se...,f
3,the company esri environmental systems rese...,f
4,job title itemization review manager\nlocatio...,f


In [15]:
# tokenise
df['feature'] = df['feature'].apply(lambda x: word_tokenize(x.lower()))

In [16]:
df.head(5)

Unnamed: 0,feature,fraudulent
0,"[food, a, fast, growing, james, beard, award, ...",f
1,"[organised, focused, vibrant, awesome, do, you...",f
2,"[our, client, located, in, houston, is, active...",f
3,"[the, company, esri, environmental, systems, r...",f
4,"[job, title, itemization, review, manager, loc...",f


In [17]:
# remove stopwords
all_stopwords = set(stopwords.words('english'))
all_stopwords.update(['\\r\\n'])
df['feature'] = df['feature'].apply(lambda x: [word for word in x if word not in all_stopwords])

In [18]:
df.head(5)

Unnamed: 0,feature,fraudulent
0,"[food, fast, growing, james, beard, award, win...",f
1,"[organised, focused, vibrant, awesome, passion...",f
2,"[client, located, houston, actively, seeking, ...",f
3,"[company, esri, environmental, systems, resear...",f
4,"[job, title, itemization, review, manager, loc...",f


In [19]:
# stem words
df['feature'] = df['feature'].apply(lambda x: [PorterStemmer().stem(word) for word in x])

In [20]:
df.head(5)

Unnamed: 0,feature,fraudulent
0,"[food, fast, grow, jame, beard, award, win, on...",f
1,"[organis, focus, vibrant, awesom, passion, cus...",f
2,"[client, locat, houston, activ, seek, experien...",f
3,"[compani, esri, environment, system, research,...",f
4,"[job, titl, item, review, manag, locat, fort, ...",f


In [21]:
df['feature'] = df['feature'].apply(lambda x: [word for word in x if len(word) >= 3])

In [22]:
df.head(5)

Unnamed: 0,feature,fraudulent
0,"[food, fast, grow, jame, beard, award, win, on...",f
1,"[organis, focus, vibrant, awesom, passion, cus...",f
2,"[client, locat, houston, activ, seek, experien...",f
3,"[compani, esri, environment, system, research,...",f
4,"[job, titl, item, review, manag, locat, fort, ...",f


In [23]:
df['feature'] = df['feature'].apply(lambda x: ' '.join(x))

In [24]:
# drop rows wwith empty str
df = df[df['feature'] != '']

In [25]:
df['fraudulent'] = df['fraudulent'].apply(lambda x: 1 if x == "t" else 0)

In [26]:
df.head(5)

Unnamed: 0,feature,fraudulent
0,food fast grow jame beard award win onlin food...,0
1,organis focus vibrant awesom passion custom se...,0
2,client locat houston activ seek experienc comm...,0
3,compani esri environment system research insti...,0
4,job titl item review manag locat fort worth de...,0


### Feature extraction using tf-idf

In [100]:
# Fit and transform the text data using TF-IDF
tfidf = TfidfVectorizer()
dtm = tfidf.fit_transform(df['feature'])

### Dimensionsality reduction using SVD <br>
This removes the less important variables in my dataset and improves training speed.

In [102]:
dimension = 500
svd = TruncatedSVD(dimension, random_state=42)
dtm_svd = svd.fit_transform(dtm)
# Apply Normalizer to normalize the data
dtm_svd_normalized = Normalizer(copy=False)
dtm_svd_normalized = dtm_svd_normalized.fit_transform(dtm_svd)

In [103]:
x = pd.DataFrame(dtm_svd)
x.reset_index(inplace=True, drop=True)
y = df['fraudulent']

In [None]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

### Modelling

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [None]:
evaluate_model(rf, x_train, y_train, 'Train')
evaluate_model(rf, x_test, y_test, 'Test')

In [None]:
# use SMOTETomek to oversample the minority class
x_res, y_res = SMOTETomek(sampling_strategy='all', random_state=42).fit_resample(x, y)

In [None]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [None]:
evaluate_model(rf, x_train, y_train, 'Train')
evaluate_model(rf, x_test, y_test, 'Test')

### Create a pipeline for the model

In [28]:
# make pipeline
tfidf = TfidfVectorizer()
svd = TruncatedSVD(n_components=350, random_state=42)
norm = Normalizer(copy=False)
smote = SMOTETomek(sampling_strategy='all', random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
pipe = make_pipeline(tfidf, svd, norm, smote, rf)
x_train, x_test, y_train, y_test = train_test_split(
    df['feature'], df['fraudulent'], test_size=0.2, random_state=42, stratify=df['fraudulent'])

In [29]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('truncatedsvd',
                 TruncatedSVD(n_components=350, random_state=42)),
                ('normalizer', Normalizer(copy=False)),
                ('smotetomek',
                 SMOTETomek(random_state=42, sampling_strategy='all')),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=42))])

In [30]:
evaluate_model(pipe, x_train, y_train, 'Train')
evaluate_model(pipe, x_test, y_test, 'Test')

Train Set
Accuracy: 0.9999300845976369
F1 Score: 0.9996201002783337

Classification Report
              precision    recall  f1-score   support

           0     0.9999    1.0000    1.0000     13611
           1     1.0000    0.9986    0.9993       692

    accuracy                         0.9999     14303
   macro avg     1.0000    0.9993    0.9996     14303
weighted avg     0.9999    0.9999    0.9999     14303

Test Set
Accuracy: 0.9818232662192393
F1 Score: 0.8812321323185639

Classification Report
              precision    recall  f1-score   support

           0     0.9818    0.9994    0.9905      3403
           1     0.9821    0.6358    0.7719       173

    accuracy                         0.9818      3576
   macro avg     0.9820    0.8176    0.8812      3576
weighted avg     0.9818    0.9818    0.9800      3576



In [31]:
joblib.dump(pipe, '../models/rf.pkl')

['../models/rf.pkl']