### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import joblib
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.pipeline import make_pipeline
from gensim.parsing.porter import PorterStemmer

# modelling
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ifish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Helper class

In [2]:
def evaluate_model(model, X, y, label):
    """
    :param model: model to evaluate
    :param X: features
    :param y: target
    :param label: label for the model 

    """
    y_pred = model.predict(X)

    print(label + ' Set')
    print("Accuracy:", accuracy_score(y, y_pred))
    print("F1 Score:", f1_score(y, y_pred, average='macro'))
    print()

    print("Classification Report")


    print(classification_report(y, y_pred, digits=4))
    


def get_score(model, X, y):
    """
    :param model: model to evaluate
    :param X: features
    :param y: target

    """
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    print('Accuracy: ', cross_val_score(model, X, y, cv=cv, scoring='accuracy').mean())
    print('Precision Macro: ', cross_val_score(model, X, y, cv=cv, scoring='precision_macro').mean())
    print('Recall Macro: ', cross_val_score(model, X, y, cv=cv, scoring='recall_macro').mean())
    print('F1 Macro: ', cross_val_score(model, X, y, cv=cv, scoring='f1_macro').mean())
    
def compress_file(input_file, output_tar_gz):
    shutil.make_archive(output_tar_gz, 'xztar', '.', input_file)

In [3]:
df = pd.read_csv('../datasets/emscad_v1.csv')

In [4]:
df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'in_balanced_dataset'],
      dtype='object')

### Data Cleaning

In [5]:
df = df[['description', 'requirements', 'benefits', 'fraudulent']].fillna('')

In [6]:
df["feature"] = df['description'] + " "+ df['requirements'] + " " + df['benefits']

In [7]:
df['feature'] = df['feature'].str.lower()
# remove html tags and word that start with & and \
df['feature'] = df['feature'].str.replace(r'<[^>]*>', '')
df['feature'] = df['feature'].str.replace(r'&[^;]*;', '')
df['feature'] = df['feature'].str.replace(r'\\[a-z]*', '')
# remove punctuation
df['feature'] = df['feature'].str.replace(r'[^\w\s]', '')
# remove digits
df['feature'] = df['feature'].str.replace(r'\d+', '')
# remove whitespace
df['feature'] = df['feature'].str.replace(r'\s+', ' ')

  df['feature'] = df['feature'].str.replace(r'<[^>]*>', '')
  df['feature'] = df['feature'].str.replace(r'&[^;]*;', '')
  df['feature'] = df['feature'].str.replace(r'\\[a-z]*', '')
  df['feature'] = df['feature'].str.replace(r'[^\w\s]', '')
  df['feature'] = df['feature'].str.replace(r'\d+', '')
  df['feature'] = df['feature'].str.replace(r'\s+', ' ')


In [8]:
# tokenise
df['feature'] = df['feature'].apply(lambda x: word_tokenize(x.lower()))

In [9]:
df = df[['feature', 'fraudulent']]

In [10]:
# remove stopwords
all_stopwords = set(stopwords.words('english'))
all_stopwords.update(['\\r\\n'])
df['feature'] = df['feature'].apply(lambda x: [word for word in x if word not in all_stopwords])

In [11]:
# stem words
df['feature'] = df['feature'].apply(lambda x: [PorterStemmer().stem(word) for word in x])

In [12]:
df['feature'] = df['feature'].apply(lambda x: [word for word in x if len(word) >= 3])

In [13]:
df['feature'] = df['feature'].apply(lambda x: ' '.join(x))

In [14]:
# drop rows wwith empty str
df = df[df['feature'] != '']

In [15]:
df['fraudulent'] = df['fraudulent'].apply(lambda x: 1 if x == "t" else 0)

### Feature extraction using tf-idf

In [None]:
# Fit and transform the text data using TF-IDF
tfidf = TfidfVectorizer()
dtm = tfidf.fit_transform(df['feature'])

### Dimensionsality reduction using SVD <br>
This removes the less important variables in my dataset and improves training speed.

In [None]:
dimension = 500
svd = TruncatedSVD(dimension, random_state=42)
dtm_svd = svd.fit_transform(dtm)
# Apply Normalizer to normalize the data
dtm_svd_normalized = Normalizer(copy=False)
dtm_svd_normalized = dtm_svd_normalized.fit_transform(dtm_svd)

In [None]:
x = pd.DataFrame(dtm_svd)
x.reset_index(inplace=True, drop=True)
y = df['fraudulent']

In [None]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

### Modelling

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [None]:
evaluate_model(rf, x_train, y_train, 'Train')
evaluate_model(rf, x_test, y_test, 'Test')

In [None]:
# use SMOTETomek to oversample the minority class
x_res, y_res = SMOTETomek(sampling_strategy='all', random_state=42).fit_resample(x, y)

In [None]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [None]:
evaluate_model(rf, x_train, y_train, 'Train')
evaluate_model(rf, x_test, y_test, 'Test')

### Create a pipeline for the model

In [19]:
# make pipeline
tfidf = TfidfVectorizer()
svd = TruncatedSVD(n_components=350, random_state=42)
norm = Normalizer(copy=False)
smote = SMOTETomek(sampling_strategy='all', random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
pipe = make_pipeline(tfidf, svd, norm, smote, rf)
x_train, x_test, y_train, y_test = train_test_split(
    df['feature'], df['fraudulent'], test_size=0.2, random_state=42, stratify=df['fraudulent'])

In [20]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('truncatedsvd',
                 TruncatedSVD(n_components=350, random_state=42)),
                ('normalizer', Normalizer(copy=False)),
                ('smotetomek',
                 SMOTETomek(random_state=42, sampling_strategy='all')),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=42))])

In [21]:
evaluate_model(pipe, x_train, y_train, 'Train')
evaluate_model(pipe, x_test, y_test, 'Test')

Train Set
Accuracy: 0.9999300845976369
F1 Score: 0.9996201002783337

Classification Report
              precision    recall  f1-score   support

           0     0.9999    1.0000    1.0000     13611
           1     1.0000    0.9986    0.9993       692

    accuracy                         0.9999     14303
   macro avg     1.0000    0.9993    0.9996     14303
weighted avg     0.9999    0.9999    0.9999     14303

Test Set
Accuracy: 0.982662192393736
F1 Score: 0.887844794094794

Classification Report
              precision    recall  f1-score   support

           0     0.9827    0.9994    0.9910      3403
           1     0.9826    0.6532    0.7847       173

    accuracy                         0.9827      3576
   macro avg     0.9826    0.8263    0.8878      3576
weighted avg     0.9827    0.9827    0.9810      3576



In [30]:
joblib.dump(pipe, '../models/rf.pkl')

['../models/rf.pkl']