### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import html
import joblib
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
from imblearn.pipeline import make_pipeline
from gensim.parsing.porter import PorterStemmer

# modelling
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
# Tokeniser models required by word_tokenize.
nltk.download('punkt')
# Stop-word lists read later through stopwords.words('english'); without this
# corpus that call raises LookupError on a fresh environment.
nltk.download('stopwords')

### Helper functions

In [None]:
def evaluate_model(model, X, y, label):
    """
    Print accuracy, macro-averaged F1 and a per-class report for a fitted model.

    :param model: fitted estimator exposing ``predict``
    :param X: features to predict on
    :param y: true targets corresponding to ``X``
    :param label: name of the data split, printed as ``"<label> Set"``

    """
    predictions = model.predict(X)

    header = label + ' Set'
    print(header)

    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)

    macro_f1 = f1_score(y, predictions, average='macro')
    print("F1 Score:", macro_f1)
    print()

    print("Classification Report")
    report = classification_report(y, predictions, digits=4)
    print(report)
    


def get_score(model, X, y):
    """
    Print mean cross-validated accuracy and macro precision / recall / F1.

    Uses 5-fold stratified cross-validation repeated 3 times (fixed seed).
    All four metrics are computed in a single ``cross_validate`` run, so the
    model is fitted once per fold instead of once per fold *per metric*.

    :param model: unfitted estimator to evaluate (cloned for every fold)
    :param X: features
    :param y: target

    """
    # Local import: cross_validate is not part of the notebook's import cell.
    from sklearn.model_selection import cross_validate

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

    # Printed title -> scikit-learn scorer name (dict order = print order).
    scorers = {
        'Accuracy: ': 'accuracy',
        'Precision Macro: ': 'precision_macro',
        'Recall Macro: ': 'recall_macro',
        'F1 Macro: ': 'f1_macro',
    }
    results = cross_validate(model, X, y, cv=cv, scoring=list(scorers.values()))

    for title, scorer in scorers.items():
        # cross_validate stores per-fold scores under "test_<scorer name>".
        print(title, results['test_' + scorer].mean())
    
def compress_file(input_file, output_tar_gz):
    """
    Pack ``input_file`` into an xz-compressed tar archive.

    :param input_file: path (relative to the current directory) to archive
    :param output_tar_gz: archive base name without extension; despite the
        parameter name, the 'xztar' format is used, so shutil appends
        ``.tar.xz``
    """
    shutil.make_archive(
        base_name=output_tar_gz,
        format='xztar',
        root_dir='.',
        base_dir=input_file,
    )

In [None]:
# Load the EMSCAD CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('../datasets/emscad_v1.csv')

In [None]:
# Inspect the available column names.
df.columns

### Data Cleaning

In [None]:
# Keep the three free-text columns plus the target, and replace missing values
# with '' so the string concatenation in the next step never yields NaN.
df = df[['description', 'requirements', 'benefits', 'fraudulent']].fillna('')

In [None]:
# Merge the three text columns into one space-separated document per row.
df["feature"] = df['description'] + " "+ df['requirements'] + " " + df['benefits']

In [None]:
# Keep only the model input and the target. ``.copy()`` makes this an
# independent frame, so the later ``df['feature'] = ...`` assignments do not
# raise SettingWithCopyWarning.
df = df[['feature', 'fraudulent']].copy()

In [None]:
# Preview the combined text column and target.
df.head(5)

In [None]:
# Lower-case the text so that later token matching is case-insensitive.
df['feature'] = df['feature'].str.lower()
# Preview the result.
df.head(5)

In [None]:
def remove_html_tags_and_escape_chars(input_text):
    """
    Strip HTML markup from ``input_text`` and decode HTML character references.

    :param input_text: raw text that may contain HTML tags and entities
    :return: plain text with tags removed and entities unescaped
    """
    # Remove HTML tags. separator=' ' keeps the text of adjacent elements
    # apart: "<li>python</li><li>sql</li>" becomes "python sql" instead of the
    # single bogus token "pythonsql". Extra whitespace is harmless because the
    # text is tokenised afterwards.
    text_without_html = BeautifulSoup(input_text, 'html.parser').get_text(separator=' ')

    # Unescape HTML characters that survived parsing (e.g. double-escaped ones)
    text_without_escape_chars = html.unescape(text_without_html)

    return text_without_escape_chars

In [None]:
# Strip HTML tags and decode HTML entities in every document.
df['feature'] = df['feature'].apply(remove_html_tags_and_escape_chars)
# Preview the result.
df.head(5)

In [None]:
def remove_non_alpha(input_text):
    """
    Replace every character that is neither a letter nor whitespace with a space.

    The output has the same length as the input: digits, punctuation and
    symbols each become a single ' ' rather than being dropped.

    :param input_text: string to clean
    :return: cleaned string
    """
    cleaned = []
    for symbol in input_text:
        keep = symbol.isalpha() or symbol.isspace()
        cleaned.append(symbol if keep else ' ')
    return ''.join(cleaned)

In [None]:
# Blank out digits, punctuation and other non-letter characters.
df['feature'] = df['feature'].apply(remove_non_alpha)
# Preview the result.
df.head(5)

In [None]:
# tokenise: lower-case each document, then split it into a list of word tokens
lowered_text = df['feature'].str.lower()
df['feature'] = lowered_text.apply(word_tokenize)

In [None]:
# Preview the token lists.
df.head(5)

In [None]:
# remove stopwords
# English stop words plus the literal four-character token backslash-r-backslash-n.
all_stopwords = set(stopwords.words('english')) | {'\\r\\n'}


def _drop_stopwords(tokens):
    # Keep tokens that are not in the stop-word set, preserving order.
    return [token for token in tokens if token not in all_stopwords]


df['feature'] = df['feature'].apply(_drop_stopwords)

In [None]:
# Preview the token lists after stop-word removal.
df.head(5)

In [None]:
# stem words
# Build the stemmer once and reuse it; constructing a new PorterStemmer for
# every single token adds needless overhead across the whole corpus.
stemmer = PorterStemmer()
df['feature'] = df['feature'].apply(lambda x: [stemmer.stem(word) for word in x])

In [None]:
# Preview the stemmed tokens.
df.head(5)

In [None]:
# Discard stems shorter than three characters.
df['feature'] = df['feature'].apply(
    lambda tokens: [token for token in tokens if len(token) > 2]
)

In [None]:
# Preview the tokens after dropping short stems.
df.head(5)

In [None]:
# Re-assemble each token list into one space-separated string for the vectoriser.
df['feature'] = df['feature'].apply(' '.join)

In [None]:
# drop rows with an empty string (nothing survived the cleaning steps);
# .copy() detaches the filtered frame so the next column assignment does not
# raise SettingWithCopyWarning
df = df[df['feature'] != ''].copy()

In [None]:
# Encode the target as integers: "t" -> 1, anything else -> 0.
is_fraud = df['fraudulent'] == "t"
df['fraudulent'] = is_fraud.astype('int64')

In [None]:
# Preview the cleaned text alongside the encoded target.
df.head(5)

### Feature extraction using tf-idf

In [None]:
# Fit and transform the text data using TF-IDF
# NOTE(review): the vectoriser is fitted on the full dataset, before any
# train/test split, so vocabulary and IDF weights also see the test rows.
tfidf = TfidfVectorizer()
dtm = tfidf.fit_transform(df['feature'])

### Dimensionality reduction using SVD <br>
This projects the TF-IDF features onto a smaller number of components, dropping the less informative dimensions of the dataset and improving training speed.

In [None]:
# Number of SVD components to keep.
dimension = 350
svd = TruncatedSVD(n_components=dimension, random_state=42)
dtm_svd = svd.fit_transform(dtm)

# Rescale every row with scikit-learn's Normalizer.
# NOTE(review): with copy=False the normalisation may be performed in place,
# in which case dtm_svd itself ends up normalised as well - confirm.
normalizer = Normalizer(copy=False)
dtm_svd_normalized = normalizer.fit_transform(dtm_svd)

In [None]:
# Build the feature frame from the normalised matrix explicitly, instead of
# relying on Normalizer(copy=False) having modified dtm_svd in place.
x = pd.DataFrame(dtm_svd_normalized)
x.reset_index(inplace=True, drop=True)
# Rows were filtered out of df earlier, so its index may contain gaps; reset it
# so that x and y share the same 0..n-1 index.
y = df['fraudulent'].reset_index(drop=True)

In [None]:
# train test split: 80/20, stratified on the target so both parts keep the
# same class proportions; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

### Modelling

In [None]:
# Baseline: random forest with 100 trees trained on the un-resampled split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [None]:
# Report metrics on both splits; a large train/test gap indicates overfitting.
evaluate_model(rf, x_train, y_train, 'Train')
evaluate_model(rf, x_test, y_test, 'Test')

In [None]:
# train test split FIRST, so the held-out test set contains only real samples.
# Resampling before the split would put synthetic SMOTE points (interpolated
# from training neighbours) into the test set and inflate the test scores.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# use SMOTETomek to rebalance the TRAINING portion only: SMOTE oversampling
# combined with Tomek-link cleaning (sampling_strategy='all' targets every class)
x_res, y_res = SMOTETomek(sampling_strategy='all', random_state=42).fit_resample(x_train, y_train)
x_train, y_train = x_res, y_res

In [None]:
# Refit a fresh random forest on the training data produced after SMOTETomek resampling.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

In [None]:
# Report metrics for the forest trained on resampled data.
evaluate_model(rf, x_train, y_train, 'Train')
evaluate_model(rf, x_test, y_test, 'Test')

### Create a pipeline for the model

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy,
# so changes made through either name are visible through both.
test_df = df

In [None]:
# make pipeline
# Steps, in order: TF-IDF -> truncated SVD (350 components) -> SMOTETomek
# resampling -> row normalisation -> random forest (300 trees).
# make_pipeline here is imblearn's, which accepts sampler steps.
tfidf = TfidfVectorizer()
svd = TruncatedSVD(n_components=350, random_state=42)
smote = SMOTETomek(sampling_strategy='all', random_state=42)
norm = Normalizer(copy=False)
rf = RandomForestClassifier(n_estimators=300, random_state=42)
pipe = make_pipeline(tfidf, svd, smote, norm, rf)
# Split the cleaned text (not the pre-computed features) so every pipeline step
# is fitted on the training portion only.
x_train, x_test, y_train, y_test = train_test_split(
    df['feature'], df['fraudulent'], test_size=0.2, random_state=42, stratify=df['fraudulent'])

In [None]:
# Fit every pipeline step on the training text and labels.
pipe.fit(x_train, y_train)

In [None]:
# Report metrics for the full pipeline on both splits.
evaluate_model(pipe, x_train, y_train, 'Train')
evaluate_model(pipe, x_test, y_test, 'Test')

In [None]:
# Persist the fitted pipeline. The pipeline was trained on the already-cleaned
# 'feature' text, so the cleaning steps above are NOT part of the saved object
# and must be applied to new text before calling predict.
joblib.dump(pipe, '../models/rf.pkl')

### CNN Attempt

In [None]:
import tensorflow as tf
from keras import layers, models
from keras.callbacks import EarlyStopping, LearningRateScheduler
from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Activation, BatchNormalization
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# make pipeline
# The CNN below is built for 32x32x1 inputs and the fit cell reshapes the
# features with reshape(-1, 32, 32, 1), i.e. 1024 values per sample. The SVD
# must therefore emit exactly 32 * 32 components; with 350 the reshape either
# fails or yields a sample count that no longer matches the labels.
# NOTE(review): TruncatedSVD needs n_components smaller than the TF-IDF
# vocabulary size - confirm the vocabulary is larger than 1024.
tfidf = TfidfVectorizer()
svd = TruncatedSVD(n_components=32 * 32, random_state=42)
smote = SMOTETomek(sampling_strategy='all', random_state=42)
norm = Normalizer(copy=False)
pipe = make_pipeline(tfidf, svd, smote, norm)
x_train, x_test, y_train, y_test = train_test_split(
    df['feature'], df['fraudulent'], test_size=0.2, random_state=42, stratify=df['fraudulent'])
pipe.fit(x_train, y_train)

In [None]:
# Turn the raw text of both splits into dense feature matrices.
# NOTE(review): imblearn pipelines are expected to skip sampler steps outside
# of fit, so these matrices should have one row per original sample (no
# SMOTETomek resampling is applied here) - confirm.
new_x_train = pipe.transform(x_train)
new_x_test = pipe.transform(x_test)

In [None]:
# Small CNN: three Conv2D + MaxPooling2D stages followed by a dense head.
basic_adam_model = models.Sequential()
basic_adam_model.add(layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
basic_adam_model.add(layers.MaxPooling2D((2, 2)))
basic_adam_model.add(layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
basic_adam_model.add(layers.MaxPooling2D((2, 2)))
basic_adam_model.add(layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same'))
basic_adam_model.add(layers.MaxPooling2D((2, 2)))
basic_adam_model.add(layers.Flatten())
basic_adam_model.add(layers.Dense(64, activation='relu'))
# The target is a single 0/1 label per sample, so the head is ONE sigmoid unit
# (a 10-unit output does not match the label shape).
basic_adam_model.add(layers.Dense(1, activation='sigmoid'))

# The output layer already applies a sigmoid, so the loss must receive
# probabilities: from_logits=False. from_logits=True would apply a second
# sigmoid inside the loss.
basic_adam_model.compile(optimizer='adam',
                         loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                         metrics=['accuracy'])
basic_adam_model.build(input_shape=(None, 32, 32, 1))
basic_adam_model.summary()

In [None]:
# Training hyper-parameters.
batch_size = 64
epochs = 100

# Stop training once the validation loss has not improved for 5 consecutive epochs.
es_callback = EarlyStopping(monitor='val_loss', mode='min', patience=5)

# Each sample is reshaped to 32x32x1, which requires exactly 1024 feature
# values per row.
# NOTE(review): the test split doubles as validation data for early stopping,
# so scores on it are no longer an unbiased estimate - consider a separate
# validation split.
basic_adam_model_history = basic_adam_model.fit(new_x_train.reshape(-1, 32, 32, 1), y_train,
                                                validation_data=(new_x_test.reshape(-1, 32, 32, 1), y_test),
                                                batch_size=batch_size, epochs=epochs, callbacks=[es_callback])