In [1]:
# standard library
import string

# data handling / plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# cleaning
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from gensim.parsing.porter import PorterStemmer

# modelling
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
# (a stray trailing backslash used to glue this import to the comment line below)
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain

# Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Resource download kept after the imports so the import block itself has no
# side effects. `punkt` is the tokenizer model fetched for word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuhao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../datasets/emscad_v1.csv')

In [3]:
# Display the column names of the loaded frame (the cell's last expression is
# rendered as its output).
df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'in_balanced_dataset'],
      dtype='object')

In [4]:
# Keep the three free-text fields plus the label column, and replace missing
# values with empty strings.
df = (
    df.loc[:, ['description', 'requirements', 'benefits', 'fraudulent']]
    .fillna(value='')
)

In [5]:
# Combine description, requirements and benefits into a single text field,
# separated by single spaces.
df['feature'] = df['description'].str.cat(
    others=[df['requirements'], df['benefits']],
    sep=' ',
)

In [6]:
# Normalise the combined text.
#
# Every pattern below is a regular expression, so `regex=True` is passed
# explicitly. Relying on the default emits a FutureWarning on older pandas and,
# on pandas >= 2.0 (where the default is regex=False), treats the patterns as
# literal strings so that nothing gets cleaned.
#
# NOTE(review): `&[^;]*;` removes everything from an ampersand up to the next
# semicolon, which can span ordinary text (e.g. "r&d ... ;") — confirm this is
# acceptable or tighten it to HTML entities only.
df['feature'] = (
    df['feature']
    .str.lower()
    # remove html tags and words that start with & and \
    .str.replace(r'<[^>]*>', '', regex=True)
    .str.replace(r'&[^;]*;', '', regex=True)
    .str.replace(r'\\[a-z]*', '', regex=True)
    # remove punctuation
    .str.replace(r'[^\w\s]', '', regex=True)
    # remove digits
    .str.replace(r'\d+', '', regex=True)
    # collapse runs of whitespace into a single space
    .str.replace(r'\s+', ' ', regex=True)
)

  df['feature'] = df['feature'].str.replace(r'<[^>]*>', '')
  df['feature'] = df['feature'].str.replace(r'&[^;]*;', '')
  df['feature'] = df['feature'].str.replace(r'\\[a-z]*', '')
  df['feature'] = df['feature'].str.replace(r'[^\w\s]', '')
  df['feature'] = df['feature'].str.replace(r'\d+', '')
  df['feature'] = df['feature'].str.replace(r'\s+', ' ')


In [7]:
# tokenise: lower-case each document, then split it into word tokens
df['feature'] = df['feature'].str.lower().apply(word_tokenize)

In [8]:
# Only the combined text and the label are used from here on.
df = df.loc[:, ['feature', 'fraudulent']]

In [9]:
# remove stopwords
# Only the `punkt` resource is downloaded elsewhere in this notebook, so on a
# fresh environment the stopword corpus can be missing. Fetch it on demand
# instead of failing with LookupError.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

all_stopwords = set(english_stopwords)
# Also treat the literal text `\r\n` (backslash-r-backslash-n) as a stopword.
all_stopwords.update(['\\r\\n'])
df['feature'] = df['feature'].apply(
    lambda tokens: [token for token in tokens if token not in all_stopwords]
)

In [10]:
# stem words
# Build the stemmer once and reuse it for every token; previously a new
# PorterStemmer was constructed per word, which is pure overhead.
porter_stemmer = PorterStemmer()
df['feature'] = df['feature'].apply(
    lambda tokens: [porter_stemmer.stem(token) for token in tokens]
)

In [11]:
# Discard tokens shorter than three characters.
df['feature'] = df['feature'].apply(
    lambda tokens: list(filter(lambda token: len(token) >= 3, tokens))
)

In [12]:
# Re-assemble each token list into one space-separated string.
df['feature'] = df['feature'].map(' '.join)

In [13]:
# drop rows whose text is an empty string
df = df.loc[df['feature'].ne('')]

In [14]:
# Encode the label as integers: "t" becomes 1, every other value becomes 0.
df['fraudulent'] = df['fraudulent'].eq("t").astype('int64')

### Feature extraction using tf-idf

In [15]:
# Fit a TF-IDF vectoriser with default settings on the cleaned text and build
# the document-term matrix in the same pass.
# NOTE(review): the vectoriser is fitted on all rows, before the train/test
# split further down — confirm this is intended.
tfidf = TfidfVectorizer()
dtm = tfidf.fit_transform(raw_documents=df['feature'])

### Dimensionality reduction using SVD <br>
This projects the TF-IDF features onto a smaller number of components, discarding the least informative directions in my dataset and improving training speed.

In [16]:
# Number of components kept by the truncated SVD.
dimension = 350
svd = TruncatedSVD(n_components=dimension, random_state=42)
# Project the document-term matrix onto the SVD components, then rescale the
# rows in place (copy=False) with Normalizer's default settings.
dtm_svd = Normalizer(copy=False).fit_transform(svd.fit_transform(dtm))

In [19]:
# Feature matrix: one row per document, one column per SVD component.
# A DataFrame built from an array already has a fresh 0..n-1 index, so it
# needs no reset.
x = pd.DataFrame(dtm_svd)
# `df` still carries its original row labels (with gaps, because rows were
# filtered out earlier). Re-index the labels so that `x` and `y` share the
# same 0..n-1 index and stay aligned in any later label-based operation.
y = df['fraudulent'].reset_index(drop=True)
assert len(x) == len(y), "feature matrix and labels must have the same number of rows"

In [20]:
# train test split: hold out 20% of the rows, stratified on the label
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)