In [496]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', None)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

In [497]:
#read data
df = pd.read_csv('fake_job_data.csv')

In [498]:
df.describe()

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
count,17880,17534,6333,2868,14572,17880,15191,10684,17880,17880,17880,14409,10830,9775,12977,11425,17880,17880
unique,11231,3105,1337,874,1710,15095,12119,6510,2,2,2,5,7,13,131,37,2,2
top,English Teacher Abroad,"GB, LND, London",Sales,0-0,<p>We help teachers get safe &amp; secure jobs...,"<p>Play with kids, get paid for it </p>\r\n<p>...",<p>University degree required. TEFL / TESOL / ...,<p>See job description</p>,f,t,f,Full-time,Mid-Senior level,Bachelor's Degree,Information Technology and Services,Information Technology,f,f
freq,311,718,551,142,726,376,410,726,17113,14220,9088,11620,3809,5145,1734,1749,17014,16980


In [499]:
#taking the instances belonging to the balanced set only
balanced_df = df[df.in_balanced_dataset == 't']

In [500]:
balanced_df.describe()

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,in_balanced_dataset
count,900,877,283,168,425,900,726,552,900,900,900,638,500,420,581,522,900,900
unique,700,482,150,112,290,750,575,425,2,2,2,5,7,11,79,33,2,1
top,Cruise Staff Wanted *URGENT*,"US, NY, New York",Sales,7200-1380000,<p>We have aggressive growth plans in place fo...,<p>We are a full-service marketing and staffin...,<p><b>Certification &amp; Experience:</b> Prev...,<p><b>Benefits:</b> On board en suite accommod...,f,f,f,Full-time,Entry level,Bachelor's Degree,Information Technology and Services,Administrative,t,t
freq,21,34,26,10,18,21,21,21,841,466,584,497,159,144,51,107,450,900


In [501]:
#focus only on description, requirements, benefits, company profile for the bag of words model on the balanced dataset
balanced_df.drop(columns = ['title', 'location', 'department', 'salary_range', 'telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'in_balanced_dataset'], axis = 1, inplace=True)

In [502]:
balanced_df['requirements'] = balanced_df['requirements'].replace(np.nan, '')
balanced_df['benefits'] = balanced_df['benefits'].replace(np.nan, '')
balanced_df['company_profile'] = balanced_df['company_profile'].replace(np.nan, '')

In [503]:
balanced_df.isna().sum()

company_profile    0
description        0
requirements       0
benefits           0
fraudulent         0
dtype: int64

In [504]:
cleaned_balanced_df = balanced_df.copy()

In [505]:
#Combining description, company profile, benefits and requirements
cleaned_balanced_df['description'] = cleaned_balanced_df['company_profile'] + cleaned_balanced_df['description'] + cleaned_balanced_df['requirements'] + cleaned_balanced_df['benefits']

In [506]:
cleaned_balanced_df.drop(columns=['requirements', 'benefits', 'company_profile'], axis = 1, inplace=True)

In [507]:
cleaned_balanced_df.index = range(0,len(cleaned_balanced_df))

In [508]:
cleaned_balanced_df.head()

Unnamed: 0,description,fraudulent
0,<p>The group has raised a fund for the purchas...,t
1,<p>Sales Executive</p><p>Sales Executive</p><p...,t
2,"<p>A Newly established company seeks outgoing,...",t
3,<p>Administrative Assistant<br><br>Essential J...,t
4,<p><!--[if gte mso 9]><xml>\r\n <o:OfficeDocum...,f


In [509]:
import string
def make_lower(text):
    a = text.lower()
    
    return(a)
def remove_digits(text):
    a = re.sub(r'\d+', '', text)
    
    return(a)

def remove_links(text):
    soup = BeautifulSoup(text, 'html.parser')
    b = soup.find_all('a')
    l = []
    for i in b:
        l.append(i.text)
    full_text = ' '.join(l)
    if full_text == '':
        a = text
    else:
        a = full_text
    return(a)    

def remove_img_tags(data):
    p = re.compile(r'<img.*?>')
    return p.sub('', data)

def get_text(text):
    cleaned_text = ""
    text = text.replace("<br>", " ")
    soup = BeautifulSoup(text)
    for tag in soup.find_all('li'):
        cleaned_text = cleaned_text + " " + tag.text + " "
    for tag in soup.find_all('a', href=True):
        cleaned_text = cleaned_text +  " " + tag.text + " "
    for tag in soup.find_all('title'):
        cleaned_text = cleaned_text + " " + tag.text + " "
    for tag in soup.find_all('p'):
        cleaned_text = cleaned_text + " " + tag.text + " "

    return cleaned_text

def remove_punct(text):
    punc = [',','!','$','.','—','+',"’","'","–",'(',')','%','&','"','"',':',';','?','/',"#",'=','<', '>','*','.',"_"]
    text_filtered = ""
    
    for w in text:
        if w not in punc:
            #print(w)
            text_filtered = text_filtered + w
        else:
            text_filtered = text_filtered + " "
            
    return (text_filtered)

def remove_non_ascii(text):
    encoded= text.encode("ascii", "ignore")
    decoded = encoded.decode()
    
    return decoded

def removing_stopwords(text):
    text = (remove_stopwords(text))
    return (text)
    
def stem_text(text):
    words = text.split()
    porter = PorterStemmer()
    stemmed_text = ""
    for word in words:
        stemmed_text = stemmed_text + " " + porter.stem(word)
        
    return stemmed_text

In [510]:
def preprocessing(text):
    text = text.apply(make_lower)
    text = text.apply(remove_img_tags)
    text = text.apply(remove_links)
    text = text.apply(remove_digits)
    text = text.apply(get_text)
    text = text.apply(remove_punct)
    text = text.apply(remove_non_ascii)
    text = text.apply(removing_stopwords)
    text = text.apply(stem_text)
    return text

In [512]:
cleaned_balanced_df['description'] = preprocessing(cleaned_balanced_df['description'])

In [513]:
cleaned_balanced_df.head()

Unnamed: 0,description,fraudulent
0,group rais fund purchas home southeast studen...,t
1,sale execut sale execut sale execut,t
2,newli establish compani seek outgo friendli p...,t
3,administr assist essenti job respons answer r...,t
4,graduationsourc seek help custom custom produ...,f


## Model fits

In [514]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

In [515]:
#creating train and test set
X = cleaned_balanced_df['description']
y = cleaned_balanced_df['fraudulent']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)

In [516]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [517]:
vocab = vect.vocabulary_
print("Total number of vocabs:", len(vocab))
bow = vect.transform(X_train)
bow = bow.toarray()

Total number of vocabs: 5853


In [518]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
feature_names = vectorizer.get_feature_names()

In [525]:
#printing every 50th feature name
print(feature_names[::50])

['aa', 'acquir', 'adtech', 'agenda', 'am', 'angellist', 'apprais', 'artist', 'attend', 'awr', 'batteri', 'berkley', 'blive', 'brainstorm', 'buildin', 'calidad', 'carri', 'centrum', 'christma', 'climb', 'columbia', 'complet', 'congeni', 'conversion', 'courtesi', 'cs', 'damag', 'decision', 'depart', 'dfcdabfeddfdaedbcdbcda', 'dislik', 'dpi', 'dysfunct', 'eeaecbeceabcbecdadc', 'employ', 'entiti', 'estemo', 'exchang', 'extern', 'fastgrow', 'filtrat', 'flourish', 'form', 'frontend', 'gamp', 'gift', 'graphic', 'hack', 'header', 'hive', 'housekeep', 'id', 'incom', 'inhabit', 'intellect', 'involucrado', 'jee', 'kendal', 'labor', 'leader', 'lik', 'london', 'main', 'martinsburg', 'media', 'mi', 'mixtur', 'motherless', 'mvc', 'network', 'nottingham', 'odm', 'onshor', 'osha', 'packag', 'patch', 'perm', 'pig', 'pmi', 'power', 'previou', 'professional', 'psv', 'quarterli', 'raymondhr', 'reconnaiss', 'relaunch', 'research', 'revolution', 'round', 'san', 'scn', 'seminar', 'shareabl', 'silicon', 'small

### Logistic Regression

In [521]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_test, y_test))
pred_logreg = logreg.predict(X_test)
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)

0.725925925925926
[[117  19]
 [ 55  79]]


### Naive Bayes

In [522]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
print(nb.score(X_test, y_test))
pred_nb = nb.predict(X_test)
confusion_nb = confusion_matrix(y_test, pred_nb)
print(confusion_nb)

0.7666666666666667
[[ 92  44]
 [ 19 115]]


### Random forest

In [524]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
pred_rf = rf.predict(X_test)
confusion_rf = confusion_matrix(y_test, pred_rf)
print(confusion_rf)

0.762962962962963
[[125  11]
 [ 53  81]]
