## Imports & functions

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import missingno as Missingno

from nltk import SnowballStemmer
from nltk.corpus import *
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk import *
from nltk.stem import WordNetLemmatizer
import nltk as nltk
import re
import unidecode
import langid

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score,recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
import lightgbm as lgbm
from datetime import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from scipy import sparse
import joblib
import shap

import matplotlib.pyplot as plt 
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Set to True to save model performance and the transformed input data as Excel/CSV files,
# so the notebook can be re-run faster.
save_data = False

# Set to True if the transformed files have already been prepared.
read_data = True

# Directory where model_benchmarking writes its Excel result files.
# Raw string: the backslashes of the Windows path are literal characters, not escape sequences.
save_path_model = r"C:\JupyterNotebook\Applicerad_AI\Final_data\Testresults"


In [3]:
# Load the raw data.
# Raw string: the backslashes of the Windows path are literal characters, not escape sequences.
path = r"C:\JupyterNotebook\Applicerad_AI"
df = pd.read_csv(f'{path}/fake_jobs_dataset_v2.csv')

In [4]:
# Shared WordNet lemmatizer instance; used by lem().
lemmatizer = WordNetLemmatizer()

### Functions

In [5]:
def to_lower(x):
    """Return ``x`` lower-cased when it is a ``str``; any other value is returned unchanged."""
    return x.lower() if isinstance(x, str) else x

In [6]:
#Function for removal of stopwords
tokenizer = RegexpTokenizer(r'\w+')

# Filled on the first call to is_not_stopword, so the stopword corpus is read once
# instead of once per token.
_english_stopwords = None

def is_not_stopword(company_description):
    """Return ``company_description`` with English stopwords removed.

    The text is split with the module-level ``tokenizer`` (runs of word characters),
    so punctuation is dropped as well. Tokens are compared to the NLTK English
    stopword list by exact match and the remaining ones are joined with single spaces.

    Parameters
    ----------
    company_description : str
        Text to filter.

    Returns
    -------
    str
        Space-joined tokens that are not in the stopword list.
    """
    global _english_stopwords
    if _english_stopwords is None:
        # A set gives constant-time membership tests; contents equal stopwords.words("english").
        _english_stopwords = set(stopwords.words("english"))
    tokens = tokenizer.tokenize(company_description)
    return " ".join(word for word in tokens if word not in _english_stopwords)

In [7]:
#Function for lemmatization
def lem(x):
    """Tokenize ``x`` with the shared ``tokenizer``, lemmatize each token and join them with single spaces."""
    return " ".join(lemmatizer.lemmatize(token) for token in tokenizer.tokenize(x))

In [8]:
#Function to get pos_tags

# Every distinct POS tag seen so far by get_pos_tags, in order of first appearance.
full_pos_list = []

def get_pos_tags(words):
    """Return the POS tags of ``words`` and record unseen tags in ``full_pos_list``.

    Parameters
    ----------
    words : str
        Text to tokenize and tag.

    Returns
    -------
    tuple of str or None
        One tag per token, in token order. ``None`` when ``words`` is the empty
        string or yields no tokens at all.
    """
    if words == "":
        return None
    tagged_tokens = pos_tag(word_tokenize(words))
    if not tagged_tokens:
        # No tokens (e.g. whitespace-only text): zip(*[]) produces nothing to unpack.
        return None
    _, pos = zip(*tagged_tokens)
    for new_pos_tag in pos:
        if new_pos_tag not in full_pos_list:
            full_pos_list.append(new_pos_tag)
    return pos

#### For model

In [9]:
# List of algorithms to test

# One default-configured estimator per algorithm; passed to model_benchmarking as Algorithm_list.
classifier_list = [LinearSVC(),ComplementNB(), KNeighborsClassifier(), xgb.XGBClassifier(), RandomForestClassifier(),DecisionTreeClassifier(), lgbm.LGBMClassifier()]
# Display names in the same order as classifier_list; model_benchmarking pairs the two lists by position.
clasifier_names = ["Linear SVM","Naive Bayes", "KNN","XGBoostClassifier","Random forest classifier","Decision tree classifier","Light GBM"]


In [11]:
#Function for model benchmarking; can also save an Excel file comparing the algorithms for presentation purposes

def model_benchmarking(Algorithm_list,Name_of_Algorithms,Train_data,Train_outcome,Test_data,Test_outcome,save_data,full_frame,savepath_model = save_path_model,test_name=""):
    """Fit and score every classifier on one train/test split.

    For each estimator the function prints accuracy and macro-averaged recall
    (both rounded to 5 decimals) and plots the confusion matrix.

    Parameters
    ----------
    Algorithm_list : list
        Estimators to benchmark. Each one is cloned before fitting, so the
        objects in this list are never fitted or modified.
    Name_of_Algorithms : list of str
        Display names, paired with ``Algorithm_list`` by position.
    Train_data, Train_outcome
        Training features and labels.
    Test_data, Test_outcome
        Held-out features and labels used for scoring.
    save_data : bool
        When true, the scores of this call are written to
        ``<savepath_model>\\<YYYY-MM-DD> <test_name>.xlsx``.
    full_frame : pandas.DataFrame
        Score table accumulated by earlier calls; it is not modified.
    savepath_model : str
        Directory for the Excel file.
    test_name : str
        Experiment label, prefixed to every row label and used in the file name.

    Returns
    -------
    tuple
        ``(model_dict, full_frame)``: ``model_dict`` maps each name to
        ``[fitted_clone, accuracy, recall]`` and ``full_frame`` is the input
        table with the rows of this call appended.
    """
    # Local import: sklearn is already a notebook dependency, but ``clone`` is not imported elsewhere.
    from sklearn.base import clone

    model_dict = {}
    score_rows = []
    row_labels = []
    for i, estimator in enumerate(Algorithm_list):
        name = Name_of_Algorithms[i]
        print(f"Training and testing the performance of {name}")
        # Fit a fresh copy: refitting the shared instance would silently overwrite
        # the models stored in model_dict by earlier benchmark runs.
        clf = clone(estimator)
        clf.fit(Train_data,Train_outcome)
        predictions = clf.predict(Test_data)
        cm = confusion_matrix(Test_outcome,predictions)
        acc = round(accuracy_score(Test_outcome,predictions),5)
        rec = round(recall_score(Test_outcome,predictions,average="macro"),5)
        print(f"The Accuracy of the model is: {acc} and recall score: {rec}")
        ConfusionMatrixDisplay(confusion_matrix=cm).plot()
        plt.show()
        model_dict[name] = [clf,acc,rec]
        score_rows.append({"Accuracy":acc,"Recall":rec})
        row_labels.append(test_name + name)
    # Build the score table once instead of concatenating a frame per iteration.
    d_frame = pd.DataFrame(score_rows, index=row_labels, columns=["Accuracy","Recall"])
    if save_data:
        date_ = datetime.now().strftime("%Y-%m-%d")
        file_name = date_ + " " + test_name +".xlsx"
        d_frame.to_excel(savepath_model + "\\" + file_name)
    full_frame = pd.concat([full_frame,d_frame])
    return model_dict,full_frame

## Lemmatization and data preparation

In [None]:
# missingno bar chart of the raw frame, to inspect how complete each column is.
Missingno.bar(df)

In [12]:
#Applying functions to remove stop words, links, unicode text and lemmatization of text


columns_to_prepare = ["company_profile","description","requirements","benefits","title"]
if not read_data:
    for col in columns_to_prepare:
        clean_col = "clean_"+col
        # Assign the filled column back instead of calling fillna(inplace=True) on a column selection.
        df[clean_col] = df[col].fillna(" ")
        # Remove links of the form scheme://... and collapse runs of whitespace.
        df[clean_col] = df[clean_col].apply(lambda company_text: ' '.join(re.sub(r"\w+://\S+", " ",company_text).split()))
        df[clean_col] = df[clean_col].apply(to_lower)
        # Transliterate non-ASCII characters to their closest ASCII form.
        df[clean_col] = df[clean_col].apply(unidecode.unidecode)
        # Drop digits.
        df[clean_col] = df[clean_col].apply(lambda word: re.sub(r'\d+','',word))
        df[clean_col] = df[clean_col].apply(is_not_stopword)
        df[clean_col] = df[clean_col].apply(lem)
    # Language detection reads clean_description, so it has to run after the loop
    # has created every clean_* column.
    df_lang = df.copy()
    df_lang["lang"] = df_lang["clean_description"].apply(lambda x: langid.classify(x)[0])
    # Keep only the ads classified as English.
    df_lang_en = df_lang[df_lang["lang"] == "en"]
    df_lang_en = df_lang_en.reset_index(drop=True)
    df_lemma = df_lang_en.drop(columns=["job_id","company_profile","description","requirements","benefits","title","lang"])
    df_lemma = df_lemma.fillna("")
    save_path = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_EN_lemma.csv"
    if save_data:
        df_lemma.to_csv(save_path,index=False)

## Modeltesting - Textbased model. No encoding

In [13]:
# Lemmatized English-only dataset. Raw string keeps the Windows backslashes literal.
path_text = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_EN_lemma.csv"

In [14]:
# Use the prepared CSV when available, otherwise the frame built by the preparation cell.
df_text = pd.read_csv(f'{path_text}') if read_data == True else df_lemma.copy()

In [15]:
# Lower-case every string cell, one column at a time; to_lower leaves non-string values untouched.
for col in df_text.columns:
    df_text[col] = df_text[col].apply(to_lower)

In [16]:
# Work on a copy so df_text keeps the lower-cased but otherwise unmodified data.
df_prep = df_text.copy()

In [19]:
#Manually converting some columns to categorical values

# Results are assigned back to the frame: replace(..., inplace=True) on a column
# selection acts on a temporary object and is not guaranteed to reach df_prep.
df_prep["employment_type"] = df_prep["employment_type"].replace("other","")
# The 0/1 flags become a word (1) or an empty string (0) so they can be used as text.
df_prep["has_company_logo"] = df_prep["has_company_logo"].replace({1:"company logo",0:""})
df_prep["telecommuting"] = df_prep["telecommuting"].replace({1:"remote work",0:""})
df_prep["has_questions"] = df_prep["has_questions"].replace({1:"question",0:""})
df_prep = df_prep.fillna("")

In [20]:
# Columns that are left exactly as they are.
columns_to_skip = ["fraudulent","clean_company_profile","clean_description","clean_requirements","salary_range","department"]
for col in df_prep.columns:
    if col in columns_to_skip:
        continue
    # Per value: remove commas, then stopwords, then lower-case.
    df_prep[col] = df_prep[col].apply(lambda x: to_lower(is_not_stopword(re.sub(",","",x))))

In [17]:
# Output file for the text-based dataset. Raw string keeps the Windows backslashes literal.
save_path = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_EN_text_based.csv"

if save_data:
    df_prep.to_csv(save_path,index=False)

## Model testing - Text-based. Determining which columns to use in the final model

#### PATH - Importing data

In [21]:
### Importing dataset
# Raw string keeps the Windows backslashes literal.
path = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_EN_text_based.csv"

# Use the prepared CSV when available, otherwise the frame built earlier in the notebook.
if read_data:
    df_non_mod = pd.read_csv(path)
else:
    df_non_mod = df_prep.copy()

### 1.1 All information combined into one column

In [22]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_full_text = df_non_mod.copy()

In [23]:
df_full_text = df_full_text.fillna("")

# Fields that make up the combined text, in concatenation order.
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","clean_company_profile","required_experience","clean_description","clean_requirements","has_questions","function","required_education","telecommuting","department"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = df_full_text[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + df_full_text[text_column]
df_full_text["full_text"] = full_text


# Remove the source columns (and the unused has_company_logo / salary_range).
df_full_text = df_full_text.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","clean_company_profile","required_experience","clean_description","clean_requirements","required_education","telecommuting","function","has_questions","salary_range","department"])

In [24]:
# Keep a single row per distinct full_text value.
df_full_text = df_full_text.drop_duplicates(subset=["full_text"])

In [25]:
# Feature: the combined text column.
X_only_text = df_full_text["full_text"]

# Target: the fraudulent flag.
y = df_full_text["fraudulent"]

In [26]:
# Rebind instead of filling in place: X_only_text is a column taken from df_full_text.
X_only_text = X_only_text.fillna(" ")

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_text,y, train_size = 0.7, random_state=0)

In [27]:
#Vectorization of the combined data
# min_df=2: terms that occur in fewer than two training documents are ignored.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
# The vocabulary is learned from the training split only; the test split is transformed with it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Empty score table; every benchmark run appends its rows to it.
full_frame = pd.DataFrame(columns=["Accuracy","Recall"])

# Returns (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.1 All textinfo - text based")

In [None]:
# Second element of the model_benchmarking result: the accumulated score table.
full_frame = models_produced[1]

### 1.2 [DESCRIPTION] All information except company profile

In [28]:
df_description = df_non_mod.drop(columns=["clean_company_profile"])
df_description = df_description.fillna("")

# Fields that make up the combined text, in concatenation order (no company profile).
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","required_experience","clean_description","clean_requirements","has_questions","function","required_education","telecommuting","department"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = df_description[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + df_description[text_column]
df_description["full_text"] = full_text

#remove the combined columns
df_description = df_description.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","required_experience","clean_description","clean_requirements","required_education","telecommuting","function","has_questions","department","salary_range"])

In [29]:
# Keep a single row per distinct full_text value.
df_description = df_description.drop_duplicates(subset=["full_text"])

In [30]:
# Feature and target. fillna returns a new Series, so the column inside df_description is not modified.
X_only_text = df_description["full_text"].fillna(" ")
y = df_description["fraudulent"]

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_text,y, train_size = 0.7, random_state=0)
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned from the training split only; the test split is transformed with it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.2 All textinfo except company profile")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.3 [COMPANY PROFILE] All information except description 

In [None]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_company_profile = df_non_mod.copy()

In [None]:
df_company_profile = df_company_profile.drop(columns=["clean_description"])
df_company_profile = df_company_profile.fillna("")

# Fields that make up the combined text, in concatenation order (no description).
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","required_experience","clean_company_profile","clean_requirements","has_questions","function","required_education","telecommuting","department"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = df_company_profile[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + df_company_profile[text_column]
df_company_profile["full_text"] = full_text

#remove the combined columns
df_company_profile = df_company_profile.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","required_experience","clean_company_profile","clean_requirements","required_education","telecommuting","function","has_questions","department","salary_range"])

In [None]:
# Keep a single row per distinct full_text value.
df_company_profile = df_company_profile.drop_duplicates(subset=["full_text"])

In [None]:
# Feature and target. fillna returns a new Series, so the column inside df_company_profile is not modified.
X_only_text = df_company_profile["full_text"].fillna(" ")
y = df_company_profile["fraudulent"]

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_text,y, train_size = 0.7, random_state=0)
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned from the training split only; the test split is transformed with it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.3 All textinfo except description")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.4 [COMPANY PROFILE > Description] Company profile is used if available, otherwise company description

In [None]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_company_profile_description = df_non_mod.copy()


In [None]:
# Row positions (0-based, not index labels) of the ads whose company profile is missing.
null_pos_list = np.flatnonzero(df_company_profile_description['clean_company_profile'].isnull())

In [None]:
# null_pos_list holds row positions, so the rows are addressed with iloc. One assignment
# on the frame itself replaces the per-row chained df[col].loc[i] = ... writes, which
# are not guaranteed to reach the frame.
profile_col_pos = df_company_profile_description.columns.get_loc("clean_company_profile")
df_company_profile_description.iloc[null_pos_list, profile_col_pos] = df_company_profile_description["clean_description"].iloc[null_pos_list].to_numpy()

In [None]:

df_company_profile_description = df_company_profile_description.fillna("")

# Fields that make up the combined text, in concatenation order (clean_description is not
# added separately).
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","required_experience","clean_company_profile","clean_requirements","has_questions","function","required_education","telecommuting","department"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = df_company_profile_description[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + df_company_profile_description[text_column]
df_company_profile_description["full_text"] = full_text

# Remove the source columns.
df_company_profile_description = df_company_profile_description.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","required_experience","clean_company_profile","clean_requirements","required_education","telecommuting","function","has_questions","clean_description","department","salary_range"])

In [None]:
# Keep a single row per distinct full_text value.
df_company_profile_description = df_company_profile_description.drop_duplicates(subset=["full_text"])

In [None]:
# Feature and target. fillna returns a new Series, so the column inside the source frame is not modified.
X_only_text = df_company_profile_description["full_text"].fillna(" ")
y = df_company_profile_description["fraudulent"]

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_text,y, train_size = 0.7, random_state=0)
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned from the training split only; the test split is transformed with it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.4 All text info description används om company profile inte finns")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.5 [COMPANY PROFILE] Only company profile

In [None]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_company_profile_only = df_non_mod.copy()

In [None]:
# Keep a single row per distinct company profile, so features and target stay aligned afterwards.
df_company_profile_only = df_company_profile_only.drop_duplicates(subset=["clean_company_profile"])

In [None]:
# Feature: company profile text. Target: fraudulent flag.
X_only_company_profile = df_company_profile_only["clean_company_profile"]
y_company_profile = df_company_profile_only["fraudulent"]

In [None]:
# Rebind instead of filling in place: the Series is a column taken from df_company_profile_only.
X_only_company_profile = X_only_company_profile.fillna(" ")

# No drop_duplicates on the feature Series alone: it could remove rows from X but not
# from y_company_profile, and train_test_split requires both to have the same length.

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_company_profile,y_company_profile, train_size = 0.7, random_state=0)

In [None]:
# Fresh vectorizer for this experiment; the vocabulary is learned from the training split only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.5 Endast company profile")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.6 [COMPANY PROFILE] only company profile - Ads without company profile are removed

In [None]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_company_profile_only = df_non_mod.copy()

In [None]:
# Drop the ads that have no company profile.
# NOTE(review): only NaN counts as missing here. When df_non_mod comes from df_prep
# (read_data False) its NaN values were already replaced by "", so nothing is removed — confirm.
df_company_profile_reduced = df_company_profile_only.dropna(subset=["clean_company_profile"])

In [None]:
# Keep a single row per distinct company profile.
df_company_profile_reduced = df_company_profile_reduced.drop_duplicates(subset=["clean_company_profile"])

In [None]:
# Feature: company profile text. Target: fraudulent flag.
X_only_company_profile = df_company_profile_reduced["clean_company_profile"]
y_company_profile = df_company_profile_reduced["fraudulent"]

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_company_profile,y_company_profile, train_size = 0.7, random_state=0)

In [None]:
# Fresh vectorizer for this experiment; the vocabulary is learned from the training split only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.6 Bara company profile NaN tas bort")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.7 [COMPANY PROFILE] All text information - Ads without company profile are removed

In [None]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_company_profile_only = df_non_mod.copy()

In [None]:
# Drop the ads that have no company profile.
# NOTE(review): only NaN counts as missing here. When df_non_mod comes from df_prep
# (read_data False) its NaN values were already replaced by "", so nothing is removed — confirm.
df_company_profile_reduced = df_company_profile_only.dropna(subset=["clean_company_profile"])

In [None]:
# Rebinding to the filled frame gives an independent object, so the column assignment
# below does not write into a selection of df_company_profile_only.
df_company_profile_reduced = df_company_profile_reduced.fillna("")

# Fields that make up the combined text, in concatenation order.
# NOTE(review): unlike the other experiments, department is not part of the text, and
# department / salary_range are not dropped below — confirm this is intended.
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","clean_company_profile","required_experience","clean_description","clean_requirements","has_questions","function","required_education","telecommuting"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = df_company_profile_reduced[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + df_company_profile_reduced[text_column]
df_company_profile_reduced["full_text"] = full_text

#remove the combined columns
df_company_profile_reduced = df_company_profile_reduced.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","clean_company_profile","required_experience","clean_description","clean_requirements","required_education","telecommuting","function","has_questions"])

In [None]:
# No subset given: rows are duplicates only when every remaining column matches,
# not just full_text.
df_company_profile_reduced.drop_duplicates(inplace=True)

In [None]:
# Feature: combined text. Target: fraudulent flag.
X_only_company_profile = df_company_profile_reduced["full_text"]
y_company_profile = df_company_profile_reduced["fraudulent"]

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_company_profile,y_company_profile, train_size = 0.7, random_state=0)

In [None]:
# Fresh vectorizer for this experiment; the vocabulary is learned from the training split only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.7 All text information, endast för ads där company profile finns")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.8 [DESCRIPTION] Description only

In [None]:
# Work on a copy of df_non_mod. Despite the variable name, this section uses the description column.
df_company_profile_only = df_non_mod.copy()

In [None]:
# Keep a single row per distinct description.
df_company_profile_only = df_company_profile_only.drop_duplicates(subset=["clean_description"])

In [None]:
# Feature: description text (the variable name is reused from the company-profile sections).
# fillna returns a new Series, so the column inside the source frame is not modified.
X_only_company_profile = df_company_profile_only["clean_description"].fillna(" ")
y_company_profile = df_company_profile_only["fraudulent"]

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_company_profile,y_company_profile, train_size = 0.7, random_state=0)

In [None]:
# Fresh vectorizer for this experiment; the vocabulary is learned from the training split only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Benchmark every classifier on this split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="1.8 Endast description")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### Pos_Tag for algorithm predictions

In [None]:
# Names used by get_pos_tags, followed by the NLTK resource downloads for tokenizing and tagging.
from nltk import pos_tag, word_tokenize
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download("averaged_perceptron_tagger")

In [None]:
# Work on a copy so df_non_mod stays unchanged for the other experiments.
df_pos_tag = df_non_mod.copy()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a column selection.
df_pos_tag["clean_company_profile"] = df_pos_tag["clean_company_profile"].fillna("")

# One tuple of POS tags per ad (None for an empty profile); get_pos_tags also records
# every distinct tag in full_pos_list.
df_pos_tag["tags"] = df_pos_tag["clean_company_profile"].apply(get_pos_tags)

# One still-empty column per distinct tag, plus the raw tags and the target.
df_pos_tag_frame = pd.DataFrame(columns = full_pos_list)
df_pos_tag_frame["tags"] = df_pos_tag["tags"]
df_pos_tag_frame["fraudulent"] = df_pos_tag["fraudulent"]

In [None]:
# Every column except the raw tag tuples and the target is a POS-tag count column.
tags_to_count = [column for column in df_pos_tag_frame.columns if column not in ("fraudulent", "tags")]

In [None]:
#Adding the tag count to each column for each word string

# Each count column is computed in one pass and assigned as a whole column. Per-cell
# chained df[col].iloc[i] = ... writes are not guaranteed to reach the frame.
# Rows without a tag sequence (None) count 0 for every tag.
for col in tags_to_count:
    df_pos_tag_frame[col] = df_pos_tag_frame["tags"].apply(
        lambda tags, tag=col: tags.count(tag) if isinstance(tags, (tuple, list)) else 0
    )


In [None]:
# Features: the per-tag counts, cast to int in one step.
X_pos = df_pos_tag_frame.drop(columns=["tags","fraudulent"]).astype(int)

# Target: fraudulent flag.
y_pos = df_pos_tag_frame["fraudulent"]

In [None]:
# 70/30 split of the POS-tag count features with a fixed seed.
X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(X_pos,y_pos, train_size = 0.7, random_state=0)

In [None]:
# Benchmark on the POS-tag count split created in this section; X_train_tfidf / y_train
# still hold the TF-IDF data of the previous experiment.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_pos,y_train_pos,X_test_pos,y_test_pos,save_data, full_frame,test_name="Word tag")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

## Modeltesting - Undersampling

### Import

In [31]:
# Text-based dataset. Raw string keeps the Windows backslashes literal.
path = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_EN_text_based.csv"

# Use the prepared CSV when available, otherwise the frame built earlier in the notebook.
if read_data:
    df_full = pd.read_csv(path)
else:
    df_full = df_prep.copy()

### 1.10 [UNDERSAMPLING] All text info

In [32]:
# Work on a copy so df_full stays unchanged for the other undersampling experiments.
X_full = df_full.copy()

In [None]:
X_full = X_full.fillna("")

# Fields that make up the combined text, in concatenation order.
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","clean_company_profile","required_experience","clean_description","clean_requirements","has_questions","function","required_education","telecommuting","department"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = X_full[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + X_full[text_column]
X_full["full_text"] = full_text

#remove the combined columns
X_full = X_full.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","clean_company_profile","required_experience","clean_description","clean_requirements","required_education","telecommuting","function","has_questions","salary_range","department"])

In [None]:
# No subset given: rows are duplicates only when every remaining column matches.
X_full.drop_duplicates(inplace=True)

In [None]:
# Features: everything except the target, kept as a DataFrame for fit_resample below.
X_full_all = X_full.drop(columns=["fraudulent"])
y_full = X_full["fraudulent"]

X_full_all.fillna(" ",inplace=True)

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_full_all,y_full, train_size = 0.8, random_state=0)

# Shared sampler; the later undersampling sections reuse this instance.
rus = RandomUnderSampler(random_state=0)

# Only the training split is resampled; X_test / y_test are left as they are.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

In [None]:
# Fresh vectorizer for this experiment.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

In [None]:
# The vocabulary is learned from the undersampled training text; the untouched test text is transformed with it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_rus["full_text"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["full_text"])

In [None]:
# Train on the undersampled split, evaluate on the original test split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="1.10 Undersampled - All text info i en kolumn")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.11 [UNDERSAMPLING - DESCRIPTION] Only description

In [None]:
# Work on a copy so df_full stays unchanged for the other undersampling experiments.
df_reduced_description = df_full.copy()

In [None]:
# Keep a single row per distinct description.
df_reduced_description.drop_duplicates(subset=["clean_description"],inplace=True)

In [None]:
# One-column DataFrame for fit_resample. fillna returns a new frame, so nothing is
# written into a selection of df_reduced_description.
X_only_description = df_reduced_description[["clean_description"]].fillna(" ")
y_description = df_reduced_description["fraudulent"]

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_description,y_description, train_size = 0.8, random_state=0)

In [None]:
# Only the training split is resampled; rus is the sampler created in section 1.10.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

In [None]:
# Fresh vectorizer; the vocabulary is learned from the undersampled training text only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_rus["clean_description"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["clean_description"])

In [None]:
# Train on the undersampled split, evaluate on the original test split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="1.11 Undersampled - Endast description")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.12 [UNDERSAMPLING - COMPANY PROFILE] Only company profile

In [None]:
# Work on a copy so df_full stays unchanged for the other undersampling experiments.
df_reduced_company_profile = df_full.copy()

In [None]:
# Keep a single row per distinct company profile.
df_reduced_company_profile.drop_duplicates(subset=["clean_company_profile"],inplace=True)

In [None]:
# One-column DataFrame for fit_resample. fillna returns a new frame, so nothing is
# written into a selection of df_reduced_company_profile.
X_only_company_profile = df_reduced_company_profile[["clean_company_profile"]].fillna(" ")
y_company_profile = df_reduced_company_profile["fraudulent"]

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_only_company_profile,y_company_profile, train_size = 0.8, random_state=0)

In [None]:
# Only the training split is resampled; rus is the sampler created in section 1.10.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

In [None]:
# Fresh vectorizer; the vocabulary is learned from the undersampled training text only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_rus["clean_company_profile"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["clean_company_profile"])

In [None]:
# Train on the undersampled split, evaluate on the original test split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="1.12 Undersampled - Endast company profile")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

### 1.13 [UNDERSAMPLING - COMPANY PROFILE] Company profile + all info except description

In [None]:
# Work on a copy so df_full stays unchanged for the other undersampling experiments.
df_reduced_company_profile_all_info = df_full.copy()

In [None]:
df_reduced_company_profile_all_info = df_reduced_company_profile_all_info.drop(columns=["clean_description"])
df_reduced_company_profile_all_info = df_reduced_company_profile_all_info.fillna("")

# Fields that make up the combined text, in concatenation order (no description).
text_columns = ["clean_title","location","clean_benefits","industry","employment_type","required_experience","clean_company_profile","clean_requirements","has_questions","function","required_education","telecommuting","department"]

# A single space between every pair of fields keeps the last word of one field
# from fusing with the first word of the next into one token.
full_text = df_reduced_company_profile_all_info[text_columns[0]]
for text_column in text_columns[1:]:
    full_text = full_text + " " + df_reduced_company_profile_all_info[text_column]
df_reduced_company_profile_all_info["full_text"] = full_text

#remove the combined columns
# NOTE(review): salary_range is not dropped here, unlike in the other experiments — confirm this is intended.
df_reduced_company_profile_all_info = df_reduced_company_profile_all_info.drop(columns=["clean_title","location","clean_benefits","has_company_logo","employment_type","industry","required_experience","clean_company_profile","clean_requirements","required_education","telecommuting","function","has_questions","department"])

In [None]:
# No subset given: rows are duplicates only when every remaining column matches.
df_reduced_company_profile_all_info.drop_duplicates(inplace=True)

In [None]:
# One-column DataFrame for fit_resample. fillna returns a new frame, so nothing is
# written into a selection of df_reduced_company_profile_all_info.
X_full_no_description = df_reduced_company_profile_all_info[["full_text"]].fillna(" ")
y_full_no_description = df_reduced_company_profile_all_info["fraudulent"]

# The frame has already been de-duplicated; de-duplicating the features alone would
# leave X and y with different lengths, so no further drop_duplicates is done here.

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_full_no_description,y_full_no_description, train_size = 0.8, random_state=0)

In [None]:
# Only the training split is resampled; rus is the sampler created in section 1.10.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

In [None]:
# Fresh vectorizer; the vocabulary is learned from the undersampled training text only.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_rus["full_text"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["full_text"])

In [None]:
# Train on the undersampled split, evaluate on the original test split; the result is (model_dict, full_frame).
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="1.13 Undersampled - all info except förutom description")
# Keep the accumulated score table for the next experiment.
full_frame = models_produced[1]

## Data preparation - Full dataset + encoding

### Functions


In [None]:
#Function for creating column names
def emp_type_fun(x,prefix):
    """Build a column name ``"<prefix> <x>"``.

    A non-empty string ``x`` is used as is; the empty string and any
    non-string value (e.g. NaN) are labelled ``"NaN"``.
    """
    label = x if isinstance(x, str) and x != "" else "NaN"
    return prefix + " " + label

### Encoding of full dataset

In [None]:
# Lemmatized English-only dataset. Raw string keeps the Windows backslashes literal.
read_path = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_EN_lemma.csv"

# Use the prepared CSV when available, otherwise the frame built by the preparation cell.
if read_data:
    df_enc = pd.read_csv(read_path)
else:
    df_enc = df_lemma.copy()

In [None]:
# Print the number of distinct values per column (NaN counts as one value), then show the dtypes.
for col in df_enc.columns:
    unique_values = df_enc[col].nunique(dropna=False)
    print(f"{col} innehåller {unique_values} unika värden")
df_enc.dtypes

In [None]:
# Work on a copy so df_enc keeps the data as loaded.
df_enc_mod = df_enc.copy()

#### Location

##### Functions for location

In [None]:
def split_location(x,df,col):
    """Split a comma-separated location into at most three stripped parts.

    Parameters
    ----------
    x : str or object with ``.index``
        The location text. For a non-string value, ``x.index[0]`` is used as a
        position into ``df[col]`` and the text found there is split instead.
    df : pandas.DataFrame
        Frame used for the lookup in the non-string case.
    col : str
        Column of ``df`` used for the lookup in the non-string case.

    Returns
    -------
    list of str
        The stripped parts. Text after a third comma is discarded.
    """
    if not isinstance(x, str):
        iloc_pos = x.index[0]
        # The last piece is always dropped on this path, whatever the number of pieces.
        looked_up_parts = df[col].iloc[iloc_pos].split(",",3)[:-1]
        return [part.strip() for part in looked_up_parts]

    # split(",", 3) yields at most four pieces; a fourth one holds everything after the third comma.
    parts = [part.strip() for part in x.split(",",3)]
    return parts if len(parts) <= 3 else parts[:-1]

In [None]:
def get_location_info(data,pos):
    """Return the element of a split location list at ``pos``.

    Parameters
    ----------
    data : list of str
        Output of ``split_location`` (country, state, city in that order).
    pos : int
        Position to read.

    Returns
    -------
    str
        ``data[pos]``, or "" when the list has no element at that position.
        Previously only one-element lists were guarded, so a two-part
        location such as ["US", "NY"] raised IndexError for pos == 2.
    """
    if pos >= len(data):
        return ""
    return data[pos]

##### Data

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write the chained in-place call modifies
# a temporary object and leaves df_enc_mod unchanged.
df_enc_mod["location"] = df_enc_mod["location"].fillna("")
# Number of commas in each raw location string.
df_enc_mod["location_max_split"] = df_enc_mod["location"].apply(lambda x: x.count(","))

In [None]:
# Replace every raw location string by its list of stripped parts and store the list length.
df_enc_mod["location"] = df_enc_mod["location"].apply(
    lambda raw_location: split_location(raw_location, df_enc_mod, "location")
)
df_enc_mod["location list length"] = df_enc_mod["location"].apply(len)

In [None]:
new_columns = ["Country","State","City"]

# The position of each name in new_columns is also its position in the split location list.
for i, column_name in enumerate(new_columns):
    df_enc_mod[column_name] = df_enc_mod["location"].apply(
        lambda parts: get_location_info(parts, i)
    )

# The helper columns are no longer needed once Country/State/City exist.
df_enc_final = df_enc_mod.drop(
    columns=["location", "location_max_split", "location list length"]
)

#### Employment type

In [None]:
# Snapshot of the frame used as encoder input for employment_type.
df_enc_emp_type = df_enc_final.copy()

In [None]:
# Distinct employment types; not referenced again in this part of the notebook.
emp_types = pd.unique(df_enc_emp_type["employment_type"])
# One shared encoder instance that is re-fitted for every categorical column below.
OH_enconder = OneHotEncoder()
# Not referenced again in this part of the notebook.
label_enconder = LabelEncoder()


In [None]:
# One-hot encode employment_type in a single fit/transform pass, then name the columns
# after the categories the encoder discovered.
OH_df_employment_type = pd.DataFrame(
    OH_enconder.fit_transform(df_enc_emp_type[["employment_type"]]).toarray()
)
oh_category_list = OH_enconder.categories_
oh_category_list_prefix = [
    emp_type_fun(category, "employment_type") for category in oh_category_list[0]
]
OH_df_employment_type.columns = oh_category_list_prefix

# Append the indicator columns and drop the raw categorical column.
df_enc_final = pd.concat([df_enc_final, OH_df_employment_type], axis=1).drop(
    columns=["employment_type"]
)

### Required experience

In [None]:
df_req_experience = df_enc_final.copy()

In [None]:
OH_enconder.fit(df_req_experience[["required_experience"]]);
oh_category_list = OH_enconder.categories_

oh_category_list_prefix = [emp_type_fun(word,"required_experience") for word in oh_category_list[0]]

OH_df_required_experience = pd.DataFrame(OH_enconder.fit_transform(df_req_experience[["required_experience"]]).toarray(),columns=oh_category_list_prefix)

df_enc_final = pd.concat([df_enc_final,OH_df_required_experience],axis=1)

df_enc_final = df_enc_final.drop(columns=["required_experience"])

### Required education 

In [None]:
df_required_education = df_enc_final.copy()

In [None]:
OH_enconder.fit(df_required_education[["required_education"]])

oh_category_list = OH_enconder.categories_

oh_category_list_prefix = [emp_type_fun(word,"required_education") for word in oh_category_list[0]]

oh_df_req_education = pd.DataFrame(OH_enconder.fit_transform(df_required_education[["required_education"]]).toarray(), columns=oh_category_list_prefix)

df_enc_final = pd.concat([df_enc_final,oh_df_req_education],axis=1)

df_enc_final = df_enc_final.drop(columns=["required_education"])

### Function

In [None]:
df_function = df_enc_final.copy()

In [None]:
OH_enconder.fit(df_function[["function"]])

oh_category_list = OH_enconder.categories_

oh_category_list_prefix = [emp_type_fun(word,"function") for word in oh_category_list[0]]

oh_df_function = pd.DataFrame(OH_enconder.fit_transform(df_function[["function"]]).toarray(), columns=oh_category_list_prefix)

df_enc_final = pd.concat([df_enc_final,oh_df_function],axis=1)

df_enc_final=df_enc_final.drop(columns=["function"])

### Industry 

In [None]:
df_industry = df_enc_final.copy()

In [None]:
OH_enconder.fit(df_industry[["industry"]])

OH_enconder.categories_[0]
oh_category_list = OH_enconder.categories_[0]

oh_category_list_prefix = [emp_type_fun(word,"industry") for word in oh_category_list]

oh_df_industry = pd.DataFrame(OH_enconder.fit_transform(df_industry[["industry"]]).toarray(),columns=oh_category_list_prefix)

df_enc_final = pd.concat([df_enc_final,oh_df_industry],axis=1)

df_enc_final = df_enc_final.drop(columns=["industry"])

In [None]:
df_enc_final

### Department

In [None]:
df_department = df_enc_final.copy()

# Department values with missing entries replaced by "".
unique_departments = df_department["department"].fillna("")

df_department_cleaned = df_department.copy()

# Assign the filled column back: a chained fillna(inplace=True) on the selected
# column does not update the frame under pandas Copy-on-Write, which would leave
# NaN values for the string cleaning steps that follow.
df_department_cleaned["department"] = df_department_cleaned["department"].fillna(" ")

In [None]:
#Remove any links (scheme://rest) and collapse whitespace.
#The previous pattern "(w+://S+)" had lost its backslashes and only matched
#literal "w" and "S" characters, so URLs were never removed.
df_department_cleaned["department"] = df_department_cleaned["department"].apply(lambda x: ' '.join(re.sub(r"\w+://\S+", " ", x).split()))

#Set all words to lower case
df_department_cleaned["department"] = df_department_cleaned["department"].apply(lambda words: words.lower())

#Transliterate non-ASCII characters to their closest ASCII representation
df_department_cleaned["department"] = df_department_cleaned["department"].apply(lambda word: unidecode.unidecode(word))

#Remove numbers (raw string so "\d" is not parsed as a string escape)
df_department_cleaned["department"] = df_department_cleaned["department"].apply(lambda word: re.sub(r'\d+', '', word))

#Remove stop words
df_department_cleaned["department"] = df_department_cleaned["department"].apply(lambda string: is_not_stopword(string))

#Lemmatization
df_department_cleaned["department"] = df_department_cleaned["department"].apply(lambda x: lem(x))

In [None]:
def check_num(x):
    """Print "digit" and return None for ints and all-digit strings; otherwise return x unchanged."""
    if isinstance(x, int) or x.isdigit():
        print("digit")
        return None
    return x

In [None]:
# Diagnostic pass: prints "digit" for purely numeric department values.
check_if_only_digit = df_department_cleaned["department"].apply(check_num)

# One-hot encode the cleaned department text; oh_category_list holds the category array itself.
oh_df_department = pd.DataFrame(
    OH_enconder.fit_transform(df_department_cleaned[["department"]]).toarray()
)
oh_category_list = OH_enconder.categories_[0]
oh_category_list_prefix = [
    emp_type_fun(category, "department") for category in oh_category_list
]
oh_df_department.columns = oh_category_list_prefix

# Append the indicator columns and drop the raw department column.
df_enc_final = pd.concat([df_enc_final, oh_df_department], axis=1).drop(
    columns=["department"]
)


### Location - Country, State, City

#### Country

In [None]:
df_country = df_enc_final.copy()

# One-hot encode Country; column names come from the fitted categories.
oh_df_country = pd.DataFrame(
    OH_enconder.fit_transform(df_country[["Country"]]).toarray()
)
oh_category_list = OH_enconder.categories_
oh_category_list_prefix = [
    emp_type_fun(category, "Country") for category in oh_category_list[0]
]
oh_df_country.columns = oh_category_list_prefix

# Append the indicator columns and drop the raw column.
df_enc_final = pd.concat([df_enc_final, oh_df_country], axis=1).drop(
    columns=["Country"]
)

#### State

In [None]:
df_state = df_enc_final.copy()

# One-hot encode State; column names come from the fitted categories.
oh_df_state = pd.DataFrame(
    OH_enconder.fit_transform(df_state[["State"]]).toarray()
)
oh_category_list = OH_enconder.categories_
oh_category_list_prefix = [
    emp_type_fun(category, "State") for category in oh_category_list[0]
]
oh_df_state.columns = oh_category_list_prefix

# Append the indicator columns and drop the raw column.
df_enc_final = pd.concat([df_enc_final, oh_df_state], axis=1).drop(
    columns=["State"]
)

#### City

In [None]:
df_city = df_enc_final.copy()

# One-hot encode City; column names come from the fitted categories.
oh_df_city = pd.DataFrame(
    OH_enconder.fit_transform(df_city[["City"]]).toarray()
)
oh_category_list = OH_enconder.categories_
oh_category_list_prefix = [
    emp_type_fun(category, "City") for category in oh_category_list[0]
]
oh_df_city.columns = oh_category_list_prefix

# Append the indicator columns and drop the raw column.
df_enc_final = pd.concat([df_enc_final, oh_df_city], axis=1).drop(
    columns=["City"]
)

### Salary_range

In [None]:
def get_avg(x):
    """Return the average of a "min-max" salary range string.

    Parameters
    ----------
    x : str or int
        Salary range such as "20000-30000", a single value such as "40000",
        or the sentinel -1 marking a missing salary.

    Returns
    -------
    int or float
        * the sentinel itself when ``x == -1``;
        * ``int(value)`` for a single numeric token;
        * the mean of the first two tokens when both are numeric;
        * -1 when a token is not purely digits (e.g. "10-Nov", "negotiable").
          Previously a single non-numeric token raised ValueError while the
          two-token case returned -1; both now return -1.
    """
    if x == -1:
        return x
    # Surrounding whitespace is ignored so "20000 - 30000" parses as well.
    tokens = [token.strip() for token in x.split("-")]
    if len(tokens) == 1:
        return int(tokens[0]) if tokens[0].isdigit() else -1
    low, high = tokens[0], tokens[1]
    if low.isdigit() and high.isdigit():
        return (int(low) + int(high)) / 2
    return -1

In [None]:
def set_income_label(average_income,low_threshold,high_threshold):
    """Map an average salary to a coarse income label.

    Parameters
    ----------
    average_income : int or float
        Average salary; negative values (the -1 sentinel produced by
        ``get_avg``) mean "no usable salary".
    low_threshold, high_threshold : float
        Boundaries between low/medium and medium/high.

    Returns
    -------
    str
        "low salary" below ``low_threshold``, "medium salary" from
        ``low_threshold`` up to and including ``high_threshold``,
        "high salary" above it, and "" for missing (negative or NaN) values.

    Previously the -1 sentinel was labelled "low salary" and a value exactly
    equal to a threshold fell through to "".
    """
    if average_income < 0:
        return ""
    if average_income < low_threshold:
        return "low salary"
    if average_income <= high_threshold:
        return "medium salary"
    if average_income > high_threshold:
        return "high salary"
    # Only reached when every comparison is False, i.e. for NaN.
    return ""

In [None]:
df_salary = df_enc_final.copy()

# Missing and empty salary ranges are both replaced by the -1 sentinel.
salary_copy = df_salary[["salary_range"]].fillna(-1).replace("", -1)

# Average of each range; get_avg yields -1 where no usable salary exists.
df_salary["average"] = salary_copy["salary_range"].apply(get_avg)

# Thresholds are the 20th and 80th percentiles of the rows whose average is not negative.
rows_without_salary = df_salary[df_salary["average"] < 0].index
salary_threshold_df = df_salary.drop(rows_without_salary)
salary_thresholds = np.percentile(salary_threshold_df["average"], [20, 80])

In [None]:
# Turn the numeric average into a low/medium/high label using the percentile thresholds.
df_salary["average"] = df_salary["average"].apply(
    lambda average: set_income_label(average, salary_thresholds[0], salary_thresholds[1])
)

# One-hot encode the income label; column names come from the fitted categories.
oh_df_salary = pd.DataFrame(
    OH_enconder.fit_transform(df_salary[["average"]]).toarray()
)
oh_category_list = OH_enconder.categories_
oh_category_list_prefix = [
    emp_type_fun(category, "Income") for category in oh_category_list[0]
]
oh_df_salary.columns = oh_category_list_prefix

# Append the indicator columns and drop the raw salary_range column.
df_enc_final = pd.concat([df_enc_final, oh_df_salary], axis=1).drop(
    columns=["salary_range"]
)


### Saving

In [None]:
# Raw string literal: "\J", "\A" and "\F" are not valid escape sequences, so the
# non-raw form triggers a SyntaxWarning on recent Python versions. The resulting
# path is unchanged.
save_path_encoding = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_encoded.csv"

# Persist the encoded dataset only when saving is switched on.
if save_data:
    df_enc_final.to_csv(save_path_encoding,index=False)

## Model testing - encoded data

### Import + overview

In [None]:
# Raw string literal: "\J", "\A" and "\F" are not valid escape sequences, so the
# non-raw form triggers a SyntaxWarning on recent Python versions. The resulting
# path is unchanged.
enc_df_path = r"C:\JupyterNotebook\Applicerad_AI\Final_data/fake_jobs_dataset_v2_encoded.csv"

# Either reload the encoded dataset from disk or reuse the frame built above.
if read_data:
    enc_df = pd.read_csv(enc_df_path)
else:
    enc_df = df_enc_final.copy()

### 2.1 Only company profile

In [None]:
# Experiment 2.1: encoded columns + TF-IDF of clean_company_profile.
model_df = enc_df.copy()

In [None]:
# Drop every other free-text column.
model_df = model_df.drop(columns=["clean_description","clean_requirements","clean_benefits","clean_title"])

In [None]:
model_df.drop_duplicates(inplace=True)

In [None]:
X = model_df.drop(columns=["fraudulent"])
y = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X.fillna(" ",inplace=True)

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X,y, train_size = 0.8, random_state=0)

In [None]:
# min_df=2 ignores terms that appear in fewer than two training documents.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned on the training split; the test split is only transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train["clean_company_profile"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["clean_company_profile"])

In [None]:
# The raw text column has been vectorized; only the remaining columns stay in X_train/X_test.
X_train = X_train.drop(columns=["clean_company_profile"])
X_test = X_test.drop(columns=["clean_company_profile"])

In [None]:
# Final feature matrix: TF-IDF columns followed by the remaining columns.
X_train_tfidf = sparse.hstack((X_train_tfidf,X_train))
X_test_tfidf = sparse.hstack((X_test_tfidf,X_test))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="2.1 Encoded dataset - Bara clean company profile")
full_frame = models_produced[1]

### 2.2 Only Description

In [None]:
# Experiment 2.2: encoded columns + TF-IDF of clean_description.
model_df = enc_df.copy()

In [None]:
# Drop every other free-text column.
model_df = model_df.drop(columns=["clean_requirements","clean_company_profile","clean_benefits","clean_title"])

In [None]:
model_df.drop_duplicates(inplace=True)

In [None]:
X = model_df.drop(columns=["fraudulent"])
y = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X.fillna(" ",inplace=True)

# NOTE(review): this experiment uses a 70/30 split whereas 2.1 uses 80/20 - confirm this is intended.
X_train,X_test,y_train, y_test = train_test_split(X,y, train_size = 0.7, random_state=0)

In [None]:
# min_df=2 ignores terms that appear in fewer than two training documents.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned on the training split; the test split is only transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train["clean_description"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["clean_description"])

In [None]:
# The raw text column has been vectorized; only the remaining columns stay in X_train/X_test.
X_train = X_train.drop(columns=["clean_description"])
X_test = X_test.drop(columns=["clean_description"])

In [None]:
# Final feature matrix: TF-IDF columns followed by the remaining columns.
X_train_tfidf = sparse.hstack((X_train_tfidf,X_train))
X_test_tfidf = sparse.hstack((X_test_tfidf,X_test))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="2.2 Encoded dataset - Endast Description")
full_frame = models_produced[1]

### 2.3 Combining text data into one column

In [None]:
model_df = enc_df.copy()

In [None]:
to_add = ["clean_description","clean_requirements","clean_company_profile","clean_benefits","clean_title"]

model_df["full_text"] = ""

for col in to_add:
    model_df["full_text"] = model_df["full_text"] + model_df[col].fillna("")

In [None]:
model_df.drop_duplicates(inplace=True)

In [None]:
# Only full_text is kept as free text; the individual text columns and the target are removed from X.
X = model_df.drop(columns=["clean_title","clean_benefits","fraudulent","clean_requirements","clean_company_profile","clean_description"])
y = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X.fillna(" ",inplace=True)

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X,y, train_size = 0.7, random_state=0)

In [None]:
# min_df=2 ignores terms that appear in fewer than two training documents.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned on the training split; the test split is only transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train["full_text"])
X_test_tfidf = tfidf_vectorizer.transform(X_test["full_text"])

In [None]:
# The raw text column has been vectorized; only the remaining columns stay in X_train/X_test.
X_train = X_train.drop(columns=["full_text"])
X_test = X_test.drop(columns=["full_text"])

In [None]:
# Final feature matrix: TF-IDF columns followed by the remaining columns.
X_train_tfidf = sparse.hstack((X_train_tfidf,X_train))
X_test_tfidf = sparse.hstack((X_test_tfidf,X_test))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="2.3 Encoded dataset - All text info i en kolumn")
full_frame = models_produced[1]

### 2.4 All textdata vectorized separately

In [None]:
# Experiment 2.4: every free-text column gets its own TF-IDF block.
model_df = enc_df.copy()

In [None]:
model_df.drop_duplicates(inplace=True)

In [None]:
X = model_df.drop(columns=["fraudulent"])
y = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X.fillna(" ",inplace=True)

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X,y, train_size = 0.7, random_state=0)

In [None]:
# A single vectorizer object is re-fitted for each column, so each train/test pair must be
# produced before the next fit_transform; afterwards the object only holds the last vocabulary.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

X_train_tfidf_company = tfidf_vectorizer.fit_transform(X_train["clean_company_profile"])
X_test_tfidf_company = tfidf_vectorizer.transform(X_test["clean_company_profile"])

X_train_tfidf_description = tfidf_vectorizer.fit_transform(X_train["clean_description"])
X_test_tfidf_description = tfidf_vectorizer.transform(X_test["clean_description"])

X_train_tfidf_req = tfidf_vectorizer.fit_transform(X_train["clean_requirements"])
X_test_tfidf_req = tfidf_vectorizer.transform(X_test["clean_requirements"])

X_train_tfidf_benefits = tfidf_vectorizer.fit_transform(X_train["clean_benefits"])
X_test_tfidf_benefits = tfidf_vectorizer.transform(X_test["clean_benefits"])

X_train_tfidf_title = tfidf_vectorizer.fit_transform(X_train["clean_title"])
X_test_tfidf_title = tfidf_vectorizer.transform(X_test["clean_title"])



In [None]:
# All raw text columns have been vectorized; only the remaining columns stay in X_train/X_test.
X_train = X_train.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])
X_test = X_test.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])

In [None]:
# Final feature matrix: the five TF-IDF blocks followed by the remaining columns (same order for train and test).
X_train_tfidf = sparse.hstack((X_train_tfidf_company,X_train_tfidf_description,X_train_tfidf_req,X_train_tfidf_benefits,X_train_tfidf_title,X_train))
X_test_tfidf = sparse.hstack((X_test_tfidf_company,X_test_tfidf_description,X_test_tfidf_req,X_test_tfidf_benefits,X_test_tfidf_title,X_test))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="2.4 Full dataset - All text data vektoriserat separat")
full_frame = models_produced[1]

### 2.5 All text data except title

In [None]:
# Experiment 2.5: separate TF-IDF blocks for every free-text column except clean_title.
model_df = enc_df.copy()

In [None]:
model_df.drop_duplicates(inplace=True)

In [None]:
# clean_title is removed together with the target.
X = model_df.drop(columns=["fraudulent","clean_title"])
y = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X.fillna(" ",inplace=True)

# 70/30 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X,y, train_size = 0.7, random_state=0)

In [None]:
# A single vectorizer object is re-fitted for each column, so each train/test pair must be
# produced before the next fit_transform.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

X_train_tfidf_company = tfidf_vectorizer.fit_transform(X_train["clean_company_profile"])
X_test_tfidf_company = tfidf_vectorizer.transform(X_test["clean_company_profile"])

X_train_tfidf_description = tfidf_vectorizer.fit_transform(X_train["clean_description"])
X_test_tfidf_description = tfidf_vectorizer.transform(X_test["clean_description"])

X_train_tfidf_req = tfidf_vectorizer.fit_transform(X_train["clean_requirements"])
X_test_tfidf_req = tfidf_vectorizer.transform(X_test["clean_requirements"])

X_train_tfidf_benefits = tfidf_vectorizer.fit_transform(X_train["clean_benefits"])
X_test_tfidf_benefits = tfidf_vectorizer.transform(X_test["clean_benefits"])

In [None]:
# All raw text columns have been vectorized; only the remaining columns stay in X_train/X_test.
X_train = X_train.drop(columns=["clean_company_profile","clean_benefits","clean_description","clean_requirements"])
X_test = X_test.drop(columns=["clean_company_profile","clean_benefits","clean_description","clean_requirements"])

In [None]:
# Final feature matrix: the four TF-IDF blocks followed by the remaining columns (same order for train and test).
X_train_tfidf = sparse.hstack((X_train_tfidf_company,X_train_tfidf_description,X_train_tfidf_req,X_train_tfidf_benefits,X_train))
X_test_tfidf = sparse.hstack((X_test_tfidf_company,X_test_tfidf_description,X_test_tfidf_req,X_test_tfidf_benefits,X_test))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train,X_test_tfidf,y_test,save_data, full_frame,test_name="2.5 Full dataset - All text data förutom title separat vektoriserat")
full_frame = models_produced[1]

### Undersampling. 

In [None]:
# Shared setup for experiments 3.1-3.4: split once, then undersample the training part.
model_df = enc_df.copy()

In [None]:
model_df.drop_duplicates(inplace=True)

In [None]:
X_full = model_df.drop(columns=["fraudulent"])
y_full = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X_full.fillna(" ",inplace=True)

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_full,y_full, train_size = 0.8, random_state=0)

# Only the training split is resampled; the test split keeps its original class distribution.
# NOTE(review): `rus` is defined outside this part of the notebook - presumably a RandomUnderSampler; confirm its random_state.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

### 3.1 Undersampling company profile

In [None]:
# Experiment 3.1: undersampled training data, encoded columns + TF-IDF of clean_company_profile.
X_train_rus_company = X_train_rus.drop(columns=["clean_requirements","clean_description","clean_benefits","clean_title"])

# The test copy still contains all text columns; they are dropped after vectorizing.
X_test_company = X_test.copy()

In [None]:
# min_df=2 ignores terms that appear in fewer than two training documents.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned on the resampled training split; the test split is only transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_rus["clean_company_profile"])
X_test_tfidf = tfidf_vectorizer.transform(X_test_company["clean_company_profile"])

In [None]:
# After these drops train and test hold the same non-text columns.
X_train_rus_company = X_train_rus_company.drop(columns=["clean_company_profile"])
X_test_company = X_test_company.drop(columns=["clean_company_profile","clean_requirements","clean_description","clean_benefits","clean_title"])

In [None]:
# Final feature matrix: TF-IDF columns followed by the remaining columns.
X_train_tfidf = sparse.hstack((X_train_tfidf,X_train_rus_company))
X_test_tfidf = sparse.hstack((X_test_tfidf,X_test_company))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="3.1 Undersampled Encoded dataset - Company profile + encodade kolumner")
full_frame = models_produced[1]

### 3.2 Undersampling Description

In [None]:
# Experiment 3.2: undersampled training data, encoded columns + TF-IDF of clean_description.
X_train_rus_description = X_train_rus.drop(columns=["clean_requirements","clean_company_profile","clean_benefits","clean_title"])

# The test copy still contains all text columns; they are dropped after vectorizing.
X_test_description = X_test.copy()

In [None]:
# min_df=2 ignores terms that appear in fewer than two training documents.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

# The vocabulary is learned on the resampled training split; the test split is only transformed.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_rus_description["clean_description"])
X_test_tfidf = tfidf_vectorizer.transform(X_test_description["clean_description"])

In [None]:
# After these drops train and test hold the same non-text columns.
X_train_rus_description = X_train_rus_description.drop(columns=["clean_description"])
X_test_description = X_test_description.drop(columns=["clean_company_profile","clean_requirements","clean_description","clean_benefits","clean_title"])

In [None]:
# Final feature matrix: TF-IDF columns followed by the remaining columns.
X_train_tfidf = sparse.hstack((X_train_tfidf,X_train_rus_description))
X_test_tfidf = sparse.hstack((X_test_tfidf,X_test_description))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="3.2 Undersampled Encoded dataset - Description + encodade kolumner")
full_frame = models_produced[1]

### 3.3 All information vectorized separately 

In [None]:
# Experiment 3.3: undersampled training data, every free-text column gets its own TF-IDF block.
X_train_rus_all = X_train_rus.copy()

X_test_all = X_test.copy()

In [None]:
# A single vectorizer object is re-fitted for each column, so each train/test pair must be
# produced before the next fit_transform.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

X_train_tfidf_company = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_company_profile"])
X_test_tfidf_company = tfidf_vectorizer.transform(X_test_all["clean_company_profile"])

X_train_tfidf_description = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_description"])
X_test_tfidf_description = tfidf_vectorizer.transform(X_test_all["clean_description"])

X_train_tfidf_req = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_requirements"])
X_test_tfidf_req = tfidf_vectorizer.transform(X_test_all["clean_requirements"])

X_train_tfidf_benefits = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_benefits"])
X_test_tfidf_benefits = tfidf_vectorizer.transform(X_test_all["clean_benefits"])

X_train_tfidf_title = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_title"])
X_test_tfidf_title = tfidf_vectorizer.transform(X_test_all["clean_title"])

In [None]:
# All raw text columns have been vectorized; only the remaining columns are kept.
X_train_rus_all = X_train_rus_all.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])
X_test_all = X_test_all.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])

In [None]:
# Final feature matrix: the five TF-IDF blocks followed by the remaining columns (same order for train and test).
X_train_tfidf = sparse.hstack((X_train_tfidf_company,X_train_tfidf_description,X_train_tfidf_req,X_train_tfidf_benefits,X_train_tfidf_title,X_train_rus_all))
X_test_tfidf = sparse.hstack((X_test_tfidf_company,X_test_tfidf_description,X_test_tfidf_req,X_test_tfidf_benefits,X_test_tfidf_title,X_test_all))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="3.3 Full dataset - All text data separat vektorisering")
full_frame = models_produced[1]

### 3.4 All info except title

In [None]:
# Experiment 3.4: undersampled training data, separate TF-IDF blocks for every text column except clean_title.
X_train_rus_no_title = X_train_rus.drop(columns=["clean_title"])

# The test copy still contains clean_title; it is dropped together with the other text columns below.
X_test_no_title = X_test.copy()

In [None]:
# A single vectorizer object is re-fitted for each column, so each train/test pair must be
# produced before the next fit_transform.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

X_train_tfidf_company = tfidf_vectorizer.fit_transform(X_train_rus_no_title["clean_company_profile"])
X_test_tfidf_company = tfidf_vectorizer.transform(X_test_no_title["clean_company_profile"])

X_train_tfidf_description = tfidf_vectorizer.fit_transform(X_train_rus_no_title["clean_description"])
X_test_tfidf_description = tfidf_vectorizer.transform(X_test_no_title["clean_description"])

X_train_tfidf_req = tfidf_vectorizer.fit_transform(X_train_rus_no_title["clean_requirements"])
X_test_tfidf_req = tfidf_vectorizer.transform(X_test_no_title["clean_requirements"])

X_train_tfidf_benefits = tfidf_vectorizer.fit_transform(X_train_rus_no_title["clean_benefits"])
X_test_tfidf_benefits = tfidf_vectorizer.transform(X_test_no_title["clean_benefits"])

In [None]:
# After these drops train and test hold the same non-text columns.
X_train_rus_no_title = X_train_rus_no_title.drop(columns=["clean_company_profile","clean_benefits","clean_description","clean_requirements"])
X_test_no_title = X_test_no_title.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])

In [None]:
# Final feature matrix: the four TF-IDF blocks followed by the remaining columns (same order for train and test).
X_train_tfidf = sparse.hstack((X_train_tfidf_company,X_train_tfidf_description,X_train_tfidf_req,X_train_tfidf_benefits,X_train_rus_no_title))
X_test_tfidf = sparse.hstack((X_test_tfidf_company,X_test_tfidf_description,X_test_tfidf_req,X_test_tfidf_benefits,X_test_no_title))

In [None]:
# NOTE(review): model_benchmarking is defined elsewhere; element [1] of its result is carried forward as full_frame.
models_produced = model_benchmarking(classifier_list,clasifier_names,X_train_tfidf,y_train_rus,X_test_tfidf,y_test,save_data, full_frame,test_name="3.4 Full dataset - All text förutom title")
full_frame = models_produced[1]

### Final result

In [None]:
# Show all benchmark rows ordered by the "Recall" column, highest first (full_frame itself is not modified).
full_frame.sort_values(by=["Recall"],ascending=False)

### Best model

In [None]:
# Rebuild the feature matrices of experiment 3.3 (undersampled training data,
# one TF-IDF block per free-text column) for the final model.
model_df = enc_df.copy()
model_df.drop_duplicates(inplace=True)

X_full = model_df.drop(columns=["fraudulent"])
y_full = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X_full.fillna(" ",inplace=True)

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_full,y_full, train_size = 0.8, random_state=0)

# Only the training split is resampled.
# NOTE(review): `rus` is defined outside this part of the notebook - confirm its random_state.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

X_train_rus_all = X_train_rus.copy()

X_test_all = X_test.copy()

# A single vectorizer object is re-fitted for each column, so each train/test pair must be
# produced before the next fit_transform.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

X_train_tfidf_company = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_company_profile"])
X_test_tfidf_company = tfidf_vectorizer.transform(X_test_all["clean_company_profile"])

X_train_tfidf_description = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_description"])
X_test_tfidf_description = tfidf_vectorizer.transform(X_test_all["clean_description"])

X_train_tfidf_req = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_requirements"])
X_test_tfidf_req = tfidf_vectorizer.transform(X_test_all["clean_requirements"])

X_train_tfidf_benefits = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_benefits"])
X_test_tfidf_benefits = tfidf_vectorizer.transform(X_test_all["clean_benefits"])

X_train_tfidf_title = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_title"])
X_test_tfidf_title = tfidf_vectorizer.transform(X_test_all["clean_title"])

# All raw text columns have been vectorized; only the remaining columns are kept.
X_train_rus_all = X_train_rus_all.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])
X_test_all = X_test_all.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])


# Final feature matrix: the five TF-IDF blocks followed by the remaining columns (same order for train and test).
X_train_tfidf = sparse.hstack((X_train_tfidf_company,X_train_tfidf_description,X_train_tfidf_req,X_train_tfidf_benefits,X_train_tfidf_title,X_train_rus_all))
X_test_tfidf = sparse.hstack((X_test_tfidf_company,X_test_tfidf_description,X_test_tfidf_req,X_test_tfidf_benefits,X_test_tfidf_title,X_test_all))

In [None]:
# Train a LinearSVC with default settings on the undersampled training data (fit returns the estimator).
clf = LinearSVC().fit(X_train_tfidf, y_train_rus)
predictions = clf.predict(X_test_tfidf)

# Accuracy and macro-averaged recall on the untouched test split, rounded to five decimals.
acc = round(accuracy_score(y_test, predictions), 5)
rec = round(recall_score(y_test, predictions, average="macro"), 5)
print(f"The Accuracy of the model is: {acc} and recall score: {rec}")

# Confusion matrix plot.
cm = confusion_matrix(y_test, predictions)
display_cm = ConfusionMatrixDisplay(confusion_matrix=cm)
display_cm.plot()

### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid for LinearSVC: loss function, regularisation strength C and iteration cap.
parameters = {"loss" :("hinge","squared_hinge"),"C":[1,5,10,20],"max_iter":[1000,10000]}
# NOTE(review): scoring="recall" is the recall of the positive class, while the cells that report
# results use recall_score(..., average="macro") - confirm which metric should drive the search.
grid_clf = GridSearchCV(clf,parameters, scoring="recall")
grid_clf.fit(X_train_tfidf,y_train_rus)
# Displayed as the cell output.
grid_clf.best_estimator_

In [None]:
# Retrain with the hinge loss (all other LinearSVC settings at their defaults); fit returns the estimator.
clf = LinearSVC(loss="hinge").fit(X_train_tfidf, y_train_rus)
predictions = clf.predict(X_test_tfidf)

# Accuracy and macro-averaged recall on the untouched test split, rounded to five decimals.
acc = round(accuracy_score(y_test, predictions), 5)
rec = round(recall_score(y_test, predictions, average="macro"), 5)
print(f"The Accuracy of the model is: {acc} and recall score: {rec}")

# Confusion matrix plot.
cm = confusion_matrix(y_test, predictions)
display_cm = ConfusionMatrixDisplay(confusion_matrix=cm)
display_cm.plot()

### Which datapoints are wrongly classified?

In [None]:
# Rebuild the same feature matrices as in the "Best model" section (undersampled training
# data, one TF-IDF block per free-text column) before analysing the predictions.
model_df = enc_df.copy()
model_df.drop_duplicates(inplace=True)

X_full = model_df.drop(columns=["fraudulent"])
y_full = model_df["fraudulent"]

# Replace remaining missing values by a blank string (presumably so the vectorizer never sees NaN).
X_full.fillna(" ",inplace=True)

# 80/20 split with a fixed seed.
X_train,X_test,y_train, y_test = train_test_split(X_full,y_full, train_size = 0.8, random_state=0)

# Only the training split is resampled.
# NOTE(review): `rus` is defined outside this part of the notebook - confirm its random_state.
X_train_rus, y_train_rus = rus.fit_resample(X_train,y_train)

X_train_rus_all = X_train_rus.copy()

X_test_all = X_test.copy()

# A single vectorizer object is re-fitted for each column, so each train/test pair must be
# produced before the next fit_transform.
tfidf_vectorizer  = TfidfVectorizer(min_df=2)

X_train_tfidf_company = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_company_profile"])
X_test_tfidf_company = tfidf_vectorizer.transform(X_test_all["clean_company_profile"])

X_train_tfidf_description = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_description"])
X_test_tfidf_description = tfidf_vectorizer.transform(X_test_all["clean_description"])

X_train_tfidf_req = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_requirements"])
X_test_tfidf_req = tfidf_vectorizer.transform(X_test_all["clean_requirements"])

X_train_tfidf_benefits = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_benefits"])
X_test_tfidf_benefits = tfidf_vectorizer.transform(X_test_all["clean_benefits"])

X_train_tfidf_title = tfidf_vectorizer.fit_transform(X_train_rus_all["clean_title"])
X_test_tfidf_title = tfidf_vectorizer.transform(X_test_all["clean_title"])

# All raw text columns have been vectorized; only the remaining columns are kept.
X_train_rus_all = X_train_rus_all.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])
X_test_all = X_test_all.drop(columns=["clean_company_profile","clean_benefits","clean_title","clean_description","clean_requirements"])


# Final feature matrix: the five TF-IDF blocks followed by the remaining columns (same order for train and test).
X_train_tfidf = sparse.hstack((X_train_tfidf_company,X_train_tfidf_description,X_train_tfidf_req,X_train_tfidf_benefits,X_train_tfidf_title,X_train_rus_all))
X_test_tfidf = sparse.hstack((X_test_tfidf_company,X_test_tfidf_description,X_test_tfidf_req,X_test_tfidf_benefits,X_test_tfidf_title,X_test_all))

In [None]:
# Class counts in the resampled training target.
y_train_rus.value_counts()

In [None]:
# Default LinearSVC trained on the resampled data; the predictions on the test split
# are compared with y_test in the cells below.
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train_rus)
predictions = clf.predict(X_test_tfidf)

#### Overview of correctly classified datapoints

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
index_of_correct_classification = []

for index,pred,true_value in zip(list(y_test.index),predictions,y_test):
    if pred == true_value:
        index_of_correct_classification.append(index)

In [None]:
frame = df_non_mod.loc[index_of_correct_classification]

In [None]:
# Percentage of missing values per column of `frame`, rounded to four decimals.
# Computed in one vectorised step instead of concatenating a one-row frame per
# column onto an initially empty frame (quadratic, and concatenating with an
# empty frame is deprecated in recent pandas). The result keeps the same
# layout: one row per column name, a single "Missing %" column.
full_missing_frame = pd.DataFrame(
    {"Missing %": (frame.isna().sum() / len(frame) * 100).round(4)}
)

In [None]:
# Move the column names out of the index into a regular "index" column for plotting.
overview_df = full_missing_frame.reset_index()

# Colour per bar: red above 60 % missing, otherwise blue.
col = [
    "red" if missing_share > 60 else "blue"
    for missing_share in overview_df["Missing %"]
]

In [None]:
# Bar chart of the missing-value share per column for the correctly classified ads.
# All bars are drawn in blue; the `col` list built in the previous cell is not used here.
plt.figure(figsize=(18,14))
ax = sns.barplot(data=overview_df,x="index", y="Missing %",color="blue")
plt.title("Missing data for correctly classified ads",fontsize=20)
# Rotate and enlarge the column names on the x axis.
for item in ax.get_xticklabels():
    item.set_rotation(75)
    item.set_fontsize(20)
ax.set_xlabel("",fontsize = 0)
ax.set_ylabel("Missing %", fontsize=20)
# Print the value on top of each bar; the trailing ";" suppresses the cell output.
ax.bar_label(ax.containers[0]);

#### Overview of incorrectly classified datapoints

In [None]:
index_of_wrong_classification = []

for index,pred,true_value in zip(list(y_test.index),predictions,y_test):
    if pred != true_value:
        index_of_wrong_classification.append(index)

In [None]:
frame = df_non_mod.loc[index_of_wrong_classification]

In [None]:
# Percentage of missing values per column of `frame`, rounded to four decimals.
# Computed in one vectorised step instead of concatenating a one-row frame per
# column onto an initially empty frame (quadratic, and concatenating with an
# empty frame is deprecated in recent pandas). The result keeps the same
# layout: one row per column name, a single "Missing %" column.
full_missing_frame = pd.DataFrame(
    {"Missing %": (frame.isna().sum() / len(frame) * 100).round(4)}
)

In [None]:
# Move the column names out of the index into a regular "index" column for plotting.
overview_df = full_missing_frame.reset_index()

# Colour per bar: three selected columns are highlighted in red, the rest are blue.
# NOTE(review): confirm these names match the columns of df_non_mod.
col = [
    "red"
    if row[0] in ("clean_company_profile", "has_company_logo", "has_questions")
    else "blue"
    for row in overview_df.values
]

In [None]:
# Print the column names shown on the x axis of the plot below.
# A plain loop is used on purpose: the previous list comprehension assigned its
# result (a list of None values returned by print) to `col`, overwriting the bar
# colours that the next cell passes to sns.barplot as `palette`.
for row in overview_df.values:
    print(row[0])

In [None]:
# Bar chart of the missing-value share per column for the incorrectly classified ads.
# Bar colours are taken from the `col` list, one entry per bar.
plt.figure(figsize=(18,14))
ax = sns.barplot(data=overview_df,x="index", y="Missing %",palette=col)
plt.title("Missing data for incorrectly classified ads",fontsize=20)
# Rotate and enlarge the column names on the x axis.
for item in ax.get_xticklabels():
    item.set_rotation(75)
    item.set_fontsize(20)
ax.set_xlabel("",fontsize = 0)
ax.set_ylabel("Missing %", fontsize=20)
# Print the value on top of each bar; the trailing ";" suppresses the cell output.
ax.bar_label(ax.containers[0]);