# Predictive model with Decision Tree Classifier


## Import dependencies

In [1]:
# pandas - pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool.
import pandas as pd

# NumPy - The fundamental package for scientific computing.
import numpy as np

from random import randint

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

## Preparation of the Dataset

In [2]:
path = "../src/dataset/fake_job_postings.csv"

df = pd.read_csv(path, index_col="job_id").fillna("Missing")

target = df["fraudulent"]


def get_most_fraudulent_dummies(feature):
    cross = pd.crosstab(feature, target)
    filtered = cross[1].sort_values(ascending=False).iloc[0:20]
    categories = filtered.index

    dummies = pd.DataFrame(index=feature.index)

    for categorie in categories:
        name = f"{feature.name}-{categorie}"
        categorie_test = feature == categorie
        categorie_dummies = categorie_test.apply(int)
        dummies[name] = categorie_dummies

    return dummies

df = df.join(get_most_fraudulent_dummies(df["department"]))
df = df.join(get_most_fraudulent_dummies(df["employment_type"]))
df = df.join(get_most_fraudulent_dummies(df["required_experience"]))
df = df.join(get_most_fraudulent_dummies(df["required_education"]))
df = df.join(get_most_fraudulent_dummies(df["industry"]))
df = df.join(get_most_fraudulent_dummies(df["function"]))


df["company"] = df["title"]  + " " + df["company_profile"] + " " + df["description"] + " " + df["requirements"] + " " + df["benefits"]


df = df.drop(axis = "columns", columns = ["title", "location", "salary_range", "company_profile", "description", "requirements", "benefits", "fraudulent", "department", "employment_type", "required_experience", "required_education", "industry", "function"])


In [3]:
df

Unnamed: 0_level_0,telecommuting,has_company_logo,has_questions,department-Missing,department-Engineering,department-Clerical,department-Oil & Energy,department-Information Technology,department-Administrative,department-Customer Service,...,function-Marketing,function-Human Resources,function-Management,function-Financial Analyst,function-Advertising,function-Data Analyst,function-Consulting,function-Distribution,function-Design,company
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,"Marketing Intern We're Food52, and we've creat..."
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Customer Service - Cloud Video Production 90 S...
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Commissioning Machinery Assistant (CMA) Valor ...
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Account Executive - Washington DC Our passion ...
5,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bill Review Manager SpotSource Solutions LLC i...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17876,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Account Director - Distribution Vend is looki...
17877,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Payroll Accountant WebLinc is the e-commerce p...
17878,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Project Cost Control Staff Engineer - Cost Con...
17879,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Graphic Designer Missing Nemsia Studios is loo...


In [4]:
class Balanced_dataset:
    def __init__(self, X, y, **kwargs):
        self.X = X
        self.y = y

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, **kwargs)

    def get_balanced_split(self, X, y):
        data_true  = y.loc[y == 1]
        data_false = y.loc[y == 0]
        size_true  = len(data_true)
        size_false = len(data_false)

        if size_true > size_false:
            data_split = np.array_split(data_true, np.floor(size_true / size_false))
            data_base  = data_false
        else:
            data_split = np.array_split(data_false, np.floor(size_false / size_true))
            data_base  = data_true

        y_split = [split.append(data_base) for split in data_split]

        balanced_splits = [(X.loc[split.index], split) for split in y_split]

        return balanced_splits

    def get_train(self):
        return self.get_balanced_split(self.X_train, self.y_train)

    def get_test(self):
        return self.X_test, self.y_test


In [5]:
class Splited_model:
    def __init__(self, dataset, classifier, vectorizer, vectorize_col):
        self.dataset = dataset
        self.classifier = classifier
        self.vectorizer = vectorizer
        self.vectorize_col = vectorize_col

        self.train()

    def train(self):
        self.models  = []
        self.vectors = []

        for X, y in self.dataset.get_train():
            X.reset_index(inplace=True)
            vectorize_Series = X[self.vectorize_col]
            X = X.drop(axis="columns", columns=[self.vectorize_col])

            vector = self.vectorizer()
            vectorize_transform = vector.fit_transform(vectorize_Series).toarray()
            vectorize_dataframe = pd.DataFrame(vectorize_transform)
            X = X.join(vectorize_dataframe)

            model = self.classifier().fit(X, y)

            self.models.append(model)
            self.vectors.append(vector)

    def test(self):
        X, y = self.dataset.get_test()
        predict = self.predict(X)
        score = accuracy_score(y, predict)

        return score

    def predict(self, data):
        data_predicts = []

        for i, model in enumerate(self.models):
            model_data = data.copy()
            model_data.reset_index(inplace=True)
            vectorize_data = model_data[self.vectorize_col]
            model_data = model_data.drop(axis="columns", columns=[self.vectorize_col])

            vector = self.vectors[i]
            data_transform = vector.transform(vectorize_data).toarray()
            data_transform_dataframe = pd.DataFrame(data_transform)
            model_data = model_data.join(data_transform_dataframe)

            predicts = model.predict(model_data)

            for i, predict in enumerate(predicts):
                if i >= len(data_predicts):
                    data_predicts.append([])

                data_predicts[i].append(predict)

        result = [np.round(sum(predict) / len(predict)) for predict in data_predicts]

        return result


In [6]:
dataset = Balanced_dataset(df, target, train_size=70/100)

model = Splited_model(dataset, classifier=DecisionTreeClassifier, vectorizer=CountVectorizer, vectorize_col="company")


In [7]:
accuracy = model.test()

print(f"Accuracy: {accuracy * 100 :.2f}%")


Accuracy: 93.44%
