# First Assignment of Data Mining: Classifiers

## Compacting the files
The first task is converting the Excel data into parquet files for faster processing.

In [None]:
from src.write import write_parquet
from src.read import read_xlsx

path = 'data/'

# Convert every Excel workbook into a parquet file that keeps the same base
# name: first the existing customers, then the potential customers.
for fname in ('existing-customers.xlsx', 'potential-customers.xlsx'):
    df = read_xlsx(path + fname)
    # everything before the first dot is the base name of the file
    base_name, _, _ = fname.partition('.')
    new_fname = base_name + '.parquet'
    write_parquet(df, path + new_fname)

## Cleaning and Preprocessing
We are interested in the distributions obtained from the imputation of the data
so we will plot the distributions of the imputed columns before and after
imputation.

In [None]:
import seaborn as sns

from matplotlib import pyplot as plt
from src.read import read_parquet
from src.impute import impute_decision_tree
from src.clean import remove_rowid

path = 'data/'
fname = 'existing-customers.parquet'

# read the dataset
df = read_parquet(path + fname)

# remove the rowid column
df = remove_rowid(df)

# get the columns that contain at least one missing value
null_columns = df.columns[df.isnull().any()]

# plot the distributions before the imputation
for null_col in null_columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[null_col].sort_values(), bins=len(df[null_col].unique()),
                 stat='count', discrete=True)
    plt.title(null_col)
    plt.xticks(rotation=45, ha='right')
    plt.show()

# Remember, per column, the integer row *positions* of the missing values.
# Positions (rather than index labels) are stored because they are looked up
# with the positional indexer `.iloc` below; labels and positions only
# coincide when the frame has a default 0..n-1 index.
indices_dict = {
    null_col: df[null_col].isnull().to_numpy().nonzero()[0]
    for null_col in null_columns
}

# impute the missing values
# NOTE(review): the lookup below assumes impute_decision_tree keeps the rows
# in their original order -- confirm against src.impute.
df = impute_decision_tree(df)

# plot the distributions after the imputation, restricted to the rows whose
# value was missing before
for null_col in null_columns:
    plt.figure(figsize=(10, 6))
    imputed_values = df[null_col].iloc[indices_dict[null_col]]
    sns.histplot(imputed_values.sort_values(),
                 bins=len(df[null_col].unique()), stat='count', discrete=True)
    plt.title(null_col + ' (imputed)')
    plt.xticks(rotation=45, ha='center')
    plt.show()


We will now do the rest of the cleaning and preprocessing to make it ready
for the classifiers.

In [None]:
from src.clean import change_class
from src.encode import dummify

# First turn the class column into a binary column, then dummify the
# resulting dataframe.
df = dummify(change_class(df))

## Dataset splits
Our next task is to split the data into training and testing sets.

In [None]:
import pandas as pd
from src.dataset_split import get_stratified_kfold_split, get_stratified_split

# Split off the held-out ("true") test set first.
holdout_split = get_stratified_split(df)
true_train_X, true_test_X, true_train_y, true_test_y = holdout_split

# From here on `df` only holds the training part (features and label side by
# side); the stratified k-fold split is computed on it and materialised as a
# list so that it can be iterated more than once.
df = pd.concat((true_train_X, true_train_y), axis='columns')
split = [*get_stratified_kfold_split(df)]

## Check the Classifier Results
Import all the models.

In [None]:
from src.models.gradient_boosted_trees_classifier import gradient_boosted_trees
from src.models.decision_tree_classifier import decision_tree
from src.models.random_forest_classifier import random_forest
from src.models.knn_classifier import knn

Evaluate all the models and store all the results in a dataframe.

In [None]:
from src.evaluation import evaluate
from src.dataset_split import get_X_y

# One builder per candidate model. Every builder is called with the training
# features and labels of a fold and returns the model handed to `evaluate`.
model_builders = {
    'decision tree': decision_tree,
    'gradient boosted trees': gradient_boosted_trees,
    # knn receives 5 as an additional third positional argument
    'knn classifier': lambda X, y: knn(X, y, 5),
    'random forest': random_forest,
}

# cost of every model on every fold, keyed by model name
costs_dict = {name: list() for name in model_builders}

# One dict per (fold, model) evaluation. The dataframe is built once after
# the loop instead of being grown with pd.concat on every iteration, which
# also gives every row its own index label.
score_rows = list()
for fold, (train_idces, test_idces) in enumerate(split):
    train_df = df.iloc[train_idces]
    test_df = df.iloc[test_idces]
    train_X, train_y = get_X_y(train_df)
    test_X, test_y = get_X_y(test_df)

    for name, build_model in model_builders.items():
        model = build_model(train_X, train_y)
        score, precision, recall, f1, cost = evaluate(model, test_X, test_y,
                                                      verbose=False)
        score_rows.append(
            {'model': name, 'accuracy': score, 'precision': precision,
             'recall': recall, 'f1': f1, 'cost': cost, 'fold': fold})
        costs_dict[name].append(cost)

# the explicit column list keeps the layout stable even without any folds
scores_df = pd.DataFrame(
    score_rows,
    columns=['model', 'accuracy', 'precision', 'recall', 'f1', 'cost',
             'fold'])


Write the dataframe to a parquet file for later analysis.

In [None]:
# Store the per-fold scores in the same directory as the input data.
scores_df.to_parquet(f'{path}scores.parquet')

Print out the average cost of each model over the different folds.

In [None]:
import numpy as np

# Per model: its name, the average cost over the folds, and an empty line.
for model_name, fold_costs in costs_dict.items():
    print(model_name, np.average(fold_costs), '', sep='\n')

Create a violin plot out of the costs of the different models.

In [None]:
# One violin per model, showing how its cost is spread over the folds.
_, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=scores_df, x='model', y='cost', ax=ax)
plt.show()

## Hyperparameter Tuning
We can now clearly see that the lowest cost comes from the gradient boosted
trees classifier. Therefore, we will use this model and tune hyperparameters
on this model.

In [None]:
from src.optimization import simulated_annealing, get_model_from_state

# Starting point of the search.
# NOTE(review): the meaning of the five entries is defined in
# src.optimization -- confirm there before changing them.
initial_state = (.4, 15, 20, .8, .2)

# Search for a better state on the k-fold splits of the training data.
state = simulated_annealing(initial_state=initial_state, split=split, df=df)

# Build the model described by the found state, fit it on the training part
# and evaluate it on the held-out test set.
model = get_model_from_state(state)
model.fit(true_train_X, true_train_y)
evaluate(model, true_test_X, true_test_y)

## Final Results
Obtain the final results by building a model from the tuned state and training
it on the whole labeled dataset.

In [None]:
# Build a fresh model from the tuned state and fit it on the whole labeled
# dataset. `df` only holds the training part of the stratified split at this
# point, so the held-out test rows are appended again to really use every
# labeled customer.
model = get_model_from_state(state)
full_X = pd.concat([true_train_X, true_test_X])
full_y = pd.concat([true_train_y, true_test_y])
model.fit(full_X, full_y)

Load the potential customers and prepare them for prediction; their missing
values need to be imputed as well.

In [None]:
path = 'data/'
potential_customers_fname = 'potential-customers.parquet'

# Load the potential customers.
potential_customers_df = read_parquet(f'{path}{potential_customers_fname}')

# Keep the ids separately; the RowID column is removed from the frame below.
potential_customers = potential_customers_df['RowID']

# Same preparation as for the existing customers: remove the rowid column,
# impute the missing values and dummify the dataframe.
potential_customers_df = dummify(
    impute_decision_tree(remove_rowid(potential_customers_df)))


Perform prediction on the obtained dataframe `potential_customers_df`.

In [None]:
# Feature columns of the labeled data, in their original order (everything
# except the label column).
feature_columns = [col for col in df.columns if col != 'class']

# Align the potential customers with those features: dummy columns that do
# not occur in this dataset are added and filled with 0, columns the labeled
# data does not have are dropped, and the column order is made identical, so
# that the model receives the same feature layout as during training.
potential_customers_df = potential_customers_df.reindex(
    columns=feature_columns, fill_value=0)

predictions = model.predict(potential_customers_df)
predictions

Calculate the costs of the predictions. We take a rate of
0.63 = 1385/(824+1385) = TP/(FP+TP) as the ratio to work with, also known as the
precision. This means that approximately 63% of the positively predicted
customers can win us money.

In [None]:
# get the total positively marked customers
total = sum(predictions)

# calculate the # of true positives
TP = 1385
FP = 824
ratio = TP / (TP + FP)

true_positives = int(total * ratio)
false_positives = total - true_positives

# calculate the costs
cost = (0.05 * 310 * false_positives + 10) - (0.1 * 980 * true_positives + 10)
cost

We are going to gain around 300.000 euros with this model.