# CLASSIFICATION

### CLASSIFICATION PLAN:
We collected the labelled data from 2 different origins:
1. **KMeans Clustering** result (which will be called **original data**): 3 distinct classes, one of which is highly **imbalanced**
2. **Fuzzy Kmeans Clustering** result (which will be called **fuzzy data**): 3 distinct classes, not on same magnitude but also not particularly imbalanced

Different approaches will be evaluated:
* Apply classification on original training set i.e. high imbalance in data
* Apply classification on oversampled original training set (robust oversampling method -> SMOTE)
* Apply classification on oversampled original training set and test set (robust oversampling method -> SMOTE)
* Apply classification on fuzzy training set i.e. more balanced data

In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydotplus

from tqdm.notebook import tqdm
from IPython.display import Image 
from collections import Counter
from imblearn.over_sampling import SMOTE
from scipy.stats import randint as sp_randint

from sklearn import tree, metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.python.keras.utils.np_utils import to_categorical

***

In [None]:
# UTILITY FUNCTIONS

# transform categorical attributes in numerical 
def discretize_data(dataset, variables):
    """Append an integer-coded ``<name>_num`` column for each categorical column.

    Codes follow the sorted order of the column's distinct values
    (first sorted value -> 0, second -> 1, ...).

    :param dataset: DataFrame to encode; it is modified in place
    :param variables: names of the categorical columns to encode
    :return: the same ``dataset`` object, with the new columns added
    """
    for column in variables:
        # the sorted distinct values define the code assigned to each category
        categories = sorted(dataset[column].unique())
        codes = {category: code for code, category in enumerate(categories)}
        dataset[column + '_num'] = dataset[column].map(codes).astype(int)
    return dataset

# pretty printing of metrics computed on test set
def report_scores(test_label, test_pred):
    """Print the per-class classification report (precision, recall, F1, support).

    Class names are taken from the notebook-level ``classes`` list.
    ``zero_division=0`` avoids the warnings raised when a class is never
    predicted, which can happen in several places in this notebook.

    :param test_label: true labels
    :param test_pred: predicted labels
    """
    report = classification_report(
        test_label,
        test_pred,
        target_names=classes,
        zero_division=0,
    )
    print(report)

***

# WORKING ON ORIGINAL DATA

***

In [None]:
# Starting dataset: customers labelled by K-Means
# (tab-separated file; the first column is used as the index)
df = pd.read_csv('datasets/clustered_dataframe.csv', sep='\t', index_col=0)
# Show column names, dtypes and non-null counts
df.info()

In [None]:
# Class distribution: for every distinct value of 'Label',
# print the number of rows carrying it
for label in df['Label'].unique():
    print(label,"elements in dataset:",len(df[df['Label'] == label]))

In [None]:
# Encode the categorical columns (two features plus the label) as integers in
# '<name>_num' columns, then drop the original text columns together with the
# attributes that led to bad classification in our experiments
df = discretize_data(df, ['MaxOrderMonth', 'MaxOrderDay', 'Label'])
df.drop(
    columns=[
        'MaxOrderMonth', 'MaxOrderDay', 'Label',
        'SETSaleQta', 'SESaleQtaOrder', 'MinPSale', 'MaxPSale',
    ],
    inplace=True,
    errors='ignore',
)
# Work on a copy so that df keeps its 'Label_num' column
df_class = df.copy()
df_class.tail()

In [None]:
# Separate the target (integer-encoded label) from the features
label = df_class.pop('Label_num')
# 70/30 split into training and test set; stratifying on the label keeps the
# class proportions of the full dataset in both parts
train_set, test_set, train_label, test_label = train_test_split(
    df_class,
    label,
    test_size=0.30,
    stratify=label,
)

***

## CLASSIFICATION on ORIGINAL TRAINING SET

### Decision Tree (only for illustrating the method, we apply this to make an example of high explainability in classification)

In [None]:
# Shallow decision tree; hyper-parameters chosen after various trials
dt = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=3,
    min_samples_split=3,
    min_samples_leaf=8,
).fit(train_set, train_label)

In [None]:
# Render the fitted decision tree as an image.
# Class names are listed in lexicographical order because discretize_data
# assigns the codes 0, 1, 2 following the sorted label values.
classes = ['High_Spend', 'Low_Spend', 'Med_Spend']
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=list(train_set.columns),
    class_names=classes,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Predict on both splits, then print the scores.
# Accuracy is reported for training and test set; precision, recall and F1
# (weighted average over the classes) only for the training set.
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

print('Accuracy training set ', metrics.accuracy_score(train_label, train_pred_dt))
print('Accuracy test set ', metrics.accuracy_score(test_label, test_pred_dt))
print('Precision training set ', metrics.precision_score(train_label, train_pred_dt, average='weighted'))
print('Recall training set ', metrics.recall_score(train_label, train_pred_dt, average='weighted'))
print('F1 score trainig set ', metrics.f1_score(train_label, train_pred_dt, average='weighted'))
# NOTE: precision_recall_fscore_support returns the per-class precision, recall,
# F-score and support, so this line prints all four, not only the support
print('Support training set ', metrics.precision_recall_fscore_support(train_label, train_pred_dt))

In [None]:
# Per-class report of the decision tree on the test set
report_scores(test_label, test_pred_dt)

In [None]:
# Confusion matrix of the decision tree on the test set.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replacement: ConfusionMatrixDisplay.from_estimator), so this
# cell needs an older scikit-learn.
plot_confusion_matrix(dt, test_set, test_label)
plt.show() 

### Random Forest

**First run a randomized hyper-parameter search (`RandomizedSearchCV`) to find a good parameter setting**

In [None]:
# Hyper-parameter space explored for the random forest, then the search itself.
# Lists are sampled uniformly; sp_randint(low, high) draws integers in [low, high).
# The search is randomized: n_iter_search candidates are drawn from the space
# and each one is scored by cross-validated accuracy.
num_estimators = 30
n_iter_search = 50
param_dist = {
    "max_depth": [3, 5, 6, 7, 8, 9, 10, 11, 12, None],
    "max_features": sp_randint(1, 10),
    "min_samples_split": sp_randint(3, 20),
    "min_samples_leaf": sp_randint(5, 20),
    "bootstrap": [True, False],
    "criterion": ["entropy", "gini"],
}
clf = RandomForestClassifier(n_estimators=num_estimators)
grid_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=make_scorer(accuracy_score),
    n_jobs=-1,
)
grid_search.fit(train_set, train_label)

In [None]:
# Report the best candidate found by the search.
# cv_results_ lists the candidates in the order they were sampled, so entry 0
# is just the first one tried; best_params_ / best_score_ / best_index_ refer
# to the top-ranked candidate.
print('Best setting parameters ', grid_search.best_params_)
print('Mean and std of this setting ', grid_search.best_score_,
      grid_search.cv_results_['std_test_score'][grid_search.best_index_])

In [None]:
# Train the final random forest with the best parameters found by the search.
# best_params_ holds exactly the keys of param_dist (criterion, max_features,
# max_depth, min_samples_split, min_samples_leaf, bootstrap); the previous code
# read cv_results_['params'][0], i.e. the first sampled candidate, not the best.
# The number of trees is the same one used during the search (num_estimators).
rf = RandomForestClassifier(n_estimators=num_estimators, **grid_search.best_params_)
rf = rf.fit(train_set, train_label)

In [None]:
# Predict the test set with the random forest and print the per-class scores
test_pred_rf = rf.predict(test_set)
report_scores(test_label, test_pred_rf)

In [None]:
# A forest is an ensemble of decision trees: render the first one (rf[0])
# as an example
dot_data = tree.export_graphviz(rf[0], out_file=None,
                         feature_names=list(train_set.columns),
                         class_names=classes,
                         filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

### Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the training set, predict the test set
# and print the per-class scores
gnb = GaussianNB().fit(train_set, train_label)
test_pred_gnb = gnb.predict(test_set)
report_scores(test_label, test_pred_gnb)

### KNN

In [None]:
# k-nearest neighbours (k=5, ball tree, Minkowski metric): fit on the training
# set, predict the test set and print the per-class scores
knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='minkowski')
knn.fit(train_set, train_label)
test_pred_knn = knn.predict(test_set)
report_scores(test_label, test_pred_knn)

### SVM

In [None]:
# Support vector classifier with sigmoid kernel: fit on the training set,
# predict the test set and print the per-class scores
svm = SVC(kernel='sigmoid', C=0.6, gamma='scale', probability=True).fit(train_set, train_label)
test_pred_svm = svm.predict(test_set)
report_scores(test_label, test_pred_svm)

***

## CLASSIFICATION on OVERSAMPLED TRAINING SET (SMOTE)

**SMOTE** (Synthetic Minority Over-sampling Technique) is an oversampling method used to rebalance the classes. It differs from basic (random) oversampling in how the new data is generated:
* in basic oversampling, existing samples are simply duplicated, so the additional data are exact copies of the original ones
* in SMOTE oversampling, each new sample is built by interpolating between an existing sample of the class and one of its nearest neighbours of the same class, using a random factor in [0,1] (the **perturbation**), hence generating truly new data

In [None]:
# Create the oversampled training set.
# `strat` gives the number of samples wanted for each class after resampling:
# classes 1 and 2 keep their current size, class 0 gets as many samples as class 2
strat = {1: Counter(train_label)[1], 2: Counter(train_label)[2], 0: Counter(train_label)[2] }
smote = SMOTE(
    sampling_strategy=strat,    # explicit target count for each class
    k_neighbors=5
)

train_set_smote, train_label_smote = smote.fit_resample(train_set, train_label)

In [None]:
# Number of samples per class in the oversampled training set
Counter(train_label_smote)

### Random Forest

Try grid search again, on this more **balanced** dataset

In [None]:
# Randomized hyper-parameter search (same space and settings as for the
# original data), this time fitted on the oversampled training set
clf_smote = RandomForestClassifier(n_estimators=num_estimators)
grid_search_smote = RandomizedSearchCV(
    clf_smote,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=make_scorer(accuracy_score),
    n_jobs=-1,
)
grid_search_smote.fit(train_set_smote, train_label_smote)

In [None]:
# Report the best candidate found by the search.
# cv_results_ lists the candidates in the order they were sampled, so entry 0
# is just the first one tried; best_params_ / best_score_ / best_index_ refer
# to the top-ranked candidate.
print('Best setting parameters ', grid_search_smote.best_params_)
print('Mean and std of this setting ', grid_search_smote.best_score_,
      grid_search_smote.cv_results_['std_test_score'][grid_search_smote.best_index_])

In [None]:
# Train the final random forest on the oversampled training set with the best
# parameters found by the search. best_params_ holds exactly the keys of
# param_dist; the previous code read cv_results_['params'][0], i.e. the first
# sampled candidate, not the best one.
rf_smote = RandomForestClassifier(n_estimators=num_estimators, **grid_search_smote.best_params_)
rf_smote = rf_smote.fit(train_set_smote, train_label_smote)

In [None]:
# Predict the test set with the forest trained on oversampled data and print
# the per-class scores (test_pred_rf is overwritten)
test_pred_rf = rf_smote.predict(test_set)
report_scores(test_label, test_pred_rf)

### Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the oversampled training set, predict the test
# set and print the per-class scores
gnb = GaussianNB().fit(train_set_smote, train_label_smote)
test_pred_gnb = gnb.predict(test_set)
report_scores(test_label, test_pred_gnb)

### KNN

In [None]:
# k-nearest neighbours (k=5, ball tree, Minkowski metric): fit on the
# oversampled training set, predict the test set and print the per-class scores
knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='minkowski')
knn.fit(train_set_smote, train_label_smote)
test_pred_knn = knn.predict(test_set)
report_scores(test_label, test_pred_knn)

### SVM

In [None]:
# Support vector classifier with sigmoid kernel: fit on the oversampled
# training set, predict the test set and print the per-class scores
svm = SVC(kernel='sigmoid', C=0.6, gamma='scale', probability=True).fit(train_set_smote, train_label_smote)
test_pred_svm = svm.predict(test_set)
report_scores(test_label, test_pred_svm)

***

## CLASSIFICATION on OVERSAMPLED TRAINING SET and TEST SET (SMOTE)

In [None]:
# Before classifying, oversample the test set too (SMOTE), so that the least
# populated class has more data: otherwise the test set would hold fewer than
# 10 elements of the High_Spend class, giving an unreliable evaluation.
# Target counts: classes 1 and 2 keep their size, class 0 gets as many samples as class 2.
# NOTE: test_set and test_label are overwritten here, so the following cells of
# this section evaluate on the oversampled test set.
strat = {1: Counter(test_label)[1], 2: Counter(test_label)[2], 0: Counter(test_label)[2] }
smote = SMOTE(
    sampling_strategy=strat,    # explicit target count for each class
    k_neighbors=5
)
test_set, test_label = smote.fit_resample(test_set, test_label)

# visualize data distribution in TEST
Counter(test_label)

In [None]:
# Scatter plot of the oversampled test set: 5th feature column on the x axis,
# 1st feature column on the y axis, points colored by class label
plt.scatter(test_set.iloc[:, 4].values, test_set.iloc[:, 0].values, c=test_label.values, s=25, cmap='winter');

In [None]:
# Number of samples per class in the oversampled TRAINING set
Counter(train_label_smote)

### Random Forest

Try grid search again, on this more balanced dataset

In [None]:
# Randomized hyper-parameter search on the oversampled training set
# (same estimator, space and data as the search of the previous section;
# the sampled candidates differ because the search is random)
clf_smote = RandomForestClassifier(n_estimators=num_estimators)
grid_search_smote = RandomizedSearchCV(
    clf_smote,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring=make_scorer(accuracy_score),
    n_jobs=-1,
)
grid_search_smote.fit(train_set_smote, train_label_smote)

In [None]:
# Report the best candidate found by the search.
# cv_results_ lists the candidates in the order they were sampled, so entry 0
# is just the first one tried; best_params_ / best_score_ / best_index_ refer
# to the top-ranked candidate.
print('Best setting parameters ', grid_search_smote.best_params_)
print('Mean and std of this setting ', grid_search_smote.best_score_,
      grid_search_smote.cv_results_['std_test_score'][grid_search_smote.best_index_])

In [None]:
# Train the final random forest on the oversampled training set with the best
# parameters found by the search. best_params_ holds exactly the keys of
# param_dist; the previous code read cv_results_['params'][0], i.e. the first
# sampled candidate, not the best one.
rf_smote = RandomForestClassifier(n_estimators=num_estimators, **grid_search_smote.best_params_)
rf_smote = rf_smote.fit(train_set_smote, train_label_smote)

In [None]:
# Predict the (oversampled) test set with the forest trained on oversampled
# data and print the per-class scores
test_pred_rf = rf_smote.predict(test_set)
report_scores(test_label, test_pred_rf)

### Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the oversampled training set, predict the
# oversampled test set and print the per-class scores
gnb = GaussianNB().fit(train_set_smote, train_label_smote)
test_pred_gnb = gnb.predict(test_set)
report_scores(test_label, test_pred_gnb)

### KNN

In [None]:
# k-nearest neighbours (k=5, ball tree, Minkowski metric): fit on the
# oversampled training set, predict the oversampled test set and print the scores
knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='minkowski')
knn.fit(train_set_smote, train_label_smote)
test_pred_knn = knn.predict(test_set)
report_scores(test_label, test_pred_knn)

### SVM

In [None]:
# Support vector classifier with sigmoid kernel: fit on the oversampled
# training set, predict the oversampled test set and print the scores
svm = SVC(kernel='sigmoid', C=0.6, gamma='scale', probability=True).fit(train_set_smote, train_label_smote)
test_pred_svm = svm.predict(test_set)
report_scores(test_label, test_pred_svm)

***

# WORKING ON FUZZY DATA

In [None]:
# Starting dataset: customers labelled by the fuzzy clustering
# (tab-separated file; the first column is used as the index)
df_f = pd.read_csv('datasets/clustered_fuzzy_dataframe.csv', sep='\t', index_col=0)

In [None]:
# Encode the categorical columns (two features plus the label) as integers in
# '<name>_num' columns, then drop the original text columns together with the
# attributes that led to bad classification in our experiments
df_f = discretize_data(df_f, ['MaxOrderMonth', 'MaxOrderDay', 'Label'])
df_f.drop(
    columns=[
        'MaxOrderMonth', 'MaxOrderDay', 'Label',
        'SETSaleQta', 'SESaleQtaOrder', 'MinPSale', 'MaxPSale',
    ],
    inplace=True,
    errors='ignore',
)
# Work on a copy so that df_f keeps its 'Label_num' column
df_f_class = df_f.copy()

In [None]:
# Separate the target (integer-encoded label) from the features
label = df_f_class.pop('Label_num')
# 70/30 split into training and test set; stratifying on the label keeps the
# class proportions of the full dataset in both parts
train_set_f, test_set_f, train_label_f, test_label_f = train_test_split(
    df_f_class,
    label,
    test_size=0.30,
    stratify=label,
)

***

### Decision Tree (only for illustrating the method, we apply this to make an example of high explainability in classification)

In [None]:
# Shallow decision tree on the fuzzy data; hyper-parameters chosen after various trials
dt_f = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=3,
    min_samples_split=3,
    min_samples_leaf=8,
).fit(train_set_f, train_label_f)

In [None]:
# Render the decision tree fitted on the fuzzy data as an image.
# `classes` is in lexicographical order, matching the codes 0, 1, 2 assigned
# by discretize_data to the sorted label values.
dot_data_f = tree.export_graphviz(
    dt_f,
    out_file=None,
    feature_names=list(train_set_f.columns),
    class_names=classes,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data_f)
Image(graph.create_png())

In [None]:
# Predict training and test set with the decision tree fitted on the fuzzy data
train_pred_dt_f = dt_f.predict(train_set_f)
test_pred_dt_f = dt_f.predict(test_set_f)

In [None]:
# Per-class report of the decision tree on the fuzzy test set
report_scores(test_label_f, test_pred_dt_f)

### Random Forest

**Run a randomized hyper-parameter search (`RandomizedSearchCV`) to find a good parameter setting**

In [None]:
# Randomized hyper-parameter search on the fuzzy training set, reusing the
# parameter space (param_dist) and number of candidates (n_iter_search)
# defined for the original data.
# The number of trees comes from num_estimators instead of a hard-coded 30, so
# the searched model and the final forest trained afterwards always agree.
clf_f = RandomForestClassifier(n_estimators=num_estimators)
grid_search_f = RandomizedSearchCV(clf_f, param_distributions=param_dist,
                            n_iter=n_iter_search,
                            n_jobs=-1,
                            scoring=make_scorer(accuracy_score))
grid_search_f.fit(train_set_f, train_label_f)

In [None]:
# Report the best candidate found by the search.
# cv_results_ lists the candidates in the order they were sampled, so entry 0
# is just the first one tried; best_params_ / best_score_ / best_index_ refer
# to the top-ranked candidate.
print('Best setting parameters ', grid_search_f.best_params_)
print('Mean and std of this setting ', grid_search_f.best_score_,
      grid_search_f.cv_results_['std_test_score'][grid_search_f.best_index_])

In [None]:
# Train the final random forest on the fuzzy training set with the best
# parameters found by the search. best_params_ holds exactly the keys of
# param_dist; the previous code read cv_results_['params'][0], i.e. the first
# sampled candidate, not the best one.
rf_f = RandomForestClassifier(n_estimators=num_estimators, **grid_search_f.best_params_)
rf_f = rf_f.fit(train_set_f, train_label_f)

In [None]:
# Predict the fuzzy test set with the random forest and print the per-class scores
test_pred_rf_f = rf_f.predict(test_set_f)
report_scores(test_label_f, test_pred_rf_f)

In [None]:
# A forest is an ensemble of decision trees: render the first one (rf_f[0])
# as an example
dot_data_f = tree.export_graphviz(rf_f[0], out_file=None,
                         feature_names=list(train_set_f.columns),
                         class_names=classes,
                         filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data_f)
Image(graph.create_png())

### Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the fuzzy training set, predict the fuzzy test
# set and print the per-class scores
gnb = GaussianNB().fit(train_set_f, train_label_f)
test_pred_gnb = gnb.predict(test_set_f)
report_scores(test_label_f, test_pred_gnb)

### KNN

In [None]:
# k-nearest neighbours (k=5, ball tree, Minkowski metric): fit on the fuzzy
# training set, predict the fuzzy test set and print the per-class scores
knn = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='minkowski')
knn.fit(train_set_f, train_label_f)
test_pred_knn = knn.predict(test_set_f)
report_scores(test_label_f, test_pred_knn)

### SVM

In [None]:
# Support vector classifier with sigmoid kernel: fit on the fuzzy training
# set, predict the fuzzy test set and print the per-class scores
svm = SVC(kernel='sigmoid', C=0.6, gamma='scale', probability=True).fit(train_set_f, train_label_f)
test_pred_svm_f = svm.predict(test_set_f)
report_scores(test_label_f, test_pred_svm_f)

***

# Classification with Neural Network

### Preliminary functions
The following functions are used in several of the neural-network sections below,
so they are defined here, at the beginning of this part of the notebook.

Function to normalize the dataset:

In [None]:
def normalize_dataset(df):
    """Rescale every column of ``df`` to the [0, 1] range (min-max scaling).

    :param df: DataFrame with numeric columns only
    :return: a new DataFrame holding the scaled values, with the same column
        names and the same index as ``df``; ``df`` itself is not modified
    """
    scaled_values = MinMaxScaler().fit_transform(df.values)
    # Keep the original index: rebuilding the frame without it would silently
    # replace the row identifiers with 0..n-1, so rows could no longer be
    # matched by index with the labels taken from the same dataframe.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

Function to print the dataset's composition:

In [None]:
def print_dataset_composition(train_set, train_labels, test_set, test_labels):
    """Print how many samples of each class the training and test sets hold.

    Labels are expected as integer codes: 0 = High_Spend, 1 = Low_Spend,
    2 = Med_Spend. They must support boolean-mask indexing
    (e.g. pandas Series or numpy arrays).

    :param train_set: training samples (only their number is used)
    :param train_labels: labels of the training samples
    :param test_set: test samples (only their number is used)
    :param test_labels: labels of the test samples
    """
    assert(len(train_set) == len(train_labels))
    assert(len(test_set) == len(test_labels))
    class_names = ("High_Spend", "Low_Spend", "Med_Spend")
    # the test section is separated from the training one by a blank line
    for separator, split_name, labels in (("", "training", train_labels),
                                          ("\n", "test", test_labels)):
        print(f"{separator}{len(labels)} {split_name} samples:")
        for code, class_name in enumerate(class_names):
            print(f"\t- {len(labels[labels == code])} samples for the class {class_name}")

Function to create the NN model:

In [None]:
# Creates and returns a Keras NN model
def create_nn(size):
    """
    Build the (uncompiled) fully connected classifier used in this notebook.

    Characteristics:
        * 4 fully connected layers
            * the first 3 have 32 units
            * the last one has as many units as the number of classes, thus 3

        * activation function:
            * ReLU for the first 3 layers
            * Softmax for the output layer

        * Dropout of 0.2 is applied after each of the first 3 layers
          (not after the output layer)

    :param size: number of input features (columns of the training set)
    :return: the Keras Sequential model, not compiled yet
    """
    model = Sequential()
    # Each sample is a flat vector of `size` features. The former
    # input_shape=(1, size) declared a 3-D input (batch, 1, size), which does
    # not match the 2-D (batch, size) tables this notebook feeds to the model.
    model.add(Dense(32, activation='relu', input_shape=(size,)))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(3, activation='softmax'))
    return model

Function to compile and fit the NN model:

In [None]:
def compile_fit(model, training_set, training_labels, epochs):
    """
    Compile the model and fit it on the given data.

    The model is compiled with categorical cross-entropy loss, the Adam
    optimizer and accuracy as metric. It is then trained with batches of 32
    samples, with ``validation_split=0.2`` (a fifth of the data given here is
    used for validation).

    :param model: Keras NN model to train
    :param training_set: input samples
    :param training_labels: targets of the samples (the notebook passes them
        one-hot encoded); the first one is printed before training
    :param epochs: number of training epochs
    :return: history of training to plot the metrics
    """
    compile_options = {
        'loss': 'categorical_crossentropy',
        'optimizer': 'adam',
        'metrics': ['accuracy'],
    }
    fit_options = {
        'epochs': epochs,
        'batch_size': 32,
        'validation_split': 0.2,
        'verbose': 1,
    }
    model.compile(**compile_options)
    # echo the first target as a quick check of the label format
    print(training_labels[0])
    return model.fit(training_set, training_labels, **fit_options)

Function to plot the training results:

In [None]:
def plot_train_results(train_history):
    """Plot training/validation accuracy and loss against the epoch number.

    :param train_history: object returned by the model's ``fit``; its
        ``history`` dict must hold the keys 'accuracy', 'val_accuracy',
        'loss' and 'val_loss'
    """
    # (history key, line format, legend label), in drawing order
    curves = (
        ('accuracy', 'b', 'Training Accuracy'),
        ('val_accuracy', 'bo', 'Validation Accuracy'),
        ('loss', 'r', 'Training Loss'),
        ('val_loss', 'ro', 'Validation Loss'),
    )
    # look up every series first, so that a missing key fails before drawing
    series = [(train_history.history[key], fmt, name) for key, fmt, name in curves]
    # epochs are numbered from 1; their count is the length of the accuracy series
    epochs = range(1, len(series[0][0]) + 1)
    for values, fmt, name in series:
        plt.plot(epochs, values, fmt, label=name)
    plt.title('Training and validation Acc')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy & Loss')
    plt.legend()
    plt.grid()
    plt.show()

## Base dataframe
Labels come from _K-means_ clustering algorithm

In [None]:
# Drop categorical columns and keep only the corresponding numerical ones.
# errors='ignore' makes this a no-op for columns that are already gone
# (both were dropped from df by the earlier cleaning cell, if it was run).
df.drop(columns=['MaxOrderMonth','Label'], inplace=True, errors='ignore')
df.info()

In [None]:
# Remove the integer-encoded labels from the dataframe and store them in a
# variable (pandas.Series); df now holds only the features
labels = df.pop('Label_num')

In [None]:
# Box plot of every feature before normalization (x labels rotated by 90 degrees)
df.boxplot(rot=90)

Normalize the dataset

In [None]:
# Min-max scale every feature, then redraw the box plot to compare the ranges
df = normalize_dataset(df)
df.boxplot(rot=90)

Divide the dataframe to create training and testing sets

In [None]:
# 70/30 split, stratified on the labels so that both parts keep the class
# proportions; then print the per-class composition of the two sets
train_set, test_set, train_labels, test_labels = train_test_split(
    df,
    labels,
    test_size=0.30,
    stratify=labels,
)
print_dataset_composition(train_set, train_labels, test_set, test_labels)

In [None]:
# Encode labels with one-hot vectors of length 3 (one position per class),
# the format used with the categorical cross-entropy loss
train_labels = to_categorical(train_labels, 3)
test_labels = to_categorical(test_labels, 3)

Create the Neural Network, compile it and train it

In [None]:
# Build a network sized on the number of feature columns,
# then compile it and train it for 20 epochs
nn = create_nn(len(df.columns))
history = compile_fit(nn, train_set, train_labels, epochs=20)

Plot the training results:

In [None]:
# Accuracy and loss curves (training and validation) of the run above
plot_train_results(history)

Test the model on the test set

In [None]:
# Predict the class of each test sample: `predict` returns one probability per
# class (softmax output) and argmax picks the most likely one. This replaces
# Sequential.predict_classes, which was deprecated and then removed from Keras
# (TensorFlow 2.6).
test_predictions = np.argmax(nn.predict(test_set), axis=-1)
# One-hot encode the predictions so that they have the same format as test_labels
test_predictions = to_categorical(test_predictions, 3)
report_scores(test_labels, test_predictions)

## Base dataframe with SMOTE oversampling
Labels come from _K-Means_ clustering algorithm

### Normalize and SMOTE

In [None]:
# Normalize the dataset (min-max scaling of every column)
df = normalize_dataset(df)

# Pick the validation set from the complete df (30%, stratified).
# df and labels are overwritten with the remaining 70%, which is split again below.
df, val_set, labels, val_labels = train_test_split(df, labels, stratify=labels, test_size=0.30)

# Divide the remaining dataframe to create training and testing sets
train_set, test_set, train_labels, test_labels = train_test_split(df, labels, stratify=labels, test_size=0.30)

# SMOTE --> balance the number of entries of each class
# (applied to training and test set; the validation set is left untouched)
smote = SMOTE(
    sampling_strategy='not majority',    # resample all classes but the majority one
    k_neighbors=4
)
train_set, train_labels = smote.fit_resample(train_set, train_labels)
test_set, test_labels = smote.fit_resample(test_set, test_labels)

assert(len(train_set) == len(train_labels))
assert(len(val_set) == len(val_labels))
assert(len(test_set) == len(test_labels))

# Per-class composition of each split. The size printed in each header is the
# one of the split itself (the validation header used to print the test size).
for split_name, split_labels in (('training', train_labels),
                                 ('validation', val_labels),
                                 ('test', test_labels)):
    print(f"\n{len(split_labels)} {split_name} samples:")
    for class_code, class_name in enumerate(('High_Spend', 'Low_Spend', 'Med_Spend')):
        print(f"\t- {len(split_labels[split_labels == class_code])} samples for the class {class_name}")

# Encode labels with one-hot
train_labels = to_categorical(train_labels, 3)
val_labels = to_categorical(val_labels, 3)
test_labels = to_categorical(test_labels, 3)

Create the Neural Network, compile it and train it

In [None]:
# Smaller network for the oversampled data: two hidden layers of 16 ReLU units,
# each followed by a dropout of 0.5, and a 3-unit softmax output
nn = Sequential()
nn.add(Dense(16, activation='relu'))
nn.add(Dropout(0.5))
nn.add(Dense(16, activation='relu'))
nn.add(Dropout(0.5))
nn.add(Dense(3, activation='softmax'))

nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 27 epochs, validating on the explicit validation set after each epoch
history = nn.fit(
    train_set,
    train_labels,
    validation_data=(val_set, val_labels),
    epochs=27,
    batch_size=32,
    verbose=1,
)

Plot the training results:

In [None]:
# Accuracy and loss curves (training and validation) of the run above
plot_train_results(history)

Test the model on the test set

In [None]:
# Predict the class of each test sample: `predict` returns one probability per
# class (softmax output) and argmax picks the most likely one. This replaces
# Sequential.predict_classes, which was deprecated and then removed from Keras
# (TensorFlow 2.6).
test_predictions = np.argmax(nn.predict(test_set), axis=-1)
# One-hot encode the predictions so that they have the same format as test_labels
test_predictions = to_categorical(test_predictions, 3)
report_scores(test_labels, test_predictions)


## Dataframe coming from Fuzzy C-Means
Labels come from _Fuzzy C-Means_ clustering algorithm

In [None]:
# Take the integer-encoded labels out of the fuzzy dataframe (pandas.Series);
# df_f now holds only the features
labels = df_f.pop('Label_num')

# Min-max scale every feature
df_f = normalize_dataset(df_f)

# 70/30 split, stratified on the labels, then print the composition of both sets
train_set, test_set, train_labels, test_labels = train_test_split(
    df_f,
    labels,
    test_size=0.30,
    stratify=labels,
)
print_dataset_composition(train_set, train_labels, test_set, test_labels)

# One-hot encode the labels (vectors of length 3, one position per class)
train_labels = to_categorical(train_labels, 3)
test_labels = to_categorical(test_labels, 3)

Create the Neural Network, compile it and train it

In [None]:
# Build a network sized on the number of feature columns of the fuzzy data,
# then compile it and train it for 20 epochs
nn = create_nn(len(df_f.columns))
history = compile_fit(nn, train_set, train_labels, epochs=20)

Plot the training results:

In [None]:
# Accuracy and loss curves (training and validation) of the run above
plot_train_results(history)

Test the model on the test set

In [None]:
# Predict the class of each test sample: `predict` returns one probability per
# class (softmax output) and argmax picks the most likely one. This replaces
# Sequential.predict_classes, which was deprecated and then removed from Keras
# (TensorFlow 2.6).
test_predictions = np.argmax(nn.predict(test_set), axis=-1)
# One-hot encode the predictions so that they have the same format as test_labels
test_predictions = to_categorical(test_predictions, 3)
report_scores(test_labels, test_predictions)

# Alternative Features

For this analysis the dataset *customer_dataframe_big.csv* is used, since it contains all the features computed for the customers.
It is then joined with the dataset created by Fuzzy K-Means, so that each customer gets the corresponding label.

In [None]:
# Fuzzy-clustering result (source of the labels) and the full customer feature table
df = pd.read_csv('datasets/clustered_fuzzy_dataframe.csv', sep='\t', index_col=0)
df_customer = pd.read_csv('datasets/customer_dataframe_big.csv', sep='\t', index_col=0)

# Index the customers by CustomerID: the assignment below aligns the labels on
# the index (presumably df is indexed by CustomerID as well; TODO confirm)
df_customer.set_index("CustomerID", inplace = True)
df_customer['Label'] = df['Label']

# Size of different classes in the dataset
print("-------------------------")
for i in ['Low_Spend', 'Med_Spend', 'High_Spend']:
    print(i, len(df_customer[df_customer['Label'] == i]))

The target of this analysis is to use all the features that are not exploited by the clustering to assign the customers to a class.

In [None]:
# Encode 'MaxOrderMonth' and 'Label' as integers, then drop all the features
# that are not time dependent and not useful to generalize the behaviour of a
# customer (together with the original text columns)
df_customer = discretize_data(df_customer, ['MaxOrderMonth', 'Label'])
df_aux = df_customer.drop(
    columns=[
        'TProd', 'MaxPO', 'MinPO', 'MeanProdOrder', 'TSaleWRet',
        'MinPSale', 'MaxPSale', 'MeanSaleOrder', 'MeanPSale',
        'MaxOrderMonth', 'MaxOrderDay', 'Label', 'TRProd',
        'SETSaleQta', 'SESaleQtaOrder', 'SEShoppingDays',
        'DProd', 'TSale', 'TOrder',
    ],
    errors='ignore',
)
df_class = df_aux.copy()
df_class.tail()

# Decision Tree

In [None]:
# Separate the target, then do a 65/35 split stratified on the label
label = df_class.pop('Label_num')
train_set, test_set, train_label, test_label = train_test_split(
    df_class,
    label,
    test_size=0.35,
    stratify=label,
)
# Number of High_Spend samples (code 0) in each part
print(
    "Class HighSpend\n"
    f"Training: {len(train_label[train_label == 0])}\n"
    f"Test: {len(test_label[test_label == 0])}"
)

In [None]:
# Shallow decision tree on the alternative features, rendered as an image
dt = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=3,
    min_samples_split=17,
    min_samples_leaf=20,
).fit(train_set, train_label)
# class names in lexicographical order, matching the integer codes 0, 1, 2
classes = ['High_Spend', 'Low_Spend', 'Med_Spend']
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=list(train_set.columns),
    class_names=classes,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Predict both splits with the decision tree; only the test-set predictions
# are scored in the report
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)
report_scores(test_label, test_pred_dt)

***