# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 3.2) Predictive Analysis: classification
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment check for the whole notebook
# (None means "take no action" instead of warning or raising).
pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, folder="predictive", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, show and close the current matplotlib figure.

    Args:
        ax: Value returned by the plotting call. It is not used here: the
            function always operates on the current figure (``plt.gcf()``).
        folder: Sub-directory of ``../report/imgs`` where the image is saved.
        filename: Name of the image file. When empty, nothing is saved.
        figsize: ``(width, height)`` of the figure, in inches.
    """
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        # makedirs creates missing parent directories too and does not fail
        # when the directory already exists (no check-then-create race).
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

In [None]:
# Evaluate the accuracy on the training set and on the test set.
# sklearn.metrics also provides precision, recall, F1 and the per-class support.
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix

def compute_scores(estimator, train_set, test_set, train_label, train_pred, test_label, test_pred):
  """Print evaluation metrics for the training and test predictions.

  For each split the accuracy and the weighted precision, recall and F1 are
  printed, followed by the raw output of ``precision_recall_fscore_support``.
  A classification report is then printed for the test split and, when an
  estimator is given, its confusion matrix on the test split is plotted.

  Args:
    estimator: Fitted classifier used to draw the confusion matrix, or
      ``None`` to skip the plot.
    train_set: Training features (not used by this function).
    test_set: Test features, used only for the confusion matrix.
    train_label: True labels of the training split.
    train_pred: Predicted labels of the training split.
    test_label: True labels of the test split.
    test_pred: Predicted labels of the test split.
  """
  splits = (
    ("=== TRAINING SET ===", train_label, train_pred),
    ("\n=== TEST SET ===", test_label, test_pred),
  )
  for header, label, pred in splits:
    print(header)
    print('Accuracy:', metrics.accuracy_score(label, pred))
    print('Precision:', metrics.precision_score(label, pred, average='weighted'))
    print('Recall:', metrics.recall_score(label, pred, average='weighted'))
    print('F1 Score:', metrics.f1_score(label, pred, average='weighted'))
    # Prints the full (precision, recall, fscore, support) per-class tuple.
    print('Support:', metrics.precision_recall_fscore_support(label, pred))

  print("\n=== CLASSIFICATION REPORT ===")
  print(metrics.classification_report(
      test_label,
      test_pred,
      target_names=['low-spending', 'medium-spending', 'high-spending']
    )
  )

  # Compare against None explicitly: ensemble estimators define __len__, so
  # their truth value depends on fitted state rather than on being present.
  if estimator is not None:
    print("\n=== CONFUSION MATRIX ===")
    # plot_confusion_matrix was removed from recent scikit-learn releases;
    # prefer its replacement when available, otherwise use the old function.
    display_cls = getattr(metrics, "ConfusionMatrixDisplay", None)
    if hasattr(display_cls, "from_estimator"):
      display_cls.from_estimator(estimator, test_set, test_label)
    else:
      metrics.plot_confusion_matrix(estimator, test_set, test_label)
    plt.show()

In [None]:
# Split the dataset into training set and test set
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# Load dataset and set hyper-parameter
cldf = pd.read_csv("customer_classification.csv", index_col=0)
oversampling = True

# Extract labels
label = cldf.pop('Labels')

# Split BEFORE scaling (and with a fixed seed, so results are reproducible):
# the scaler must not see the test rows, otherwise their min/max leak into
# the features the models are trained on.
raw_train, raw_test, train_label, test_label = train_test_split(
  cldf, label, stratify=label, test_size=.3, random_state=22
)

# Normalize values: fit on the training rows only, apply to both splits
scaler = MinMaxScaler().fit(raw_train.values)
train_set = pd.DataFrame(scaler.transform(raw_train.values), columns=cldf.columns, index=raw_train.index)
test_set = pd.DataFrame(scaler.transform(raw_test.values), columns=cldf.columns, index=raw_test.index)

# Keep X and cldf available (scaled full dataset) for the cells that read them
X = scaler.transform(cldf.values)
cldf = pd.DataFrame(X, columns=cldf.columns)

# Perform oversampling? (training split only)
if oversampling:
  # fit_resample is the current name of the former fit_sample method
  train_set, train_label = SMOTE(random_state=22).fit_resample(train_set, train_label)
  train_set = pd.DataFrame(train_set, columns=cldf.columns)
  # Keep the labels 1-D, like in the non-oversampled case
  train_label = pd.Series(np.ravel(train_label), name='Labels')

### Decision Tree

In [None]:
# We define a Decision Tree based on the result of a Grid Search
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Model validation: exhaustive grid search (3-fold CV) over the tree's
# hyper-parameters
params = {
    'criterion': ['gini','entropy'],
    'max_depth': list(range(10,31)),
    'max_leaf_nodes': list(range(2, 51)),
    # An integer min_samples_split must be at least 2; the value 1 is
    # rejected by scikit-learn and only produced failed fits.
    'min_samples_split': list(range(2, 5)),
    'splitter': ['best', 'random'],
    'class_weight': ['balanced', None, {0: 0.1, 1: 0.3, 2: 0.6}]
}
dt = DecisionTreeClassifier(random_state=22)

grid_search_cv = GridSearchCV(dt, params, verbose=1, cv=3, n_jobs=-1)
grid_search_cv.fit(train_set, train_label)

# Keep the best model found and display it
dt = grid_search_cv.best_estimator_
dt

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Model assessment: rebuild the tree with the hyper-parameters selected by the
# grid search. Only non-default values are passed: the pasted repr also
# contained `presort` and `min_impurity_split`, which no longer exist in
# recent scikit-learn releases and make the constructor raise a TypeError.
if oversampling:
  dt_params = {'criterion': 'gini', 'class_weight': 'balanced'}
else:
  dt_params = {'criterion': 'entropy', 'class_weight': None}

dt = DecisionTreeClassifier(max_depth=10, max_leaf_nodes=48, random_state=22, **dt_params)
dt.fit(train_set, train_label)

In [None]:
# Visualize the actual decision tree obtained 
import pydotplus 
from sklearn import tree
from IPython.display import Image

# Export the fitted tree as Graphviz DOT source, then render it inline as PNG
tree_export_options = {
  'out_file': None,  # return the DOT source instead of writing a file
  'feature_names': list(train_set.columns),
  'class_names': ['low-spending', 'medium-spending', 'high-spending'],
  'filled': True,
  'rounded': True,
}
dot_data = tree.export_graphviz(dt, **tree_export_options)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Label both splits with the fitted decision tree
train_pred, test_pred = (dt.predict(split) for split in (train_set, test_set))

In [None]:
# Print the train/test metrics of the decision tree
compute_scores(dt, train_set, test_set, train_label, train_pred, test_label, test_pred)

### RANDOM FOREST

In [None]:
# We define a Random Forest based on the result of a Grid Search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model validation: exhaustive grid search (3-fold CV) over the forest's
# hyper-parameters
param_dist = {
  "max_depth": list(range(2, 20)),
  # An integer min_samples_split must be at least 2; the value 1 is rejected
  # by scikit-learn and only produced failed fits.
  "min_samples_split": list(range(2, 10)),
  "min_samples_leaf": list(range(1, 10)),
  "bootstrap": [True, False],
  "criterion": ["entropy", "gini"],
  "class_weight": ['balanced', None, {0: 0.1, 1: 0.3, 2: 0.6}]
}
# Fixed seed so the search gives the same result on every run
clf = RandomForestClassifier(n_estimators=30, random_state=22)

grid_search = GridSearchCV(clf, param_dist, verbose=1, cv=3, n_jobs=-1)
grid_search.fit(train_set, train_label)

# NOTE: the name `dt` is reused for the forest; the following cells read it.
dt = grid_search.best_estimator_
dt

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Model assessment: rebuild the forest with the hyper-parameters selected by
# the grid search. Only non-default values are passed: the pasted repr also
# contained `min_impurity_split` and `max_features='auto'`, which recent
# scikit-learn releases no longer accept ('auto' was the classifier default,
# so omitting it keeps the same behaviour).
if oversampling:
  rf_params = {'criterion': 'entropy', 'max_depth': 19, 'min_samples_leaf': 1}
else:
  rf_params = {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 3}

# NOTE: the name `dt` is reused for the forest; the following cells read it.
dt = RandomForestClassifier(
  n_estimators=30,
  bootstrap=False,
  class_weight='balanced',
  min_samples_split=3,
  random_state=22,  # fixed seed so the reported scores are reproducible
  **rf_params
)
dt.fit(train_set, train_label)

In [None]:
# Label both splits with the fitted random forest (stored in `dt`)
train_pred, test_pred = (dt.predict(split) for split in (train_set, test_set))

In [None]:
# Print the train/test metrics of the random forest (`dt` holds the forest here)
compute_scores(dt, train_set, test_set, train_label, train_pred, test_label, test_pred)

### Naive Bayes

In [None]:
#import, define and fit the model
from sklearn.naive_bayes import GaussianNB

# Fixed class priors, one per class, passed to the model instead of letting it
# estimate them from the training labels
class_priors = [0.47017189, 0.38574317, 0.14408493]
gnb = GaussianNB(priors=class_priors)
gnb.fit(train_set, train_label)

In [None]:
# Label both splits with the fitted Gaussian Naive Bayes model
train_pred, test_pred = (gnb.predict(split) for split in (train_set, test_set))

In [None]:
# Print the train/test metrics of the Naive Bayes model
compute_scores(gnb, train_set, test_set, train_label, train_pred, test_label, test_pred)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Model validation: exhaustive grid search (3-fold CV) over the KNN settings
neighbor_counts = list(range(1, 16)) + [20, 30]
params = {
    'n_neighbors': neighbor_counts,
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'metric': ['minkowski'],
    # Minkowski exponent: 1 = manhattan, 2 = euclidean
    'p': [1, 2],
}
knn = KNeighborsClassifier()

grid_search_cv = GridSearchCV(estimator=knn, param_grid=params, cv=3, n_jobs=-1, verbose=1)
grid_search_cv.fit(train_set, train_label)

# Keep the best model found and display it
knn = grid_search_cv.best_estimator_
knn

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Model assessment: the two settings differ only in the number of neighbours
best_k = 9 if oversampling else 12
knn = KNeighborsClassifier(
  n_neighbors=best_k,
  weights='distance',
  algorithm='ball_tree',
  leaf_size=30,
  p=1,
  metric='minkowski',
  metric_params=None,
  n_jobs=None
)
knn.fit(train_set, train_label)

In [None]:
# Label both splits with the fitted k-nearest-neighbours classifier
train_pred, test_pred = (knn.predict(split) for split in (train_set, test_set))

In [None]:
# Print the train/test metrics of the KNN classifier
compute_scores(knn, train_set, test_set, train_label, train_pred, test_label, test_pred)

### SVC

In [None]:
from sklearn.svm import SVC

# Model validation: exhaustive grid search (3-fold CV) over the SVC settings
params = {
    # C must be strictly positive: the 0 that used to be appended to this
    # grid is rejected by SVC and only produced failed fits.
    'C': np.arange(0.01,0.9,0.1),
    'kernel': [ 'rbf', 'sigmoid'],
    'gamma': ['scale','auto'],
    'coef0': np.arange(0.01,0.9,0.1),
    'shrinking': [True,False],
}
svm = SVC(probability=True, random_state=22)

grid_search_cv = GridSearchCV(svm, params, verbose=1, cv=3, n_jobs=-1)
grid_search_cv.fit(train_set, train_label)

# Keep the best model found and display it
svm = grid_search_cv.best_estimator_
svm

In [None]:
from sklearn.svm import SVC

# Model assessment: the same hyper-parameters are used with and without
# oversampling, so a single construction covers both settings.
svm_params = dict(
  C=0.81, break_ties=False, cache_size=200, class_weight=None, coef0=0.01,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=True, random_state=22, shrinking=True, tol=0.001,
  verbose=False
)
svm = SVC(**svm_params)
svm.fit(train_set, train_label)

In [None]:
# Label both splits with the fitted support vector classifier
train_pred, test_pred = (svm.predict(split) for split in (train_set, test_set))

In [None]:
# Print the train/test metrics of the support vector classifier
compute_scores(svm, train_set, test_set, train_label, train_pred, test_label, test_pred)

In [None]:
# Install scikit-plot (imported as `scikitplot` by the ROC-curve cell below)
!pip install scikit-plot

In [None]:
# Roc curve
import scikitplot as skplt

# Class-membership probabilities of the SVC on the test split
test_pred_proba = svm.predict_proba(test_set)
# plot_roc is scikit-plot's replacement for the deprecated plot_roc_curve;
# by default it draws the per-class, micro- and macro-averaged curves.
plot(skplt.metrics.plot_roc(test_label.values, test_pred_proba))

### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Model creation
# The network is fed 2-D (samples, features) data, so the declared input shape
# is (features,) rather than (1, features).
n_features = train_set.shape[1]
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(512, activation='sigmoid', input_shape=(n_features,)),
  tf.keras.layers.Dense(256, activation='sigmoid'),
  # Softmax: the three classes are mutually exclusive and the loss is
  # categorical cross-entropy, which expects a probability distribution.
  tf.keras.layers.Dense(3, activation='softmax')
])
model.compile(optimizer='adamax',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Keras takes the validation split from the LAST rows, before any shuffling.
# Shuffle once with a fixed seed so the held-out 20% is a random sample of the
# training data (after oversampling the tail would otherwise be synthetic rows).
nn_features = np.asarray(train_set, dtype='float32')
nn_targets = to_categorical(np.ravel(train_label).astype('float32'))
shuffled = np.random.RandomState(22).permutation(len(nn_features))

# Train!
history = model.fit(nn_features[shuffled], nn_targets[shuffled],
                    epochs=200,
                    batch_size=32,
                    validation_split=0.2, verbose=0)

In [None]:
from sklearn.metrics import classification_report

# Plot the training and validation curves, first for accuracy then for loss
for metric_name in ['accuracy', 'loss']:
  train_curve = history.history[metric_name]
  val_curve = history.history[f'val_{metric_name}']
  epochs = range(1, len(train_curve) + 1)
  plt.plot(epochs, train_curve, 'b', label=f'Training {metric_name}')
  plt.plot(epochs, val_curve, 'r', label=f'Validation {metric_name}')
  plt.title(f'Training and validation {metric_name}')
  plt.xlabel('Epochs')
  plt.ylabel(metric_name)
  plt.legend()
  plt.show()

# Compute scores
# Sequential.predict_classes no longer exists in recent TensorFlow releases;
# for a multi-class output it was the argmax over the predicted scores.
train_pred = np.argmax(model.predict(np.asarray(train_set, dtype='float32')), axis=-1)
test_pred = np.argmax(model.predict(np.asarray(test_set, dtype='float32')), axis=-1)

# No estimator is passed (None), so the confusion-matrix plot is skipped.
# Labels are flattened so they are 1-D whatever container holds them.
compute_scores(None, train_set, test_set, np.ravel(train_label), train_pred, np.ravel(test_label), test_pred)