# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 3.1) Predictive Analysis: classification
### *Antonio Strippoli, Valerio Mariani*

In [1]:
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('mode.chained_assignment', None)

In [2]:
def plot(ax, folder="predictive", filename="", figsize=(6.4, 4.8)):
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        if not os.path.exists(path):
            os.mkdir(path)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

In [3]:
# Load dataset
cldf = pd.read_csv("customer_classification.csv", index_col=0)
cldf

Unnamed: 0_level_0,TotItems,MaxSale,MeanSale,MeanItemSale,E-Sale,Cat0,Cat1,Cat2,Labels
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12347,319,711,711,3,4.76,186,133,0,1
12348,1254,892,892,7,1.37,1248,0,6,1
12352,26,144,144,12,1.56,12,9,5,0
12356,1216,2271,2271,4,4.39,956,198,62,1
12359,356,1109,1109,5,5.13,160,165,31,2
...,...,...,...,...,...,...,...,...,...
18263,328,307,307,1,3.59,316,12,0,0
18272,276,340,340,1,4.24,232,44,0,1
18273,20,51,51,2,-0.00,0,20,0,0
18283,67,106,106,2,5.30,46,21,0,0


### Classification - Decision Tree

In [4]:
# to classify, we first need to split the dataset into train and test dataset.
# we can do so using train_test_split, in this case we select a stratified split
from sklearn.model_selection import train_test_split

label = cldf.pop('Labels')
train_set, test_set, train_label, test_label = train_test_split(cldf, label, stratify=label, test_size=0.30)

In [5]:
#define a decision tree and fit it
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='gini', splitter='best', 
                                  max_depth=10, 
                                  min_samples_split=3, min_samples_leaf=4)
dt = dt.fit(train_set, train_label)

In [7]:
#visualize the actual decision tree obtained 
import pydotplus 
from IPython.display import Image  
dot_data = tree.export_graphviz(dt, out_file=None, 
                         feature_names=list(train_set.columns),  
                         class_names=['<=50', '>50'],  
                         filled=True, rounded=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

IndexError: list index out of range

In [None]:
#predict using the decision tree
#the predict function returns the actual predicted labels: we need them for the evaluation phase
train_pred_dt = dt.predict(train_set)
test_pred_dt = dt.predict(test_set)

In [None]:
#evaulate the accuracy on the train set and the test set
#metrics also contains precision, recall, f1 and the support
from sklearn import metrics
print('Accuracy train set ', metrics.accuracy_score(train_label, train_pred_dt))
print('Accuracy test set ', metrics.accuracy_score(test_label, test_pred_dt))
print('Precision train set ', metrics.precision_score(train_label, train_pred_dt, average='weighted'))
print('Recall train set ', metrics.recall_score(train_label, train_pred_dt, average='weighted'))
print('F1 score train set ', metrics.f1_score(train_label, train_pred_dt, average='weighted'))
print('Support train set ', metrics.precision_recall_fscore_support(train_label, train_pred_dt))

In [None]:
#metrics computed on the test set
from sklearn.metrics import classification_report
def report_scores(test_label, test_pred):
    print(classification_report(test_label, 
                            test_pred, 
                            target_names=['<=50', '>50']))

In [None]:
report_scores(test_label, test_pred_dt)

In [None]:
#cross validation
#the score array for test/train scores on each cv split
#the time for fitting/scoring the estimator on the train set for each cv split
from sklearn.model_selection import cross_validate
import statistics 
scores = cross_validate(dt, train_set, train_label, cv=3, return_train_score= True)
print('Fit time ', statistics.mean(scores['fit_time']))
print('Score time ', statistics.mean(scores['score_time']))
print('Test score ', statistics.mean(scores['test_score']))
print('Train score ', statistics.mean(scores['train_score']))

In [None]:
#compute confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_label, test_pred_dt)
cm

In [None]:
#it is possible to plot the confusion matrix 
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
plot_confusion_matrix(dt, test_set, test_label)
plt.show() 

In [None]:
from sklearn.datasets import make_blobs
plt.scatter(test_set.iloc[:, 4].values, test_set.iloc[:, 2].values, c=test_label.values, s=25, cmap='RdBu');

In [None]:
plt.scatter(test_set.iloc[:, 4].values, test_set.iloc[:, 2].values, c=test_pred_dt, s=25, cmap='RdBu');