# Exercises in Classification (Solutions)

In the code you will see:

> &nbsp;  
> \# YOUR CODE HERE  
> ...
> <br><br>

Insert your code to complete the exercise.

<br><br>

### Exercise: Performance Assessment for a Decision Tree based Model

Examine the quality of the decision tree example from last lecture. Compare the quality measures that are calculated on the basis of the test data with those created by cross-validation.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from sklearn.inspection import DecisionBoundaryDisplay


# Load input data: one sample per row, values separated by commas
input_file = './data/data_decision_trees.txt'
data = np.loadtxt(input_file, delimiter=',')

# Split the array column-wise: every column except the last is a feature,
# the last column is the class label (the plotting cell compares y with 0 and 1).
X, y = data[:, :-1], data[:, -1]


In [None]:

# Partition the samples by label so that each class can be drawn with its own marker
class_0, class_1 = (np.array(X[y == label]) for label in (0, 1))

# Per-class marker settings, listed in drawing order
marker_styles = (
    (class_0, dict(facecolors='black', marker='x')),
    (class_1, dict(facecolors='white', edgecolors='black', marker='o')),
)

# Scatter plot of the raw input data (first feature on x, second feature on y)
plt.figure()
for points, style in marker_styles:
    plt.scatter(points[:, 0], points[:, 1], s=75, linewidth=1, **style)
plt.title('Input data')
plt.show()

In [None]:
# Split data into training and testing datasets (75% vs. 25%).
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=23
)

# Decision Trees classifier, limited to a depth of 10
params = {'random_state': 0, 'max_depth': 10}
dt_classifier = DecisionTreeClassifier(**params)

# Fit on the training partition only, so the test partition stays unseen
dt_classifier.fit(X_train, y_train)

In [None]:
# Shade the regions predicted by the fitted tree, then overlay all samples coloured by label
boundary_display = DecisionBoundaryDisplay.from_estimator(
    dt_classifier, X, response_method="predict", alpha=0.4
)
boundary_display.ax_.scatter(X[:, 0], X[:, 1], c=y, s=40, edgecolor="k")
boundary_display.ax_.set_title("Decision boundary for Decision Tree Classifier")
plt.show()

In [None]:
# Predict the labels of the held-out test samples
y_pred = dt_classifier.predict(X_test)

# Generate a confusion matrix from the true and the predicted test labels
cm = confusion_matrix(y_test, y_pred)


In [None]:

# Visualize the confusion matrix. fmt="d" prints the integer counts verbatim;
# seaborn's default format (".2g") would show counts of 100 or more in scientific notation.
ax = plt.axes()
sns.heatmap(cm, annot=True, fmt="d", annot_kws={"size": 30}, cmap="Greens", ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

# Accuracy on the data the tree was fitted on versus the held-out test data
train_accuracy = accuracy_score(y_train, dt_classifier.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:')
print(f' Train: {train_accuracy*100:.2f} %')
print(f' Test:  {test_accuracy*100:.2f} %')


In [None]:
# Calculate accuracy through cross-validation
from sklearn import model_selection

# Number of folds; passed as the cv parameter it defines the cross-validation strategy
num_folds = 7

# Scoring functions of sklearn (original data: X and y!)
# The scores are fractions in [0, 1]; the means are scaled by 100 wherever a "%" sign is printed.
accuracy_values = model_selection.cross_val_score(dt_classifier, X, y, scoring='accuracy', cv=num_folds)
print(f"Accuracy:  {accuracy_values.mean()*100:.2f}% -> {accuracy_values}")

precision_values = model_selection.cross_val_score(dt_classifier, X, y, scoring='precision_weighted', cv=num_folds)
print(f"Precision: {precision_values.mean()*100:.2f}% -> {precision_values}")

recall_values = model_selection.cross_val_score(dt_classifier, X, y, scoring='recall_weighted', cv=num_folds)
print(f"Recall:    {recall_values.mean()*100:.2f}% -> {recall_values}")

# F1 is reported as a plain score (no percent sign), so it is left unscaled
f1_values = model_selection.cross_val_score(dt_classifier, X, y, scoring='f1_weighted', cv=num_folds)
print(f"F1:        {f1_values.mean():.2f}  -> {f1_values}")

Exercise: Parameter Tuning for a kNN based Model
----------------------------------------------------------------------------------------------------

Vary the parameter k for a kNN based prediction model in order to maximize the accuracy. Calculate the measures using cross validation with 5 sets (folds).

In [None]:
# Load required libraries
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the comma-separated samples from disk
input_file = "./data/data_decision_trees.txt"
data = np.loadtxt(input_file, delimiter=',')

# Features: every column but the last
X = data[:, :-1]
# Labels: the last column
y = data[:, -1]

In [None]:
# Partition the samples by label so that each class can be drawn with its own marker
class_0, class_1 = (np.array(X[y == label]) for label in (0, 1))

# Per-class marker settings, listed in drawing order
marker_styles = (
    (class_0, dict(facecolors='red', marker='x')),
    (class_1, dict(facecolors='lightblue', edgecolors='black', marker='o')),
)

# Scatter plot of the raw input data (first feature on x, second feature on y)
plt.figure()
for points, style in marker_styles:
    plt.scatter(points[:, 0], points[:, 1], s=75, linewidth=1, **style)
plt.title('Input data')
plt.show()

In [None]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.inspection import DecisionBoundaryDisplay

# Split up the data into training and test data (ratio 3:1, i.e. 25% is held out for testing).
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [None]:
# Report the array shapes of the full data set and of both partitions: features first ...
print(
    "Shape of data-set:", X.shape,
    ", training data:", X_train.shape,
    ", test data:", X_test.shape,
)
# ... then the matching label arrays
print(
    "Shape of label-data:", y.shape,
    ", training labels:", y_train.shape,
    ", test labels:", y_test.shape,
)

In [None]:
# Train a kNN based classifier with k=1 and distance-weighted votes, then draw its
# decision boundary to analyze the quality of the classifier.
knn_clf = neighbors.KNeighborsClassifier(n_neighbors=1, weights="distance")
knn_clf.fit(X_train, y_train)

# Shade the predicted regions and overlay all samples coloured by their true label
DecisionBoundaryDisplay.from_estimator(knn_clf, X, alpha=0.4, response_method="predict")
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, edgecolor="k")
plt.title("Decision boundary for kNN Classifier (k=1)")
plt.show()

In [None]:
# Predict the labels of the held-out test samples
y_pred = knn_clf.predict(X_test)

# Generate a confusion matrix from the true and the predicted test labels
cm = confusion_matrix(y_test, y_pred)


In [None]:

# Visualize the confusion matrix. fmt="d" prints the integer counts verbatim;
# seaborn's default format (".2g") would show counts of 100 or more in scientific notation.
ax = plt.axes()
sns.heatmap(cm, annot=True, fmt="d", annot_kws={"size": 30}, cmap="Greens", ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

# Accuracy on the data the classifier was fitted on versus the held-out test data
train_accuracy = accuracy_score(y_train, knn_clf.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)

print('Accuracy:')
print(f' Train: {train_accuracy*100:.2f} %')
print(f' Test:  {test_accuracy*100:.2f} %')


In [None]:
# Calculate accuracy through cross-validation
from sklearn import model_selection

# Number of folds; the exercise asks for cross-validation with 5 folds.
# Passed as the cv parameter it defines the cross-validation strategy.
num_folds = 5

# Scoring functions of sklearn (original data: X and y!)
# The scores are fractions in [0, 1]; the means are scaled by 100 wherever a "%" sign is printed.
accuracy_values = model_selection.cross_val_score(knn_clf, X, y, scoring='accuracy', cv=num_folds)
print(f"Accuracy:  {accuracy_values.mean()*100:.2f}% -> {accuracy_values}")

precision_values = model_selection.cross_val_score(knn_clf, X, y, scoring='precision_weighted', cv=num_folds)
print(f"Precision: {precision_values.mean()*100:.2f}% -> {precision_values}")

recall_values = model_selection.cross_val_score(knn_clf, X, y, scoring='recall_weighted', cv=num_folds)
print(f"Recall:    {recall_values.mean()*100:.2f}% -> {recall_values}")

# F1 is reported as a plain score (no percent sign), so it is left unscaled
f1_values = model_selection.cross_val_score(knn_clf, X, y, scoring='f1_weighted', cv=num_folds)
print(f"F1:        {f1_values.mean():.2f}  -> {f1_values}")

In [None]:
# Hold out 25% of the samples for the test-set accuracy reported per k
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Mean cross-validated accuracy per k; used after the sweep to pick the best k
cv_accuracy_by_k = {}

# run k from 1 to 15
for k in range(1, 16):
    knn_clf = neighbors.KNeighborsClassifier(k, weights="distance")
    knn_clf.fit(X_train, y_train)
    print(knn_clf)

    y_pred = knn_clf.predict(X_test)
    # Cross-validation runs on the complete data set (X and y), not on the split above
    accuracy_values = model_selection.cross_val_score(knn_clf, X, y, scoring='accuracy', cv=num_folds)
    cv_accuracy_by_k[k] = accuracy_values.mean()

    print('Accuracy:')
    print(f' Train: {accuracy_score(y_train, knn_clf.predict(X_train))*100:.2f}%')
    print(f' Test:  {accuracy_score(y_test, y_pred)*100:.2f}%')
    print(f" CV:    {accuracy_values.mean()*100:.2f}%")

    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print()

# Answer the exercise question: which k maximizes the cross-validated accuracy?
# On ties max() keeps the first maximum, i.e. the smallest such k.
best_k = max(cv_accuracy_by_k, key=cv_accuracy_by_k.get)
print(f"Best k by cross-validated accuracy: {best_k} ({cv_accuracy_by_k[best_k]*100:.2f}%)")


# Exercise: Multiclass ensemble classification

##### Like some Pizza?

<img src="./img/5_pizza.webp" width="500px"><br><br>

Can we distinguish the brand from the ingredients?

### The Pizza dataset

__Attributes__:

1. brand ... Pizza Brand [A ... J] (no ads allowed)
1. id .......... Sample ID
1. mois ..... Moisture, amount of water per 100g in the sample
1. prot ...... Proteins per 100g
1. fat ........ Fat per 100g
1. ash ....... Ash (yes) per 100g
1. sodium . Sodium per 100g
1. carb ..... Carbohydrates per 100g
1. cal ........ Calories per 100g

Let's get started.

https://www.kaggle.com/code/luishpinto/multiclass-classification/notebook

In [None]:
# imports
from itertools import product
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# model preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.inspection import DecisionBoundaryDisplay

The data contains mixed text and numerical values.

In `Numpy`, the easiest way to load the data is to select features and targets directly ...

In [None]:
# load data using numpy: the numeric columns 2..8 become the features,
# column 0 (read as text) holds the brand of each sample
pizza_csv = "./data/Pizza.csv"
csv_options = dict(skiprows=1, delimiter=",")
X = np.loadtxt(pizza_csv, usecols=tuple(range(2, 9)), **csv_options)
brand = np.loadtxt(pizza_csv, usecols=0, dtype="str", **csv_options)

# convert string labels to numerical targets
brand_encoder = LabelEncoder().fit(brand)
y = brand_encoder.transform(brand)
# distinct brand labels in sorted order
brand_names = sorted(set(brand))

X, y, brand_names

... while in `Pandas` you can read the data directly and select the required columns later

In [None]:
# read data from csv file
data = pd.read_csv("./data/Pizza.csv")

# columns 2..8 hold the numeric features
X = data.iloc[:, 2:9].to_numpy()
# in pandas, use factorize to convert alphanumerical categories into numerical codes.
# sort=True numbers the brands in sorted order, so code i refers to brand_names[i]
# (brand_names is a sorted list); the default would number them by first appearance in the file.
y = pd.factorize(data.iloc[:, 0], sort=True)[0]

X, y

In any case, you have `X` and `y` as numpy arrays.

Let's do some basic statistics to get acquainted with the data.

In [None]:
# Analysis frame: the csv columns plus the numeric brand code, minus the "brand" and "id" columns
brand_id = pd.Series(y, name="brand_id")
df = pd.concat([data, brand_id], axis=1).drop(columns=["brand", "id"])

# Pairwise plots of all remaining columns, coloured by brand code
sns.set_theme(style='darkgrid')
g = sns.pairplot(df, palette=sns.color_palette(), hue='brand_id')


Too many dimensions! We will learn to reduce them in `Clustering`.

In [None]:
# Show the first ten rows of the analysis frame
df.head(10)

In [None]:
# Summary statistics for every column of the analysis frame
df.describe()

In [None]:
names = ["kNN", "Logistic Regression", "SVM", "Decission Tree", "RandomForest"]

# One estimator per entry in `names`, in the same order, with reasonable parameters:
#   KNeighborsClassifier -->            k=3
#   LogisticRegression -->              max_iter=2000
#   Decision Tree and Random Forest --> max_depth=5
# random_state is pinned for the two tree-based models so that reruns give the same results.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    LogisticRegression(max_iter=2000),
    SVC(),
    DecisionTreeClassifier(max_depth=5, random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
]


In [None]:

# make copy for later use: these copies are taken before any training happens
bounds_clf = copy.deepcopy(classifiers)
scores = []   # test accuracy per classifier
cms = []      # confusion matrix per classifier
crs = []      # classification report (text) per classifier

# Set to True only after every classifier has been trained and evaluated
models_trained = False

# prepare the train / test split, 80% training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Pin the label set so that every confusion matrix and report has one row per brand,
# even if a brand happens to be missing from the test split.
brand_ids = list(range(len(brand_names)))

try:
    for classifier in classifiers:

        # train the classifier (fit returns the fitted estimator)
        clf = classifier.fit(X_train, y_train)

        # predict results
        y_pred = clf.predict(X_test)

        # compute the confusion matrix
        cm = confusion_matrix(y_test, y_pred, labels=brand_ids)

        # compute the classification report
        cr = classification_report(
            y_test, y_pred, labels=brand_ids, target_names=brand_names, zero_division=0
        )

        # collect data about classifiers
        scores.append(clf.score(X_test, y_test))
        cms.append(cm)
        crs.append(cr)

    models_trained = True
except Exception as exc:
    # Keep the notebook running, but show what went wrong instead of hiding it
    print("You should complete the code")
    print(f"  ({type(exc).__name__}: {exc})")

In [None]:
if models_trained:
    # One confusion-matrix heatmap per classifier, side by side.
    # squeeze=False keeps `axes` two-dimensional for any number of classifiers.
    f, axes = plt.subplots(1, len(names), figsize=(21, 3), squeeze=False)
    for clf_name, clf_cm, ax in zip(names, cms, axes[0]):
        # Plot each classifier's own matrix from `cms`; the variable `cm` only holds
        # the matrix of the classifier that was evaluated last.
        sns.heatmap(clf_cm, annot=True, cmap="Greens", cbar=False, ax=ax, xticklabels=brand_names, yticklabels=brand_names)
        ax.set_title(f"{clf_name}")

In [None]:
if models_trained:
    # Feature importances of classifiers[4], the RandomForestClassifier at the end of the list
    fi = classifiers[4].feature_importances_
    print(f"Feature importance:\n{fi}")

    # The two largest importance values, largest first
    top_features = sorted(fi, reverse=True)[:2]
    # NOTE(review): both lookups enumerate `top_features` (length 2) rather than `fi`, so they
    # return positions inside that sorted pair: feature_1 is always 0 and feature_2 is 1
    # (or 0 when both values are equal), whichever columns are actually most important.
    # Enumerating `fi` would select the truly most important columns, but the prediction
    # cell further down passes [mois, prot], which only matches columns 0 and 1 -- confirm
    # the intent before changing either place.
    feature_1 = [i for i, x in enumerate(top_features) if x == top_features[0]][0]
    feature_2 = [i for i, x in enumerate(top_features) if x == top_features[1]][0]

    print(f"Features used in calculating decision bounds: {feature_1}, {feature_2}")

    # Two-column feature matrix taken by position from the analysis frame
    X_select = df.iloc[:, [feature_1, feature_2]].to_numpy()
    X_1 = X_select[:, 0]
    X_2 = X_select[:, 1]

    # bounds_clf = classifiers.copy()
    # Raise the iteration limit of the LogisticRegression copy (index 1) before refitting it
    bounds_clf[1].set_params(**{"max_iter": 10000})

    # One panel per classifier: refit the copy on the two selected columns (all samples),
    # shade its predicted regions and overlay the samples
    f = plt.figure(figsize=(21, 3))
    for i in range(len(names)):
        clf = bounds_clf[i].fit(X_select, y)
        ax = plt.subplot(1, 5, i+1)
        DecisionBoundaryDisplay.from_estimator(clf, X_select, ax=ax, alpha=0.3, response_method="predict")
        ax.scatter(X_1, X_2, s=20, edgecolors="darkblue")
        ax.set_title(names[i])


In [None]:
if models_trained:
    # Index of the first classifier that reaches the highest test accuracy
    best_model = scores.index(max(scores))
    print(f"Accuracy scores of all models:\n  {scores}\n")
    # The scores are fractions in [0, 1]; scale by 100 so the number agrees with the "%" sign
    print(f"Best results:\n  {names[best_model]} with {max(scores)*100:.2f}%\n")
    print(f"Classification Report of {names[best_model]}:\n\n{crs[best_model]}")


Assume we want a juicy pizza that is rich in protein.

Which brand would our model suggest?

In [None]:
# Feature values of the pizza we are looking for
mois = 48.0                     # try 30.0 for dry pizza
prot = 22.5                     # try 8.0 for lean pizza

# Without trained models there is nothing to ask; otherwise query the two-feature
# copy of the best-scoring classifier with a single sample.
if not models_trained:
    print("Our model would suggest something if it were trained properly.\nPlease train the model first")
else:
    query = [[mois, prot]]
    y_pred = bounds_clf[best_model].predict(query)

    print(f"Our model would suggest brand {brand_names[y_pred[0]]}")

### Congratulations!

If your notebook runs without errors up to this point, you have successfully completed the exercise.
