# Lab3 Classification and Regression

This week's exercise will be a bit shorter. The objective is to implement the core of the k-NN (k-nearest neighbors) algorithm (most of the pre-processing and evaluation is given), followed by a Q&A session.

Schedule:
* Classify data using k-NN (k-nearest neighbors)
* Use a confusion matrix to evaluate models
* Q&A on the last lectures

## Reminders
* [GitHub repo](https://github.com/Faur/ITU-Data-Science-in-Games-Exercises)
* **Shut down notebooks** when you are done. Otherwise the server will run out of resources, and we will be forced to restart them.
* Server storage is volatile! I.e. you must **save everything locally** that you don't want to lose.

In [None]:
# ! git pull

In [None]:
# Makes matplotlib plots work better with Jupyter
%matplotlib inline

# Import the necessary libraries. 
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

## Take a look at the data

In [None]:
# Check that the data file is present before trying to load it.
basedir = "../"
file = "fifa.csv"
# Build the path once so the existence check and the load use the same location.
data_path = os.path.join(basedir, "data", file)
# Raise explicitly rather than `assert`: assertions are skipped under `python -O`.
if not os.path.isfile(data_path):
    raise FileNotFoundError('Data not found. Make sure to have the most recent version!')

data = pd.read_csv(data_path, sep=",")

In [None]:
# Sprint speed against agility, one colour per playing position.
speed_grid = sns.FacetGrid(data, height=5, hue="Position")
speed_grid.map(plt.scatter, "SprintSpeed", "Agility")
speed_grid.add_legend()

# Shot power against strength, one panel per preferred foot, coloured by position.
power_grid = sns.FacetGrid(data, height=5, hue="Position", col='Preferred Foot')
power_grid.map(plt.scatter, "ShotPower", "Strength")
power_grid.add_legend()

# Classification problem

0. pick a value for K (number of clusters) and N (number of neighbors)
1. split the data in train and validation set
2. normalize fields (in our case the data are already normalized)
3. for each `datapoint` in the `validation set`:
  1. find the N nearest neighbors
  2. set as label of `datapoint` the label that appears most often among its neighbors

In [None]:
# Numeric attribute columns used as the feature space for the distance computation.
features = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
# Column holding the label to predict.
class_col = 'Position'

# cleaning: remove all the lines that contain a NaN in one of the feature columns
# (the subset must be `features`; the previously used name `fields` was never defined)
data = data.dropna(subset=features)

In [None]:
# 0: define K and N
# N is the neighbour count handed to `nsmallest` by the classifier further down.
# K is not referenced anywhere else in the visible part of this notebook.
K, N = 5, 20

In [None]:
# 1: split

# Sample at random so that any ordering present in the file does not leak into the split.
# `df.sample` accepts `random_state=` if a reproducible split is wanted (see the pandas docs).
train_fraction = 0.995
train_set = data.sample(frac=train_fraction)
# Every row that was not drawn into the training set becomes validation data.
valid_set = data.drop(index=train_set.index)

valid_set

In [None]:
## YOUR CODE HERE
# 3: classify
def nearest_neighbors(x):
    """Return the distances from `x` to its N closest rows of `train_set`.

    Arguments
    ---------
    x: pandas.Series
        One row holding (at least) every column listed in `features`.

    Returns
    -------
    pandas.Series
        Euclidean distances of the N nearest training rows, indexed by the
        labels of those rows in `train_set`.
    """
    # A row produced by DataFrame.apply(axis=1) has object dtype when the frame
    # mixes column types; cast so the arithmetic and nsmallest run on floats.
    point = x[features].astype(float)
    distances = (train_set[features].sub(point, axis='columns')
                 .pow(2).sum(axis=1).pow(0.5))
    return distances.nsmallest(N)


def classify(x):
    """Return the most frequent `class_col` value among the neighbours of `x`.

    `N`, `class_col` and `train_set` are read from the notebook globals at call
    time, so re-assigning them changes the behaviour of later calls.
    """
    # Select the neighbours by index *label*: the values of the Series returned
    # by nearest_neighbors are distances, not row positions, so they must not
    # be fed to `.iloc`.
    neighbor_labels = train_set.loc[nearest_neighbors(x).index, class_col]
    return neighbor_labels.mode()[0]


# Work on a copy so that adding the prediction column leaves valid_set untouched.
classified_set = valid_set.copy()
classified_set['Calculated Position'] = valid_set.apply(classify, axis=1)
classified_set[['Position', 'Calculated Position']]

In [None]:
# Binary variant of the task: goalkeeper vs. every other position,
# using a smaller neighbourhood.
N = 3

# Derive the boolean target before splitting so both subsets contain the column.
data['Goalkeeper'] = data['Position'] == 'GK'

class_col = 'Goalkeeper'

train_set = data.sample(frac=0.99)
valid_set = data.drop(train_set.index)

# `classify` reads N, class_col and train_set when it is called, so it picks up
# the values assigned above. Copy valid_set so the prediction column is added to
# a separate frame instead of to valid_set through an alias.
classified_set = valid_set.copy()
classified_set['Calculated Goalkeeper'] = valid_set.apply(classify, axis=1)
classified_set[['Goalkeeper', 'Calculated Goalkeeper']]

In [None]:
# len(classified_set[classified_set['Goalkeeper'] == classified_set['Calculated Goalkeeper']]) / len(classified_set)

# Classification evaluation

1. generate confusion matrix
2. evaluate the classification

In [None]:
from sklearn.metrics import confusion_matrix

def print_confusion_matrix(confusion_matrix, class_names, figsize=(10, 7), fontsize=14):
    """Draw a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for axes labels. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure

    Raises
    ------
    ValueError
        If drawing the annotated heatmap raises ValueError; the cells are
        formatted with the integer format "d".
    """
    # Label both axes with the class names so the heatmap ticks show them.
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Keep the original exception attached so the real cause stays visible.
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return fig

# Use a sorted label list so the row/column order of the matrix is deterministic.
# For the boolean 'Goalkeeper' target this gives [False, True], which is the
# order the `tn, fp, fn, tp = cm.ravel()` unpacking in the next cell relies on;
# `unique()` alone returns labels in order of first appearance in the data.
labels = sorted(data[class_col].dropna().unique())
cm = confusion_matrix(classified_set[class_col], classified_set[f'Calculated {class_col}'], labels=labels)

print_confusion_matrix(
    cm,
    labels
)

In [None]:
# Unpack the 2x2 matrix; rows are true labels and columns are predictions, so
# with label order [negative, positive] the flattened order is tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
total = tn + fp + fn + tp

accuracy = (tp + tn) / total
err_rate = (fp + fn) / total
# Sensitivity (recall): share of actual positives that were predicted positive.
# (tp / (fp + tp) would be precision instead.)
sensitiv = tp / (tp + fn)
# Specificity: share of actual negatives that were predicted negative.
# (tn / (tn + fn) would be the negative predictive value instead.)
specific = tn / (tn + fp)

print(cm.ravel())
print(f'Accuracy: {accuracy}')
print(f'Error rate: {err_rate}')
print(f'Sensitivity: {sensitiv}')
print(f'Specificity: {specific}')

# Linear regression?