# <font color="grey">Utilities</font>

Execute `%pip install -r requirements.txt` in order to install the dependencies.

## <font color="grey">Imports</font>

In [None]:
# Data wrangling
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'
import seaborn as sns
sns.set_style("darkgrid")

# Misc
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
from IPython.display import HTML

# Model Selection & Evaluation
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

## <font color="grey">Helper Functions</font>

In [None]:
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    dict
        ``'accuracy_score'``, ``'f1_score_micro'`` and
        ``'precision_score_micro'`` mapped to the corresponding
        scikit-learn metric; F1 and precision use micro averaging.
    """
    return {
        'accuracy_score': accuracy_score(y_true, y_pred),
        'f1_score_micro': f1_score(y_true, y_pred, average='micro'),
        'precision_score_micro': precision_score(y_true, y_pred, average='micro'),
    }

In [None]:
def get_grid_search_results(clf):
    """Return the grid-search scores ordered from best to worst.

    Parameters
    ----------
    clf : object
        Fitted search object exposing ``cv_results_`` with the entries
        ``'params'`` and ``'mean_test_score'``.

    Returns
    -------
    dict
        ``str(params)`` mapped to its mean test score, with entries inserted
        in descending score order.
    """
    cv_results = clf.cv_results_

    # Key every parameter combination by its string form.
    score_by_params = {}
    for params, score in zip(cv_results['params'], cv_results['mean_test_score']):
        score_by_params[str(params)] = score

    ranked = sorted(score_by_params.items(), key=lambda entry: entry[1], reverse=True)
    return dict(ranked)

In [None]:
def highlight_max(data, color='yellow'):
    """Build CSS styles that mark the maximum of a Series or DataFrame.

    Percent signs are stripped and the values cast to float before the
    comparison.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to inspect.
    color : str
        Background colour used for the maximum cell(s).

    Returns
    -------
    list of str or pandas.DataFrame
        For a Series, one style string per element; for a DataFrame, a frame
        of style strings with the same index and columns. Cells that are not
        the maximum get an empty string.
    """
    style = 'background-color: {}'.format(color)
    # Remove '%' and cast to float so the values compare numerically.
    numeric = data.replace('%', '', regex=True).astype(float)

    if numeric.ndim != 1:
        # Whole-frame case: compare against the single global maximum.
        mask = numeric == numeric.max().max()
        return pd.DataFrame(np.where(mask, style, ''),
                            index=numeric.index, columns=numeric.columns)

    # Series case: one style entry per element.
    peak = numeric.max()
    return [style if hit else '' for hit in numeric == peak]

# Load Data

In [None]:
# Load the letter-recognition data set from the local `datasets` directory.
# NOTE(review): `read_csv` uses the first row as the header by default, and the
# column names are overwritten right after loading — confirm the file really
# has a header row, otherwise the first sample is silently consumed.
df = pd.read_csv('datasets/letter-recognition.csv')

In [None]:
# Rename the columns: the class label first, followed by the 16 features.
df.columns = [
    'lettr',
    'x-box', 'y-box', 'width', 'high',
    'onpix', 'x-bar', 'y-bar', 'x2bar',
    'y2bar', 'xybar', 'x2ybr', 'xy2br',
    'x-ege', 'xegvy', 'y-ege', 'yegvx',
]

# Exploratory Data Analysis

## Descriptive Statistics

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.isna().any().any()

We can conclude that there are no missing values in our data.

In [None]:
df.info()

## Feature distributions

### Distribution of the classes

In [None]:
# Count the samples per letter class and store them in a one-column frame
# named 'counts' (the index holds the letters).
un = df['lettr'].value_counts().to_frame()
un.columns = ['counts']

In [None]:
# Bar-style histogram of the per-class counts; the figure is rendered to HTML
# so it shows up as the cell output.
fig = px.histogram(x=un.index.values, y=un.counts)
HTML(fig.to_html())

The classes are nearly uniformly distributed, so accuracy is a meaningful metric for this problem.

### Distributions of the features

In [None]:
def __get_distplot(df, feature):
    """Plot the distribution of one column with a KDE curve overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    feature : str
        Name of the column to plot.

    Returns
    -------
    The object returned by ``sns.displot``.
    """
    column = df[feature]
    return sns.displot(column, kde=True)

In [None]:
# One distribution plot per feature column; the class label is skipped.
for feature in df.columns:
    if feature == 'lettr':
        continue
    __get_distplot(df, feature)

### Boxplots

In [None]:
# One box plot per feature column; the class label is skipped.
for feature in df.columns:
    if feature == 'lettr':
        continue
    fig = px.box(df[feature])
    # Render to HTML explicitly so every figure in the loop is displayed.
    display(HTML(fig.to_html()))

## Between feature correlation

### Pair Plots

In [None]:
# Grid of pairwise plots over the columns of `df`.
# NOTE(review): with 16 features this draws a large grid and can be slow.
sns.pairplot(df)

### Joint-plots

In [None]:
# Hexbin joint plot for every ordered pair of distinct feature columns
# (the class label is excluded).
# The palette never changes between plots, so it is set once up front.
sns.set_palette("gist_rainbow_r")
feature_columns = [column for column in df.columns if column != 'lettr']
for feature_x in feature_columns:
    for feature_y in feature_columns:
        if feature_x == feature_y:
            continue
        sns.jointplot(x=feature_x, y=feature_y, kind="hex", data=df)
        plt.show()

## Data in vector space with reduced dimensions

In [None]:
# Project the feature columns (everything except the class label) onto their
# first three principal components. The PCA is fitted on the raw values as
# loaded, i.e. before any scaling is applied.
pca = PCA(n_components = 3)
df_pca_3d = pca.fit_transform(df.drop(columns=['lettr']))

In [None]:
# 3D scatter of the three principal components. The letter labels are encoded
# as integers so that they can be used as the colour map.
fig = px.scatter_3d(
    x=df_pca_3d[:, 0],
    y=df_pca_3d[:, 1],
    z=df_pca_3d[:, 2],
    color=LabelEncoder().fit_transform(df.lettr).astype(int),
    labels={
        "x": "Component 1",
        "y": "Component 2",
        "z": "Component 3",
        "color": "Class",
    },
    title="3D",
)
HTML(fig.to_html())

# Preprocessing

## Encoding

During Exploratory Data Analysis we observed that all of the features are numeric, i.e. of type `int64`, but the class label is nominal. The classes are letters, which have a natural (alphabetical) order, so we encode them as integers with `LabelEncoder`.

In [None]:
# Map each letter label to an integer class code; `y` is the encoded target.
encoder = LabelEncoder()
y = encoder.fit_transform(df['lettr'])

## Features and target split

In [None]:
y

In [None]:
X = df.drop(columns=['lettr'])

In [None]:
X.shape

In [None]:
y.shape

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature; `X` becomes the scaled array from here on.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics leak into the scaling — consider
# fitting it on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

## Train Test Split

In [None]:
from ipywidgets import interact, widgets
# Interactive slider (0 to 0.50, default 0.15) for the test fraction. The
# callback returns the slider value plus 0.05, so the default yields 0.20.
# NOTE(review): presumably the offset keeps the fraction above zero when the
# slider sits at its minimum — confirm.
test_size = interact(lambda x : x+0.05,x=widgets.FloatSlider(min=0,max=0.50,value=0.15))

In [None]:
from sklearn.model_selection import train_test_split
# Split with the fraction most recently returned by the slider callback.
# No `random_state` is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=float(test_size.widget.result))

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

# Classification Models

To find the best model for our problem, we train several models that were covered during the course. The hyperparameters of each model are tuned with the grid search provided by `sklearn`, and for the final model we choose the combination that yields the best accuracy. All of the metrics for the models are calculated using the `evaluate_model()` function defined in *Helper Functions*.

In [None]:
# Dictionary containing all the metrics for trained and evaluated models
# key: model_name
# value: collection of metrics provided from `evaluate_model()`
metrics = {}

## Training and Evaluation

### Logistic Regression

In [None]:
# Grid Search for the best hyper-parameters
# Only the solver is varied; the search runs on 6 parallel jobs and `%time`
# (an IPython magic) reports how long the fit takes.
lr = LogisticRegression()
parameters = {'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
clf = GridSearchCV(lr, parameters, n_jobs=6)
%time clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

From the Grid Search and cross-validation we can observe that using `newton-cg` as solver provides the best mean test score.

In [None]:
# Fit the final logistic regression on the training split only, so the
# held-out test split stays unseen and the recorded metrics are not inflated
# by the model having already seen the test samples.
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
metrics['logistic_regression'] = evaluate_model(y_test, y_pred)

### QDA

In [None]:
# Grid Search for the best hyper-parameters
# `reg_param` is swept from 0 up to (but not including) 1 in steps of 0.05.
qda = QuadraticDiscriminantAnalysis()
parameters = {'reg_param' : np.arange(0, 1, 0.05)}
clf = GridSearchCV(qda, parameters)
clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

In [None]:
# Train QDA with reg_param=0 on the training split and record its test-set
# metrics (presumably the best grid-search setting — outputs not shown here).
qda = QuadraticDiscriminantAnalysis(reg_param=0)
qda.fit(X_train, y_train)
y_pred = qda.predict(X_test)
metrics['quadratic_discriminant_analysis'] = evaluate_model(y_test, y_pred)

### LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Grid Search for the best hyper-parameters
# Only the solver is varied.
lda = LinearDiscriminantAnalysis()
parameters = {'solver' : ['svd', 'lsqr', 'eigen']}
clf = GridSearchCV(lda, parameters)
clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

In [None]:
# Train LDA with the 'svd' solver on the training split and record its
# test-set metrics.
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
metrics['linear_discriminant_analysis'] = evaluate_model(y_test, y_pred)

### Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

# Grid Search for the best hyper-parameters
svm = SVC()
parameters = {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 
              'degree': [2, 3, 4]
             }

clf = GridSearchCV(svm, parameters)
%time clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

In [None]:
# Train the RBF-kernel SVM on the training split and record its test-set metrics.
svm = SVC(kernel='rbf') # degree is discarded because it is only used in combination with the polynomial kernel
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
metrics['svm'] = evaluate_model(y_test, y_pred)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Grid Search for the best hyper-parameters
# Full cross product of split criterion, splitter strategy, maximum depth and
# minimum samples per split (2 * 2 * 4 * 4 = 64 combinations).
dt = DecisionTreeClassifier()
parameters = {'criterion' : ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': [5, 10, 15, 20], 'min_samples_split' : [2, 4, 6, 8]}
clf = GridSearchCV(dt, parameters)
clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

In [None]:
# Train the decision tree with fixed hyper-parameters on the training split
# and record its test-set metrics.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_split=2, splitter='best')
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
metrics['decision_tree'] = evaluate_model(y_test, y_pred)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid Search for the best hyper-parameters
# Cross product of forest size, split criterion and maximum depth
# (3 * 2 * 4 = 24 combinations); `%time` reports the duration of the fit.
rf = RandomForestClassifier()
parameters = {'n_estimators':[50, 100, 200], 
              'criterion' : ['gini', 'entropy'], 
              'max_depth': [5, 10, 15, 20], 
             }
clf = GridSearchCV(rf, parameters)
%time clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

In [None]:
# Train a 200-tree random forest on the training split and record its
# test-set metrics. Criterion and depth are left at the library defaults.
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
metrics['random_forest'] = evaluate_model(y_test, y_pred)

### Multi Layered Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# Grid search over three hidden-layer layouts and two activations with the
# 'adam' solver. `verbose=True` prints training progress for every fit.
mlp = MLPClassifier(verbose=True)
parameters = {'hidden_layer_sizes':[(100,), (32, 64), (16, 32, 32, 32, 16, 8)], 
              'solver' : ['adam'],
              'activation': ['relu', 'tanh']
             }
clf = GridSearchCV(mlp, parameters)
%time clf.fit(X_train, y_train)

In [None]:
get_grid_search_results(clf)

In [None]:
# Train a single-hidden-layer (100 units, tanh) MLP with a fixed seed and up
# to 300 iterations on the training split, then record its test-set metrics.
mlp = MLPClassifier(random_state=1, hidden_layer_sizes = (100,), max_iter=300, activation='tanh')
%time mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
metrics['mlp'] = evaluate_model(y_test, y_pred)

In [None]:
# Re-run the MLP on the test split and show its metrics as the cell output.
y_pred = mlp.predict(X_test)
evaluate_model(y_test, y_pred)

## Comparison

In [None]:
# One row per model, one column per metric; `highlight_max` is applied through
# the Styler to mark the best value.
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
metrics_df.style.apply(highlight_max)

# Conclusion

`RandomForestClassifier` with 200 estimators provided the best results on the test set. The test set was determined with bootstrapping, and over several iterations the results of this model were consistently the best.

## Train RandomForestClassifier

In [None]:
# Train a fresh 200-tree random forest on the training split.
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train, y_train)

This trained model can be used to classify other unseen samples.

## Classify arbitrary sample

In [None]:
# Hand-written feature vector with 16 values, one per feature column.
# NOTE(review): the values look standardised — confirm they come from the same
# scaled feature space the model was trained on.
sample = [ 0.5103336 ,  0.89711149,  0.43585819,  1.16191044,  0.68208278,
        1.03772653, -1.93553538, -0.97372441,  0.34500404,  0.28847025,
       -1.69285679,  0.51475238, -0.44856532, -2.15867493, -0.26955622,
        0.74116389]

In [None]:
# `rf` predicts the integer code produced by `encoder`; translate it back to
# the original letter so the message names the actual class.
predicted_letter = encoder.inverse_transform(rf.predict([sample]))[0]
f'The sample belongs to class {predicted_letter}'