# Data Analytics II - Final Project Part I

## Methodology:

- Feature scaling

- Feature importance using Gini Index

- Deletion of unnecessary features

- Correlation between features

For each classifier:

    - SFS analysis using standard hyperparameters
    
    - Gridsearch using the selected features

    - Analysis of classification metrics

- Fitting of the best model with all the training data

- Predictions to the test dataset

## Imports

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
import json
from time import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_selection import SequentialFeatureSelector 
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")
# !pip install -U kaleido

## Downloading and unziping datasets

In [None]:
# !wget https://uni-muenster.sciebo.de/s/bmzyEnwSscZ0tam/download?path=%2F&files=train_set.csv
# !unzip -qq /content/download?path=%2F

## Mounting drive to save results

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## Training dataset preprocessing

In [None]:
df = pd.read_csv('/home/dimi/Downloads/task_1_datasets/train_set.csv') #'/content/task_1/train_set.csv'
y  = df['y']
X  = pd.DataFrame(
    data    = StandardScaler().fit_transform(df.drop('y', axis=1)),
    columns = df.columns.drop('y')
)

In [None]:
df_label_counts = pd.DataFrame(
    data    = np.transpose(np.unique(y, return_counts=True)),
    columns = ['label', 'count'] 
).sort_values(
    by        = 'count',
    ascending = False
).reset_index(
    drop = True
)

df_label_counts['pct'] = 100*df_label_counts['count']/y.size

df_label_counts

## Feature Importance

In [None]:
# rf = RandomForestClassifier(
#     n_estimators = 1000,
#     n_jobs       = -1,
#     max_samples  = 0.66
# )
# rf.fit(X, y)

In [None]:
# df_feature_importances = pd.DataFrame(
#     data    = zip(df.columns.drop('y'), rf.feature_importances_),
#     columns = ['feature', 'importance'] 
# ).sort_values(
#     by        = 'importance',
#     ascending = False,
# ).reset_index(
#     drop = True
# )

# df_feature_importances.to_csv(
#     'results/task_1/feature_importance_final_project_part_i.csv', #'/content/drive/MyDrive/Colab Notebooks/feature_importance_final_project_part_i.csv', 
#     index = False
# )

In [None]:
max_features = 30
df_feature_importances = pd.read_csv('results/task_1/feature_importance_final_project_part_i.csv')
fig = px.bar(
    data_frame = df_feature_importances[:max_features],
    x          = 'feature',
    y          = 'importance'
)
fig.update_xaxes(tickangle=90)
fig.write_image('results/task_1/feature_importance_final_project_part_i.pdf')
fig.show()

## Correlation between features

In [None]:
fig, ax = plt.subplots(figsize=(18,14))
sb.heatmap(X[df_feature_importances.loc[:8, 'feature']].corr(), cmap="Blues", annot=True, linewidths=0.1)
fig.savefig('results/task_1/feature_correlations_final_project_part_i.pdf')
fig.show()

Features are not correlated: no need to remove correlated features to prevent overfitting!

## SFS to each classifier

OBS: no need for SFS when using Random Forests.

In [None]:
n_jobs               = -1
cv                   = 10
verbose              = 3
n_features_to_select = 100
plot_step            = 10
scoring              = 'accuracy'

In [None]:
def run_sfs(estimator, X, y, n_features_to_select, cv, scoring, n_jobs, sorted_features=None, plot_step=1):
    
    if sorted_features is None:
        sfs = SequentialFeatureSelector(
            estimator            = estimator,
            direction            = 'forward',
            n_features_to_select = n_features_to_select,
            n_jobs               = n_jobs,
            cv                   = cv,
            scoring              = scoring,
        )
        sfs.fit(X, y)
        sorted_features = sfs.get_feature_names_out()
    
    kf               = KFold(n_splits=cv, shuffle=True)
    accs             = []
    stds             = []
    n_features_range = np.arange(plot_step, n_features_to_select+1, plot_step)

    for n_features in n_features_range:
        print(n_features)
        features   = sorted_features[:n_features]
        inner_accs = []

        for train_index, test_index in kf.split(X):
            X_train, X_test = X.loc[train_index][features], X.loc[test_index][features]
            y_train, y_test = y.loc[train_index], y.loc[test_index]
            clf = estimator
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            inner_accs.append(accuracy_score(y_test, y_pred))

        accs.append(np.mean(inner_accs))
        stds.append(np.std(inner_accs))

    fig = px.line(
        x       = n_features_range,
        y       = accs,
        error_y = stds,
        title   = 'Sequential Feature Selector: ' + estimator.__class__.__name__,
        labels  = dict(
            x = 'Number of features', 
            y = 'Accuracy'
        )
    )
    fig.update_xaxes(tickvals=n_features_range)
    fig.write_image('results/task_1/sfs_{}_final_project_part_i.pdf'.format(estimator.__class__.__name__))
    fig.show()
    
    np.savetxt(
        fname = 'results/task_1/sfs_{}_final_project_part_i.csv'.format(estimator.__class__.__name__), 
        X     = sorted_features, 
        fmt   = '%s'
    )

### LDA

In [None]:
%%time
run_sfs(
    LinearDiscriminantAnalysis(),
    X[df_feature_importances.loc[:100, 'feature']],
    y,
    n_features_to_select,
    cv,
    scoring,
    n_jobs,
    df_feature_importances['feature'],
    plot_step
)

### KNN

In [None]:
%%time
run_sfs(
    KNeighborsClassifier(n_jobs=n_jobs),
    X[df_feature_importances.loc[:100, 'feature']],
    y,
    n_features_to_select,
    cv,
    scoring,
    n_jobs,
    df_feature_importances['feature'],
    plot_step
)

### SVM

In [None]:
%%time
run_sfs(
    SVC(),
    X[df_feature_importances.loc[:100, 'feature']],
    y,
    n_features_to_select,
    cv,
    scoring,
    n_jobs,
    df_feature_importances['feature'],
    plot_step
)

### LR

In [None]:
%%time
run_sfs(
    LogisticRegression(n_jobs=n_jobs),
    X[df_feature_importances.loc[:100, 'feature']],
    y,
    n_features_to_select,
    cv,
    scoring,
    n_jobs,
    df_feature_importances['feature'],
    plot_step
)

## Gridsearch

In [None]:
def run_gridsearch(estimator, X, y, param_grid, cv, scoring, n_jobs, verbose):
    gs = GridSearchCV(
        estimator  = estimator,
        n_jobs     = n_jobs,
        cv         = cv,
        verbose    = verbose,
        scoring    = scoring,
        param_grid = param_grid
    )
    gs.fit(X, y)
    results = {
        'Estimator': str(estimator.__class__.__name__),
        'Number of features': X.shape[1],
        'Best result': '{:.3f} +- {:.3f}'.format(
            float(gs.cv_results_['mean_test_score'][gs.best_index_]),
            float(gs.cv_results_['std_test_score'][gs.best_index_]),
        ),
        'Best hyperparameters': gs.best_params_
    }
    with open(
        file = 'results/task_1/gridsearch_{}_final_project_part_i.json'.format(estimator.__class__.__name__),
        mode = 'w'
    ) as file:
        json.dump(results, file, indent=4)
    print(results)
    return gs

### Random Forest

In [None]:
%%time

param_grid = dict(
    n_estimators      = [1000],
    criterion         = ['gini'],
    max_depth         = [10, 11, 12],
    class_weight      = [None],
    max_samples       = [0.66, 0.75, 1]
)

run_gridsearch(
    RandomForestClassifier(), 
    X[df_feature_importances.loc[:100, 'feature']], 
    y, 
    param_grid, 
    cv, 
    scoring, 
    n_jobs, 
    verbose
)

### KNN

In [None]:
%%time

param_grid = dict(
    n_neighbors = [5, 10, 20, 30], #large K prevents overfitting!
)

run_gridsearch(
    KNeighborsClassifier(), 
    X[df_feature_importances.loc[:60, 'feature']], 
    y, 
    param_grid, 
    cv, 
    scoring, 
    1, # use just one kernel with KNN, otherwise, memory fuuuuuull 
    verbose
)

### SVM

In [None]:
%%time

param_grid = dict(
    kernel = ['linear', 'poly', 'rbf'],
    degree = [2, 3]
)

run_gridsearch(
    SVC(), 
    X[df_feature_importances.loc[:50, 'feature']], 
    y, 
    param_grid, 
    cv, 
    scoring, 
    n_jobs, 
    verbose
)

### LR

In [None]:
%%time

param_grid = dict(
    solver   = ['saga'],
    penalty  = ['elasticnet'],
    C        = [0.5, 0.75, 1, 1.25, 1.5],
    l1_ratio = [0.01, 0.25, 0.5, 0.75, 0.99]
)

run_gridsearch(
    LogisticRegression(), 
    X[df_feature_importances.loc[:80, 'feature']], 
    y, 
    param_grid, 
    cv, 
    scoring, 
    n_jobs, 
    verbose
)

### LDA

In [None]:
%%time

param_grid = dict()

run_gridsearch(
    LinearDiscriminantAnalysis(), 
    X[df_feature_importances.loc[:90, 'feature']], 
    y, 
    param_grid, 
    cv, 
    scoring, 
    n_jobs, 
    verbose
)