# Machine Learning - Homework 1


In [None]:
#importing libraries
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import *
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt

## Loading the dataset

In [None]:
# Read the tab-separated training set; the first row holds the column names.
dataset = pd.read_csv("train_set.tsv", header=0, sep='\t')

# The second-to-last column is the target, every column before it is a feature.
# The very last column ends up in neither X nor y.
y = dataset.iloc[:, -2]
X = dataset.iloc[:, :-2]
print('File loaded: %d samples.' % dataset.shape[0])


## Normalize the dataset

In [None]:
# Min-max normalization: shift every column by its minimum and divide by its
# range (max - min), so each column ends up in [0, 1].
# NOTE(review): a constant column has a zero range and becomes NaN here.
dataset = dataset.sub(dataset.min()).div(dataset.max() - dataset.min())


In [None]:
from matplotlib import colors

def get_hist(data, feature, min_value = None, max_value = None, bins = 100):
    """Show a 1D histogram and a 2D (row position vs. value) histogram of a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to plot.
    feature : str
        Name of the column to plot.
    min_value, max_value : number, optional
        Inclusive bounds; rows outside them are left out of the plots.
        Each bound defaults to the column's own minimum / maximum.
    bins : int
        Number of bins, used for both plots.
    """
    column = data[feature]
    lower = column.min() if min_value is None else min_value
    upper = column.max() if max_value is None else max_value

    # Keep only the values inside the inclusive [lower, upper] window.
    selected = column[(column >= lower) & (column <= upper)]

    # 1D histogram of the selected values
    plt.hist(selected, bins = bins)
    plt.title('histogram distribution of {} (min_value: {}, max_value: {}, bins: {})'.format(feature, lower, upper, bins))
    plt.show()
    plt.close()

    # 2D histogram: x is the position of the sample within the selection,
    # y is its value; counts are colored on a logarithmic scale.
    positions = pd.Series(np.array(list(range(selected.shape[0]))))
    plt.hist2d(positions, selected, bins=bins, norm = colors.LogNorm())
    plt.title('2D histogram distribution of {} (min_value: {}, max_value: {}, bins: {})'.format(feature, lower, upper, bins))
    plt.show()
    plt.close()

# Plot the target over its full range first, then with tighter upper bounds
# (and finer bins for the last one).
for hist_options in ({}, {'max_value': 300}, {'max_value': 100, 'bins': 200}):
    get_hist(dataset, 'num_collisions', **hist_options)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 50-tree random forest regressor on all features to rank them.
rf = RandomForestRegressor(n_jobs=-1, random_state=20, n_estimators=50)
rf.fit(X, y)
importances = rf.feature_importances_

# Bar chart of the importances, sorted in ascending order
sorted_idx = np.argsort(importances)
plt.barh(X.columns[sorted_idx], importances[sorted_idx])

In [None]:
# Drop, in one go, every feature whose importance is below 0.01.
weak_features = [
    feature
    for feature, importance in zip(X.columns, rf.feature_importances_)
    if importance < 0.01
]
X = X.drop(columns=weak_features)

# Refit on the reduced feature set to get updated importances.
rf.fit(X, y)
importances = rf.feature_importances_

# Bar chart of the updated importances, sorted in ascending order
sorted_idx = np.argsort(importances)
plt.barh(X.columns[sorted_idx], importances[sorted_idx])


In [None]:
# Columns to keep: the features that survived selection plus the target,
# in the order they appear in the dataset.
columns = [
    col for col in dataset.columns
    if col in X.columns or col == 'num_collisions'
]

# Everything else is dropped from a fresh copy of the (normalized) dataset.
discarded = [col for col in dataset.columns if col not in columns]
new_df = dataset.drop(columns=discarded, errors='ignore')
entry_number_step1 = len(new_df)

# for col in new_df.columns:
#     get_hist(new_df, col)



### Removing the outliers

In [None]:
from scipy.stats import zscore
#remove outliers
 

# Keep only the rows in which every column lies strictly within 5 standard
# deviations of that column's mean (|z-score| < 5).
# A NaN z-score compares as False, so such rows are dropped as well.
z_scores = zscore(new_df)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 5).all(axis=1)
new_df = new_df[filtered_entries]

entry_number_step2 = new_df.shape[0]
print('Samples before dropping outliers: {}'.format(entry_number_step1))
print('Samples after dropping outliers: {}'.format(entry_number_step2))
# Was `print('Samples removed:',format(...))`: a comma instead of a dot, which
# only worked by accident through the builtin format() and print's separator.
print('Samples removed: {}'.format(entry_number_step1 - entry_number_step2))






In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer

# Separate the target from the features of the outlier-free frame.
y = new_df['num_collisions']
X = new_df.drop(columns='num_collisions')

# Rescale the features to [0, 1].
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling -- confirm this is acceptable.
scaler = MinMaxScaler()
x_scal = scaler.fit(X).transform(X)

# Turn the continuous target into 5 ordinal classes (0..4) with k-means bin edges.
discretizer = KBinsDiscretizer(strategy='kmeans', encode='ordinal', n_bins=5)
y_disc = discretizer.fit_transform(y.to_numpy().reshape(-1, 1))

In [None]:
# Shuffled 67/33 split, stratified on the discretized target so both parts
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    x_scal,
    y_disc,
    test_size=0.33,
    random_state=20,
    shuffle=True,
    stratify=y_disc,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier


# One shared seed for every estimator that accepts one.
random_state = 20

# Candidate classifiers, keyed by the short name used in the printed reports.
# KNeighborsClassifier takes no random_state.
models = dict(
    DT=DecisionTreeClassifier(random_state=random_state),
    RF=RandomForestClassifier(random_state=random_state),
    SVM=SVC(random_state=random_state),
    KNN=KNeighborsClassifier(),
    BOOST=AdaBoostClassifier(random_state=random_state),
    MLPN=MLPClassifier(random_state=random_state),
)


In [None]:
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns


def fit_models(models, X_train, y_train):
    """Fit every estimator in ``models`` in place on the given training data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)``.
    X_train : array-like
        Training features, passed to ``fit`` unchanged.
    y_train : array-like
        Training labels; flattened to one dimension before fitting.

    Returns
    -------
    None
        The estimators are fitted in place; progress is printed per model.
    """
    # np.ravel (unlike the ndarray-only .ravel() method) also accepts lists,
    # column vectors and pandas objects.
    y_flat = np.ravel(y_train)
    for name, model in models.items():
        model.fit(X_train, y_flat)
        print(name, 'trained.')

def evaluate_models(models, X_test, y_test, x_scal, y_disc):
    """Report hold-out and cross-validated performance for every fitted model.

    For each model this prints a classification report on the hold-out set,
    then the per-fold scores and their mean (+/- 2 std) for cross-validated
    accuracy and macro-F1 on the full data, and finally shows the hold-out
    confusion matrix as a heat map.

    Parameters
    ----------
    models : dict
        Maps a display name to an already fitted estimator.
    X_test, y_test : array-like
        Hold-out features and labels, used for the classification report and
        the confusion matrix.
    x_scal, y_disc : array-like
        Full feature matrix and labels passed to ``cross_val_score``.

    Returns
    -------
    None
    """
    y_all = np.ravel(y_disc)
    for name, model in models.items():
        y_pred = model.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, zero_division=0))

        # Print the fold scores that the mean/std line summarizes. Previously
        # the printed folds came from a separate cross-validation run on the
        # test set, so they did not match the summary and cost two extra
        # cross-validations per model.
        for scoring in ('accuracy', 'f1_macro'):
            fold_scores = cross_val_score(model, x_scal, y_all, n_jobs=-1, scoring=scoring)
            print(fold_scores)
            print("%0.4f (+/- %0.4f)" % (fold_scores.mean(), fold_scores.std() * 2))

        # plot confusion matrix of the hold-out predictions
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(10,7))
        sns.heatmap(cm, annot=True)
        plt.xlabel('Predicted')
        plt.ylabel('Truth')
        plt.show()

        print('------------------------------------')


# Baseline: train on the untouched training split, then evaluate on the
# hold-out split (cross-validation runs on the full scaled data).
fit_models(models=models, X_train=X_train, y_train=y_train)
evaluate_models(models=models, X_test=X_test, y_test=y_test, x_scal=x_scal, y_disc=y_disc)

### Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Cap each class at a fraction of the training set size: 1/3 for class 0,
# then 1/4, 1/5, 1/6 and 1/7 for classes 1..4. Classes already below their
# cap keep their current size.
c0, c1, c2, c3, c4 = (
    min(np.count_nonzero(y_train == label), int((1/divisor)*y_train.shape[0]))
    for label, divisor in enumerate((3, 4, 5, 6, 7))
)

rus = RandomUnderSampler(
    random_state=20,
    sampling_strategy={0: c0, 1: c1, 2: c2, 3: c3, 4: c4},
)
x_train_under, y_train_under = rus.fit_resample(X_train, y_train)

x_train_under_dim = x_train_under.shape[0]
samples_before = X_train.shape[0]
print('Samples before undersampling: {}'.format(samples_before))
print('Samples after undersampling: {}'.format(x_train_under_dim))
print('Samples removed: {}'.format(samples_before - x_train_under_dim))


# Class distribution after undersampling
plt.hist(y_train_under, bins=5)

### Oversampling

In [None]:

# Raise each class to at least a fraction of the original training set size:
# 1/3 for class 0, then 1/4, 1/5, 1/6 and 1/7 for classes 1..4. Classes
# already above their floor keep their current size.
c0 = max(np.count_nonzero(y_train_under == 0), int((1/3)*y_train.shape[0]))
c1 = max(np.count_nonzero(y_train_under == 1), int((1/4)*y_train.shape[0]))
c2 = max(np.count_nonzero(y_train_under == 2), int((1/5)*y_train.shape[0]))
c3 = max(np.count_nonzero(y_train_under == 3), int((1/6)*y_train.shape[0]))
c4 = max(np.count_nonzero(y_train_under == 4), int((1/7)*y_train.shape[0]))

from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors

# SMOTE's own `n_jobs` argument is deprecated in imbalanced-learn 0.10; the
# supported way to parallelize the neighbor search is to pass a
# NearestNeighbors estimator. `k_neighbors=1` corresponds to n_neighbors=2,
# because the query sample itself counts as one neighbor.
sm = SMOTE(
    k_neighbors=NearestNeighbors(n_neighbors=2, n_jobs=-1),
    sampling_strategy={0: c0, 1: c1, 2: c2, 3: c3, 4: c4},
    random_state=20,
)
x_train_over, y_train_over = sm.fit_resample(x_train_under, y_train_under)
entry_number_step4 = x_train_over.shape[0]
print('Samples before oversampling: {}'.format(x_train_under_dim))
print('Samples after oversampling: {}'.format(entry_number_step4))
print('Samples added: {}'.format(entry_number_step4-x_train_under_dim))


# Class distribution after oversampling
plt.hist(y_train_over, bins=5)


In [None]:
# Share of the resampled training set relative to training + test samples.
train_perc = round((x_train_over.shape[0]/(x_train_over.shape[0]+X_test.shape[0]))*100, 2)
# Round the complement too: a bare `100-train_perc` is float arithmetic and
# can print values such as 32.989999999999995.
test_perc = round(100 - train_perc, 2)
print('train set dimention: {} ({}%)'.format(x_train_over.shape[0], train_perc))
print('test set dimention: {} ({}%)'.format(X_test.shape[0], test_perc))

# Refit the same model instances on the resampled training data and
# evaluate them again on the untouched test split.
fit_models(models, x_train_over, y_train_over)
evaluate_models(models, X_test, y_test,x_scal,y_disc)

### Model Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

def tune_model(model, param_grid, scoring, x_train, y_train, grid_jobs):
    """Grid-search ``param_grid`` for ``model`` and return the best parameters.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Parameter names mapped to the lists of values to try.
    scoring : str
        Scoring name used to rank the candidates (e.g. ``'accuracy'``).
    x_train : array-like
        Features used for the search.
    y_train : array-like
        Labels used for the search; flattened to one dimension.
    grid_jobs : int
        Forwarded to ``GridSearchCV`` as ``n_jobs``.

    Returns
    -------
    dict
        A copy of the best parameter combination found.
    """
    print('tuning...')
    clf = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, verbose=1, n_jobs=grid_jobs)
    # np.ravel (unlike the ndarray-only .ravel() method) also accepts lists,
    # column vectors and pandas objects.
    clf.fit(x_train, np.ravel(y_train))
    print('done')
    print()
    print("Best: %f using %s" % (clf.best_score_, clf.best_params_))
    # Return a copy so callers cannot mutate the search object's own dict.
    return clf.best_params_.copy()

# Build a smaller set for hyper-parameter tuning: cap each class at
# 1/6, 1/8, 1/10, 1/12 and 1/14 of the training set size (classes 0..4).
c0, c1, c2, c3, c4 = (
    min(np.count_nonzero(y_train == label), int((1/divisor)*y_train.shape[0]))
    for label, divisor in enumerate((6, 8, 10, 12, 14))
)

rus = RandomUnderSampler(
    random_state=20,
    sampling_strategy={0: c0, 1: c1, 2: c2, 3: c3, 4: c4},
)
x_tuning, y_tuning = rus.fit_resample(X_train, y_train)


     

# Raise each class of the tuning subset to at least 1/6, 1/8, 1/10, 1/12 and
# 1/14 of the training set size (classes 0..4).
c0 = max(np.count_nonzero(y_tuning == 0), int((1/6)*y_train.shape[0]))
c1 = max(np.count_nonzero(y_tuning == 1), int((1/8)*y_train.shape[0]))
c2 = max(np.count_nonzero(y_tuning == 2), int((1/10)*y_train.shape[0]))
c3 = max(np.count_nonzero(y_tuning == 3), int((1/12)*y_train.shape[0]))
c4 = max(np.count_nonzero(y_tuning == 4), int((1/14)*y_train.shape[0]))

from sklearn.neighbors import NearestNeighbors

# SMOTE's own `n_jobs` argument is deprecated in imbalanced-learn 0.10; the
# supported way to parallelize the neighbor search is to pass a
# NearestNeighbors estimator. `k_neighbors=1` corresponds to n_neighbors=2,
# because the query sample itself counts as one neighbor.
sm = SMOTE(
    k_neighbors=NearestNeighbors(n_neighbors=2, n_jobs=-1),
    sampling_strategy={0: c0, 1: c1, 2: c2, 3: c3, 4: c4},
    random_state=20,
)
x_tuning, y_tuning = sm.fit_resample(x_tuning, y_tuning)

# Class distribution of the tuning subset
plt.hist(y_tuning, bins=5)

# Search space for the SVC; random_state is a single-value entry so every
# candidate is built with the same seed.
SVM_param_grid = dict(
    C=[0.5, 1],
    kernel=['rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
    decision_function_shape=['ovo', 'ovr'],
    random_state=[20],
)

# Run the same grid twice on the tuning subset: once ranked by accuracy,
# once ranked by macro-F1.
best_accuracy_params, best_f1macro_params = (
    tune_model(SVC(), SVM_param_grid, metric, x_tuning, y_tuning, -1)
    for metric in ('accuracy', 'f1_macro')
)

tuned_models = {
    'SVM_accuracy': SVC(**best_accuracy_params),
    'SVM_f1macro': SVC(**best_f1macro_params),
}

# Train the tuned SVMs on the resampled training set and evaluate them
# like the other models.
fit_models(tuned_models, x_train_over, y_train_over)
evaluate_models(tuned_models, X_test, y_test, x_scal, y_disc)



## Classification

## Regression