## ML project 2

In [None]:
from collections import Counter
import pandas as pd
import numpy as np
import IPython.display
import importlib
import src.utils
from src.utils.preprocessing import import_data, standardize, dummy_code
from src.utils.model_helpers import roc_w_cross_val
# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Preprocessing
from imblearn.over_sampling import SMOTE
# ML models
from sklearn.metrics import plot_roc_curve, auc, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as Lda
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
%load_ext autoreload
%autoreload 1

In [None]:
# dir(src.utils.preprocessing)

In [None]:
# Relative path of the data folder; passed to import_data when loading.
DATA_PATH = "../../data"

## Load Data

In [None]:
# Load data
# Labels: # 1 - Wet Cough # 0 - Dry Cough
coarse_data, coarse_labels = import_data(DATA_PATH, segmentation_type = 'coarse', is_user_features=False)
fine_data, fine_labels = import_data(DATA_PATH, segmentation_type = 'fine', is_user_features=False)
no_data, no_labels = import_data(DATA_PATH, segmentation_type = 'no', is_user_features=False)

# display(coarse_data.head(2), fine_data.head(2), no_data.head(2))
# display(coarse_labels.head(2), fine_labels.head(2), no_labels.head(2))



3. Deal with missing metadata appropriately

2. Perform exploratory data analysis and feature engineering (e.g. examine the effects of normalizing features, recursive feature elimination)

4
2. Make sure that data from a single subject does not end up in both the training and validation groups


## EDA

In [None]:
# see data_exploration.ipynb

## Data Preprocessing

### Normalization

In [None]:
# Standardize the tables that were loaded with is_user_features=False.
# NOTE(review): the un-segmented table is passed 1 as second argument while the
# segmented tables are passed 0 — confirm the meaning against
# src.utils.preprocessing.standardize.
# NOTE(review): X_coarse / X_fine / X_no are not read by any later cell visible
# in this notebook — verify whether the models were meant to use them.
X_coarse, X_fine, X_no = (
    standardize(coarse_data, 0),
    standardize(fine_data, 0),
    standardize(no_data, 1),
)

### Inspect class imbalance

In [None]:
# Share of label-1 (wet cough) rows per data set, printed in the order:
# no segmentation, coarse segmentation, fine segmentation.
print(*(
    labels['Label'].sum() / labels['Label'].shape[0]
    for labels in (no_labels, coarse_labels, fine_labels)
))

In [None]:
# Counter the class imbalance with SMOTE (Synthetic Minority Over-sampling Technique).
# TODO: resample the training split only, once the train/test split exists, so
# that synthetic samples never end up in the evaluation data.
#
# NOTE(review): the original cell read `fn` and `ln` without defining them in
# any cell of this notebook, so it raised NameError on a fresh kernel. They are
# bound here to the standardized un-segmented features and the matching 'Label'
# column — confirm that this is the intended input (assumes X_no is a numeric
# DataFrame).
fn, ln = X_no, no_labels[['Label']]

oversample = SMOTE(random_state=42)
fn_oversampled, ln_oversampled = oversample.fit_resample(fn, ln)

# Re-wrap the resampled arrays so the original column names are kept.
fn_oversampled = pd.DataFrame(fn_oversampled, columns=fn.columns)
ln_oversampled = pd.DataFrame(ln_oversampled, columns=ln.columns)

# Visual and numeric check of the class counts after resampling.
sns.countplot(x='Label', data=ln_oversampled)
print('Resampled dataset shape %s' % Counter(ln_oversampled["Label"]))

### Inspect Missing Data

## Feature Engineering

### Find interesting features

In [None]:
# Show the column labels of the un-segmented table to pick feature groups from.
no_data.columns

In [None]:
# Candidate feature groups (lists of column names).
# possibly nice features:

# c0: the EEPD columns in steps of 50 (EEPD50_100 ... EEPD950_1000) plus the cough length
c0 = ['EEPD{}_{}'.format(lower, lower + 50) for lower in range(50, 1000, 50)]
c0.append('Cough_Length')

# c1: zero-crossing rate, RMS power, dominant frequency and the Spectral_* columns
c1 = [
    'Zero_Crossing_Rate',
    'RMS_Power',
    'Dominant_Freq',
    'Spectral_Centroid',
    'Spectral_Rolloff',
    'Spectral_Spread',
    'Spectral_Skewness',
    'Spectral_Kurtosis',
    'Spectral_Bandwidth',
    'Spectral_Flatness',
    'Spectral_StDev',
    'Spectral_Slope',
    'Spectral_Decrease',
]

# c2: MFCC_mean0..MFCC_mean12 followed by MFCC_std0..MFCC_std12
c2 = [
    'MFCC_{}{}'.format(stat, idx)
    for stat in ('mean', 'std')
    for idx in range(13)
]

# c3: crest factor, cough length and SNR
c3 = ['Crest_Factor', 'Cough_Length', 'SNR']

# c5: the PSD band columns
c5 = [
    'PSD_225-425',
    'PSD_450-550',
    'PSD_1325-1600',
    'PSD_1600-2000',
    'PSD_2500-2900',
    'PSD_3100-3700',
]

# c6: c3 and c5 combined; noted by the authors as the best set for the
# regression model (the original note said "Linear Regression", while the
# notebook fits LogisticRegression — confirm which was meant).
c6 = c3 + c5

# best columns without expert features
# c: File_Name followed by every column of c6
c = ['File_Name'] + c6
# cc: same as c, but without the SNR column
cc = [column for column in c if column != 'SNR']

## Train and Test data

### Choose features

In [None]:
# Load the three data sets again, this time with is_user_features=True, so that
# a different set of columns can be selected below.
# Note: this rebinds the *_data / *_labels names assigned by the earlier load cell.
coarse_data, coarse_labels = import_data(
    DATA_PATH,
    segmentation_type='coarse',
    is_user_features=True,
)
fine_data, fine_labels = import_data(
    DATA_PATH,
    segmentation_type='fine',
    is_user_features=True,
)
no_data, no_labels = import_data(
    DATA_PATH,
    segmentation_type='no',
    is_user_features=True,
)

In [None]:
# Keep only the hand-picked columns: list `c` for the un-segmented table,
# list `cc` for the coarse and fine tables.
no_data, coarse_data, fine_data = (
    no_data[c],
    coarse_data[cc],
    fine_data[cc],
)

In [None]:
# Seed handed to the coarse and fine splits (the un-segmented call does not receive it).
random_state = 1

# NOTE(review): `train_test` is not imported in any cell shown in this notebook —
# confirm where it lives (presumably under src.utils) and import it in the
# top import cell.

# Un-segmented data: split with segmentation=False
(no_data_X_tr, no_data_X_te,
 no_labels_y_tr, no_labels_y_te) = train_test(
    no_data, no_labels, segmentation=False)

# Coarse segmentation: seeded split
(coarse_data_X_tr, coarse_data_X_te,
 coarse_labels_y_tr, coarse_labels_y_te) = train_test(
    coarse_data, coarse_labels, random_state=random_state)

# Fine segmentation: seeded split
(fine_data_X_tr, fine_data_X_te,
 fine_labels_y_tr, fine_labels_y_te) = train_test(
    fine_data, fine_labels, random_state=random_state)


## Methods

### Logistic regression

In [None]:
# Fit a default logistic regression on the un-segmented training split and draw
# its ROC curve on the held-out split.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2 (successor: RocCurveDisplay.from_estimator) — confirm the pinned version.
logistic = LogisticRegression().fit(no_data_X_tr, no_labels_y_tr['Label'])

fig, ax = plt.subplots()
plot_roc_curve(
    logistic,
    no_data_X_te,
    no_labels_y_te['Label'],
    ax=ax,
    alpha=0.3,
    lw=1,
)
plt.show()

In [None]:
# Run the project helper roc_w_cross_val with a default logistic regression on
# the un-segmented training split. Author's recorded result: AUC 62.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    LogisticRegression(),
)

### SVM

In [None]:
# Linear-kernel support vector classifier, trained on the un-segmented training split
clf = SVC(kernel='linear').fit(no_data_X_tr, no_labels_y_tr['Label'])

# Class predictions for the held-out split
y_pred = clf.predict(no_data_X_te)

In [None]:
# Run the project helper roc_w_cross_val with a linear-kernel SVC on the
# un-segmented training split.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    SVC(kernel='linear'),
)

### Linear Discriminant Analysis

In [None]:
# Fit a linear discriminant analysis model with default settings on the
# un-segmented training split.
# (A pasted estimator repr that built and discarded a second Lda object was
# removed here; it had no effect on `clf`.)
clf = Lda()
clf.fit(no_data_X_tr, no_labels_y_tr['Label'])

# Run the project helper roc_w_cross_val with a fresh default LDA estimator.
roc_w_cross_val(no_data_X_tr, no_labels_y_tr, Lda())

### k Nearest Neighbour

In [None]:
# Run the project helper roc_w_cross_val with a 16-neighbour k-NN classifier on
# the un-segmented training split.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    KNeighborsClassifier(n_neighbors=16),
)

### Gaussian Naive Bayes

In [None]:
# Run the project helper roc_w_cross_val with a Gaussian naive Bayes classifier
# on the un-segmented training split.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    GaussianNB(),
)

### Decision Tree

In [None]:
# Run the project helper roc_w_cross_val with a seeded decision tree on the
# un-segmented training split.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    DecisionTreeClassifier(random_state=0),
)

### RandomForest

In [None]:
# Run the project helper roc_w_cross_val with a seeded random forest
# (max_depth=7) on the un-segmented training split.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    RandomForestClassifier(max_depth=7, random_state=0),
)

### Gradient Boosting

In [None]:
# Run the project helper roc_w_cross_val with scikit-learn's seeded
# GradientBoostingClassifier on the un-segmented training split.
roc_w_cross_val(
    no_data_X_tr,
    no_labels_y_tr,
    GradientBoostingClassifier(random_state=0),
)