# IE 48B Homework 03 - Dorukhan Kılınç 2017402093

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from tslearn.piecewise import PiecewiseAggregateApproximation
import cvxpy as cp
import warnings
warnings.filterwarnings("ignore")

## Task: Comparison of NN Classifiers with Alternative Representations and Distance Measures

The aim of this task is to compare alternative representations and distance measures for classification. To achieve this, 5 different data sets from [http://www.timeseriesclassification.com/](http://www.timeseriesclassification.com/) are taken and nn classifiers with 4 distance measures(manhattan, euclidean, chebyshev, and minkowski distance with p = 5) are applied to 3 different representations (raw, principle components, and piecewise aggregate approximation). 

### Preperation of the Functions

Since we will build the models on 5 different data sets, to avoid excessive coding, it is reasonable to write functions to build the models at first. Code below will take the original train and test data and firstly build models using the representations and distance measures mentioned above. Then, each model will be evaluated based on their repeated cross-validation score. In this task, 10-fold cross-validation with 5 repeats is used. Finally, based on these cross-validation scores, for each representation, models with the best scores will be built to make predictions on the test data and their accuracy scores will be compared. 

In [5]:
#Get mean cross validation score for each model
def cv(model, X, y):
    
    rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=0)
    scores = []
    
    
    for train_index, test_index in rskf.split(X, y):
    
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        scores.append(accuracy_score(y_test, model.predict(X_test)))
    
    return np.mean(scores)

In [6]:
#get pca representations for train and test set.
#selection of number of components to use is automatized such that
#the minimum number of components that are able to explain 90% of the variance is chosen.  
def PCA_representation(X_train, X_test):
    pca = PCA()
    components_train = pca.fit_transform(X_train)
    n_components = None
    for n_components in range(1,X_train.shape[1]):
        if sum(pca.explained_variance_ratio_[:2]) > 90: break
    components_train = components_train[:, :n_components]
    components_test = pca.transform(X_test)[:, :n_components]
    return(components_train, components_test)

In [7]:
#paa representation such that each segment represents 10 observations
def PAA_representation(X_train, X_test):
    n_paa_segments = int(X_train.shape[1] / 10)
    paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
    shape_train = (len(X_train), n_paa_segments)
    shape_test = (len(X_test), n_paa_segments)
    return (paa.fit_transform(X_train).reshape(shape_train),
            paa.fit_transform(X_test).reshape(shape_test))

In [8]:
#apply NN classifiers with k = 1,3,5 to each representation using the distance measures
k_vals = [1, 3, 5]

distance_measures = ["manhattan","euclidean","chebyshev", "p = 5"]

def model_representation(X_rep, y):
    scores = []

    for k in k_vals:
        score = []
        for measure in distance_measures:
            model = None
            if measure == "p = 5": model = KNeighborsClassifier(n_neighbors = k, p = 5)
            else: model = KNeighborsClassifier(n_neighbors = k, metric = measure)
            score.append(cv(model, X_rep, y))
    
        scores.append(score)
        
    return np.array(scores)

In [9]:
#perform cv to select the best parameters and then make predicions on the test data
representations = ["raw", "paa", "pca"]

def print_results(X_train, y_train, X_test, y_test):
    #representations for X_train and X_test
    train, test = (None, None)
    for representation in representations:
        
        if representation == "raw": 
            train, test = (X_train, X_test)
        elif representation == "paa":
            train, test = PAA_representation(X_train, X_test)
        else: train, test = PCA_representation(X_train, X_test)
        
        #Calculate cv scores and find the best parameter set
        scores = model_representation(train, y_train)
        best_ind = np.unravel_index(np.argmax(scores, axis=None), scores.shape)
        
        best_k = best_ind[0]
        best_k = k_vals[best_k]
        
        best_measure = best_ind[1]
        best_measure = distance_measures[best_measure]
        
        #build the model using the best parameter set
        model = None
        if best_measure == "p = 5": model = KNeighborsClassifier(n_neighbors = best_k, p = 5)
        else: model = KNeighborsClassifier(n_neighbors = best_k, metric = best_measure)  
        
        model.fit(train, y_train)
        y_pred = model.predict(test)
        
        row = {"representation": representation, "cv mean accuracy": scores[best_ind],
              "best k parameter": best_k, "best distance measure": best_measure,
              "test accuracy": accuracy_score(y_test, y_pred)}
        print(row)


## Dataset 1: Earthquakes

The earthquake classification problem involves predicting whether a major event is about to occur based on the most recent readings in the surrounding area. The data is taken from Northern California Earthquake Data Center and each data is an averaged reading for one hour, with the first reading taken on Dec 1st 1967, the last in 2003.

In [10]:
#Read the data and prepare train and test matrices 
earthquakes_train = []
for line in open("Earthquakes_TRAIN.txt"):
    earthquakes_train.append(line.split())

earthquakes_train = np.array(earthquakes_train, dtype = "float")

earthquakes_test = []
for line in open("Earthquakes_TEST.txt"):
    earthquakes_test.append(line.split())

earthquakes_test = np.array(earthquakes_test, dtype = "float")

In [11]:
X1_train = earthquakes_train[:,1:]
X1_test = earthquakes_test[:,1:]

y1_train = earthquakes_train[:,0]
y1_test = earthquakes_test[:,0]

In [12]:
print_results(X1_train, y1_train, X1_test, y1_test)

{'representation': 'raw', 'cv mean accuracy': 0.796875, 'best k parameter': 5, 'best distance measure': 'euclidean', 'test accuracy': 0.7266187050359713}
{'representation': 'paa', 'cv mean accuracy': 0.8043371212121212, 'best k parameter': 5, 'best distance measure': 'manhattan', 'test accuracy': 0.7410071942446043}
{'representation': 'pca', 'cv mean accuracy': 0.8105871212121213, 'best k parameter': 5, 'best distance measure': 'manhattan', 'test accuracy': 0.7697841726618705}


Here above is the best model parameters for each representation and corresponding test accuracies. The best model is the last one with test accuracy near 0.77, with pca represenation, k = 5 and manhattan distance as the distance measure.   

## Dataset 2: ECG200

This dataset was formatted by R. Olszewski as part of his thesis “Generalized feature extraction for structural	pattern recognition in time-series data,” at Carnegie Mellon University, 2001. Each series traces the electrical activity recorded during one
heartbeat. The two classes are a normal heartbeat and a Myocardial Infarction.

In [13]:
ecg_train = []
for line in open("ECG200_TRAIN.txt"):
    ecg_train.append(line.split())

ecg_train = np.array(ecg_train, dtype = "float")

ecg_test = []
for line in open("ECG200_TEST.txt"):
    ecg_test.append(line.split())

ecg_test = np.array(ecg_test, dtype = "float")

In [14]:
X2_train = ecg_train[:,1:]
X2_test = ecg_test[:,1:]

y2_train = ecg_train[:,0]
y2_test = ecg_test[:,0]

In [15]:
print_results(X2_train, y2_train, X2_test, y2_test)

{'representation': 'raw', 'cv mean accuracy': 0.91, 'best k parameter': 3, 'best distance measure': 'euclidean', 'test accuracy': 0.9}
{'representation': 'paa', 'cv mean accuracy': 0.866, 'best k parameter': 1, 'best distance measure': 'p = 5', 'test accuracy': 0.87}
{'representation': 'pca', 'cv mean accuracy': 0.91, 'best k parameter': 3, 'best distance measure': 'euclidean', 'test accuracy': 0.9}


Here above are the best model parameters for each representation and corresponding test accuracies. In this case, there is a tie between raw and pca representations. 

## Dataset 3: MixedShapesSmallTrain

The data consist of "pseudo" time series obtained by converting two-dimensional shapes into one-dimensional time series from Wang, Xiaoyue, et al. "Annotating historical archives of images." Proceedings of the 8th ACM/IEEE-CS joint conference on Digital libraries. ACM, 2008 and Keogh, Eamonn, et al. "LB_Keogh supports exact indexing of shapes under rotation invariance with arbitrary representations and distance measures." Proceedings of the 32nd international conference on Very large data bases. VLDB Endowment, 2006.

There are five classes corresponding to different shapes.

- Class 1: Arrowhead
- Class 2: Butterfly
- Class 3: Fish
- Class 4: Seashell
- Class 5: Shield

There are two datasets out of these data. The two datasets share a same test set and only differ in the number of training instances. 


In [16]:
shapes_train = []
for line in open("MixedShapesSmallTrain_TRAIN.txt"):
    shapes_train.append(line.split())

shapes_train = np.array(shapes_train, dtype = "float")

shapes_test = []
for line in open("MixedShapesSmallTrain_TEST.txt"):
    shapes_test.append(line.split())

shapes_test = np.array(shapes_test, dtype = "float")

In [17]:
X3_train = shapes_train[:,1:]
X3_test = shapes_test[:,1:]

y3_train = shapes_train[:,0]
y3_test = shapes_test[:,0]

In [18]:
print_results(X3_train, y3_train, X3_test, y3_test)

{'representation': 'raw', 'cv mean accuracy': 0.8460000000000002, 'best k parameter': 1, 'best distance measure': 'p = 5', 'test accuracy': 0.8255670103092784}
{'representation': 'paa', 'cv mean accuracy': 0.8460000000000002, 'best k parameter': 1, 'best distance measure': 'p = 5', 'test accuracy': 0.8263917525773196}
{'representation': 'pca', 'cv mean accuracy': 0.836, 'best k parameter': 1, 'best distance measure': 'euclidean', 'test accuracy': 0.8354639175257732}


Here above are the best model parameters for each representation and corresponding test accuracies. In this case, the best one is the pca representation with k = 1 and euclidian metric as the distance measure.

## Dataset 4: Plane

A data set of plane outlines

In [19]:
plane_train = []
for line in open("Plane_TRAIN.txt"):
    plane_train.append(line.split())

plane_train = np.array(plane_train, dtype = "float")

plane_test = []
for line in open("Plane_TEST.txt"):
    plane_test.append(line.split())

plane_test = np.array(plane_test, dtype = "float")

In [20]:
X4_train = plane_train[:,1:]
X4_test = plane_test[:,1:]

y4_train = plane_train[:,0]
y4_test = plane_test[:,0]

In [21]:
print_results(X4_train, y4_train, X4_test, y4_test)

{'representation': 'raw', 'cv mean accuracy': 0.9814545454545456, 'best k parameter': 3, 'best distance measure': 'manhattan', 'test accuracy': 0.9619047619047619}
{'representation': 'paa', 'cv mean accuracy': 0.9740000000000001, 'best k parameter': 1, 'best distance measure': 'manhattan', 'test accuracy': 0.9619047619047619}
{'representation': 'pca', 'cv mean accuracy': 0.9814545454545456, 'best k parameter': 3, 'best distance measure': 'euclidean', 'test accuracy': 0.9619047619047619}


Here above are the best model parameters for each representation and corresponding test accuracies. Notice that there is a tie between all the representations.

## Dataset 5: GunPointOldVersusYoung

This dataset is a remake of the famous GunPoint dataset released in 2003. We strive to mimic in every aspect the recording of the original GunPoint. The actors include one male and one female. They are the same actors who created the original GunPoint.

We record two scenarios, Gun and Point (also known as Gun and NoGun). In each scenario, the actors aim at a eye-level target. The difference between Gun and Point is that for the Gun scenario, the actors hold a gun, and in the Point scenario, the actors point with just their fingers. A complete Gun action involves the actor moves hand from an initial rest position, points the gun at target, puts gun back to waist holster and then brings free hand to the initial rest position. Each complete action conforms to a five-second cycle. With 30fps, this translates into 150 frames per action. We extract the centroid of the hand from each frame and use its x-axis coordinate to form a time series.

In [22]:
gun_train = []
for line in open("GunPointOldVersusYoung_TRAIN.txt"):
    gun_train.append(line.split())

gun_train = np.array(gun_train, dtype = "float")

gun_test = []
for line in open("GunPointOldVersusYoung_TEST.txt"):
    gun_test.append(line.split())

gun_test = np.array(gun_test, dtype = "float")

In [23]:
X5_train = plane_train[:,1:]
X5_test = plane_test[:,1:]

y5_train = plane_train[:,0]
y5_test = plane_test[:,0]

In [24]:
print_results(X5_train, y5_train, X5_test, y5_test)

{'representation': 'raw', 'cv mean accuracy': 0.9814545454545456, 'best k parameter': 3, 'best distance measure': 'manhattan', 'test accuracy': 0.9619047619047619}
{'representation': 'paa', 'cv mean accuracy': 0.9740000000000001, 'best k parameter': 1, 'best distance measure': 'manhattan', 'test accuracy': 0.9619047619047619}
{'representation': 'pca', 'cv mean accuracy': 0.9814545454545456, 'best k parameter': 3, 'best distance measure': 'euclidean', 'test accuracy': 0.9619047619047619}


Here above are the best model parameters for each representation and corresponding test accuracies. Notice that there is a tie between all the representations.