# Task 3 Skeleton Code

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
import re
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import scipy.io as sio
plt.rcParams['figure.figsize'] = 10,10

import sklearn.datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Original Data

In [2]:
my_iris_data = sklearn.datasets.load_iris()
print ("my_iris_data.data.shape:",my_iris_data.data.shape)
print ("labels:",my_iris_data.target)

my_iris_data.data.shape: (150, 4)
labels: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [3]:
X_train = np.concatenate((my_iris_data.data[10:50,:],my_iris_data.data[60:100,:], my_iris_data.data[110:150,:]))
X_train = np.concatenate((np.ones((X_train.shape[0],1)),X_train),axis=1) # Append bias term 1
y_train = np.concatenate((my_iris_data.target[10:50],my_iris_data.target[60:100], my_iris_data.target[110:150]))
print ("X_train.shape:", X_train.shape)
print ("y_train.shape:", y_train.shape)

X_test = np.concatenate((my_iris_data.data[40:50,:],my_iris_data.data[90:100,:], my_iris_data.data[140:150,:]))
X_test = np.concatenate((np.ones((X_test.shape[0],1)),X_test),axis=1) # Append bias term 1
y_test = np.concatenate((my_iris_data.target[40:50],my_iris_data.target[90:100], my_iris_data.target[140:150]))
print ("X_test.shape:", X_test.shape)
print ("y_test.shape:", y_test.shape)

X_train.shape: (120, 5)
y_train.shape: (120,)
X_test.shape: (30, 5)
y_test.shape: (30,)


## Preprocess the labels to get 3 datasets

In [4]:
y_train1 = np.copy(y_train); y_test1 = np.copy(y_test)
y_train2 = np.copy(y_train); y_test2 = np.copy(y_test)
y_train3 = np.copy(y_train); y_test3 = np.copy(y_test)

y_train1[y_train == 1] = -1
y_train1[y_train == 2] = -1
y_train1[y_train == 0] = 1
y_test1[y_test == 1] = -1
y_test1[y_test == 2] = -1
y_test1[y_test == 0] = 1

y_train2[y_train == 1] = 1
y_train2[y_train == 2] = -1
y_train2[y_train == 0] = -1
y_test2[y_test == 1] = 1
y_test2[y_test == 2] = -1
y_test2[y_test == 0] = -1

y_train3[y_train == 1] = -1
y_train3[y_train == 2] = 1
y_train3[y_train == 0] = -1
y_test3[y_test == 1] = -1
y_test3[y_test == 2] = 1
y_test3[y_test == 0] = -1

## Task 3a: Off-the-shelf Libraries for OvA and Explicit for Multiclass on iris

### 3a.1 OvA on iris

In [5]:
K=3

rf_1 = RandomForestClassifier(n_estimators=100, max_depth=12)
bds_real1 = rf_1.fit(X_train, y_train1)

rf_2 = RandomForestClassifier(n_estimators=100, max_depth=12)
bds_real2 = rf_2.fit(X_train, y_train2)

rf_3 = RandomForestClassifier(n_estimators=100, max_depth=12)
bds_real3 = rf_3.fit(X_train, y_train3)

### Results on training set

In [6]:
accuracy1 = bds_real1.score(X_train, y_train1)
accuracy2 = bds_real2.score(X_train, y_train2)
accuracy3 = bds_real3.score(X_train, y_train3)

print ("The train accuracy1:", accuracy1*100, "%.")
print ("The train accuracy2:", accuracy2*100, "%.")
print ("The train accuracy3:", accuracy3*100, "%.")

The train accuracy1: 100.0 %.
The train accuracy2: 100.0 %.
The train accuracy3: 100.0 %.


In [7]:
preds = np.array([bds_real1.predict(X_train), bds_real2.predict(X_train), bds_real3.predict(X_train)]).T
pred = np.argmax(preds, axis=1)
print ("The train accuracy:", accuracy_score(pred, y_train)*100, "%.")

The train accuracy: 100.0 %.


### Results on test set

In [8]:
preds = np.array([bds_real1.predict(X_test), bds_real2.predict(X_test), bds_real3.predict(X_test)]).T
pred = np.argmax(preds, axis=1)
print ("The test accuracy:", accuracy_score(pred, y_test)*100, "%.")

The test accuracy: 100.0 %.


### 3a.2 Explicit Multiclass on iris

In [9]:
K=10
rf_real = RandomForestClassifier(n_estimators=100, max_depth=12)
bds_real = rf_real.fit(X_train, y_train)

### Results on training set

In [10]:
print ("The training accuracy:", bds_real.score(X_train, y_train)*100, "%.")

The training accuracy: 100.0 %.


### Results on test set

In [11]:
print ("The test accuracy:",  bds_real.score(X_test, y_test)*100, "%.")

The test accuracy: 100.0 %.


## Task 3b: Off-the-shelf Libraris for OvA and Explicit for Multiclass on DNA

### DNA dataset processing

In [12]:
def read_dna(filename, n_examples, n_features):
    with open(filename) as F:
        labels = np.zeros(n_examples)
        data = np.zeros([n_examples, n_features])

        for i, str_line in enumerate(F.readlines()):
            line0 = list(map(int, filter(None, re.split(r'\s|:1', str_line.strip()))))  #convert into list
            labels[i] = line0.pop(0) - 1

            for j in line0:
                data[i][j-1] += 1.0
        return labels, data

In [13]:
y_train_DNA, X_train_DNA = read_dna("dna.scale.txt",2000,180)
y_test_DNA, X_test_DNA = read_dna("dna.scale.t",1186,180)

print ("X_train_DNA.shape:", X_train_DNA.shape)
print ("y_train_DNA.shape:", y_train_DNA.shape)
print ("X_test_DNA.shape:", X_test_DNA.shape)
print ("y_test_DNA.shape:", y_test_DNA.shape)

X_train_DNA.shape: (2000, 180)
y_train_DNA.shape: (2000,)
X_test_DNA.shape: (1186, 180)
y_test_DNA.shape: (1186,)


### Preprocess the labels to get 3 datasets

In [14]:
y_train1_DNA = np.copy(y_train_DNA); y_test1_DNA = np.copy(y_test_DNA)
y_train2_DNA = np.copy(y_train_DNA); y_test2_DNA = np.copy(y_test_DNA)
y_train3_DNA = np.copy(y_train_DNA); y_test3_DNA = np.copy(y_test_DNA)

y_train1_DNA[y_train_DNA == 1] = -1
y_train1_DNA[y_train_DNA == 2] = -1
y_train1_DNA[y_train_DNA == 0] = 1
y_test1_DNA[y_test_DNA == 1] = -1
y_test1_DNA[y_test_DNA == 2] = -1
y_test1_DNA[y_test_DNA == 0] = 1

y_train2_DNA[y_train_DNA == 1] = 1
y_train2_DNA[y_train_DNA == 2] = -1
y_train2_DNA[y_train_DNA == 0] = -1
y_test2_DNA[y_test_DNA == 1] = 1
y_test2_DNA[y_test_DNA == 2] = -1
y_test2_DNA[y_test_DNA == 0] = -1

y_train3_DNA[y_train_DNA == 1] = -1
y_train3_DNA[y_train_DNA == 2] = 1
y_train3_DNA[y_train_DNA == 0] = -1
y_test3_DNA[y_test_DNA == 1] = -1
y_test3_DNA[y_test_DNA == 2] = 1
y_test3_DNA[y_test_DNA == 0] = -1

### 3b.1 OvA  on DNA

In [26]:
K=100

rf_1 = RandomForestClassifier(n_estimators=100, max_depth=15)
bdt_real1 = rf_1.fit(X_train_DNA, y_train1_DNA)

rf_2 = RandomForestClassifier(n_estimators=100, max_depth=15)
bdt_real2 = rf_2.fit(X_train_DNA, y_train2_DNA)

rf_3 = RandomForestClassifier(n_estimators=100, max_depth=15)
bdt_real3 = rf_3.fit(X_train_DNA, y_train3_DNA)

### Results on training set

In [27]:
accuracy1 = bdt_real1.score(X_train_DNA, y_train1_DNA)
accuracy2 = bdt_real2.score(X_train_DNA, y_train2_DNA)
accuracy3 = bdt_real3.score(X_train_DNA, y_train3_DNA)

print ("The train accuracy1:", accuracy1*100, "%.")
print ("The train accuracy2:", accuracy2*100, "%.")
print ("The train accuracy3:", accuracy3*100, "%.")

The train accuracy1: 100.0 %.
The train accuracy2: 100.0 %.
The train accuracy3: 100.0 %.


In [28]:
preds = np.array([bdt_real1.predict(X_train_DNA), bdt_real2.predict(X_train_DNA), bdt_real3.predict(X_train_DNA)]).T
pred = np.argmax(preds, axis=1)
print ("The training accuracy:", accuracy_score(pred, y_train_DNA)*100, "%.")

The training accuracy: 100.0 %.


### Results on test set

In [29]:
preds = np.array([bdt_real1.predict(X_test_DNA), bdt_real2.predict(X_test_DNA), bdt_real3.predict(X_test_DNA)]).T
pred = np.argmax(preds, axis=1)
print ("The test accuracy:", accuracy_score(pred, y_test_DNA)*100, "%.")

The test accuracy: 93.00168634064082 %.


### 3b.2 Explicit MultiClass on DNA

In [19]:
K = 100
rf_real = RandomForestClassifier(n_estimators=100, max_depth=12)
bdt_real = rf_real.fit(X_train_DNA, y_train_DNA)

### Results on training set

In [20]:
print ("The training accuracy:", bdt_real.score(X_train_DNA, y_train_DNA)*100, "%.")

The training accuracy: 99.9 %.


### Results on test set

In [21]:
print ("The test accuracy:", bdt_real.score(X_test_DNA, y_test_DNA)*100, "%.")

The test accuracy: 94.35075885328837 %.
