# EE 559 - Homework 6

## Name: Aristotelis-Angelos Papadopoulos
## USC ID: 3804-2945-23

In [235]:
# Question a

import pandas as pd
from sklearn.model_selection import train_test_split

# Create a dataframe with the dataset
dataset = pd.read_csv('Frogs_MFCCs.csv', sep = ",", header = 'infer')

# Take 70% of the data for training and the rest for test
train_set, test_set = train_test_split(dataset, test_size=0.3)

**Question b -> i**

The Exact Match ratio and the Hamming loss methods for evaluating multi-label 
classification problems are well presented in the paper "A Literature Survey on Algorithms for Multi-label
Learning" by Mohammad S. Sorower which can be found [here](https://www.researchgate.net/profile/Mohammad_Sorower/publication/266888594_A_Literature_Survey_on_Algorithms_for_Multi-label_Learning/links/58d1864392851cf4f8f4b72a/A-Literature-Survey-on-Algorithms-for-Multi-label-Learning.pdf).

In [236]:
# Question b -> ii

from sklearn.svm import SVC

# As the problem suggests, we will train a classifier for each label.
# So, let us first extract the columns corresponding to these 3 labels.
family_train = train_set['Family'] # Label 1
genus_train = train_set['Genus'] # Label 2
species_train = train_set['Species'] # Label 3
features_train = train_set.iloc[:,0:22] 

family_test = test_set['Family'] # Label 1
genus_test = test_set['Genus'] # Label 2
species_test = test_set['Species'] # Label 3
features_test = test_set.iloc[:,0:22] 

In [240]:
# Question b -> ii (Continue)

# In this cell, we will train a SVM for the label "Family".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the parameters C and gamma. Since, I did not have a problem
# with the computational power, I created a grid of 400 points and picked
# the values that resulted in the lowest CV error.

import numpy as np
from sklearn.model_selection import StratifiedKFold

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []


C_range = np.logspace(-3, 6, num=10) # C ranges from 10^-3 to 10^6 with a log increment 
gamma_range = np.linspace(0.1, 4, 40) # gamma ranges from 0.1 to 4 with linear increment


for C1 in C_range:
    for gamm in gamma_range: 
        # I will use 10-fold cross validation and I will shuffle
        # the data before each split!
        skf = StratifiedKFold(n_splits=10, shuffle=True) 
        skf.get_n_splits(features_train)
        # Initialize a list in order to get the errors from 10-fold CV
        list_10fold = []
        for train_index, test_index in skf.split(features_train,family_train):
            X_train, X_test = features_train.values[train_index], features_train.values[test_index]
            y_train, y_test = family_train.values[train_index], family_train.values[test_index]
            
            # Now, I will fit my SVM model in the k-1 folds!
            classif = SVC(C=C1, kernel='rbf', gamma=gamm, decision_function_shape='ovr')
            classif.fit(X_train, y_train)
            list_10fold.append(1 - classif.score(X_test, y_test))
        # Calculate the average error of list_10fold and save it in ave_CV_errors list
        ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(10,40) # Indexes are (C, gamma)
# Take the indexes of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 10 different values of C
# and the 40 different values of gamma!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The best gamma is gamma=", gamma_range[ind[1]])
print("The CV error for those values is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best values 
# calculated from the 10-fold cross validation!
classif = SVC(C=C_range[ind[0]], kernel='rbf', gamma=gamma_range[ind[1]], decision_function_shape='ovr')
classif.fit(features_train, family_train)

print("For the label 'Family', we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, family_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Family' is: ", classif.score(features_test, family_test)*100, "%\n")

The minimum CV error happens for : C = 10000.0
The best gamma is gamma= 2.3
The CV error for those values is:  0.0057587362197410565
For the label 'Family', we have: 

The accuracy of the model on the training set is:  100.0 %
The cross-validation accuracy of the model is:  99.42412637802589 %
The test accuracy for the label 'Family' is:  99.21259842519686 %



In [244]:
# Question b -> ii (Continue)

# In this cell, we will train a SVM for the label "Genus".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the parameters C and gamma. Since, I did not have a problem
# with the computational power, I created a grid of 400 points and picked
# the values that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []


C_range = np.logspace(-3, 6, num=10) # C ranges from 10^-3 to 10^6 with a log increment 
gamma_range = np.linspace(0.1, 4, 40) # gamma ranges from 0.1 to 4 with linear increment


for C1 in C_range:
    for gamm in gamma_range: 
        # I will use 10-fold cross validation and I will shuffle
        # the data before each split!
        skf = StratifiedKFold(n_splits=10, shuffle=True) 
        skf.get_n_splits(features_train)
        # Initialize a list in order to get the errors from 10-fold CV
        list_10fold = []
        for train_index, test_index in skf.split(features_train,genus_train):
            X_train, X_test = features_train.values[train_index], features_train.values[test_index]
            y_train, y_test = genus_train.values[train_index], genus_train.values[test_index]
            
            # Now, I will fit my SVM model in the k-1 folds!
            classif = SVC(C=C1, kernel='rbf', gamma=gamm, decision_function_shape='ovr')
            classif.fit(X_train, y_train)
            list_10fold.append(1 - classif.score(X_test, y_test))
        # Calculate the average error of list_10fold and save it in ave_CV_errors list
        ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(10,40) # Indexes are (C, gamma)
# Take the indexes of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 10 different values of C
# and the 40 different values of gamma!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The best gamma is gamma=", gamma_range[ind[1]])
print("The CV error for those values is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best values 
# calculated from the 10-fold cross validation!
classif = SVC(C=C_range[ind[0]], kernel='rbf', gamma=gamma_range[ind[1]], decision_function_shape='ovr')
classif.fit(features_train, genus_train)


print("For the label 'Genus', we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, genus_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Genus' is: ", classif.score(features_test, genus_test)*100, "%\n")

The minimum CV error happens for : C = 10000.0
The best gamma is gamma= 2.1999999999999997
The CV error for those values is:  0.008136772350812272
For the label 'Genus', we have: 

The accuracy of the model on the training set is:  100.0 %
The cross-validation accuracy of the model is:  99.18632276491877 %
The test accuracy for the label 'Genus' is:  98.98100972672533 %



In [246]:
# Question b -> ii (Continue)

# In this cell, we will train a SVM for the label "Species".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the parameters C and gamma. Since, I did not have a problem
# with the computational power, I created a grid of 400 points and picked
# the values that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []


C_range = np.logspace(-3, 6, num=10) # C ranges from 10^-3 to 10^6 with a log increment 
gamma_range = np.linspace(0.1, 4, 40) # gamma ranges from 0.1 to 4 with linear increment


for C1 in C_range:
    for gamm in gamma_range: 
        # I will use 10-fold cross validation and I will shuffle
        # the data before each split!
        skf = StratifiedKFold(n_splits=10, shuffle=True) 
        skf.get_n_splits(features_train)
        # Initialize a list in order to get the errors from 10-fold CV
        list_10fold = []
        for train_index, test_index in skf.split(features_train,species_train):
            X_train, X_test = features_train.values[train_index], features_train.values[test_index]
            y_train, y_test = species_train.values[train_index], species_train.values[test_index]
            
            # Now, I will fit my SVM model in the k-1 folds!
            classif = SVC(C=C1, kernel='rbf', gamma=gamm, decision_function_shape='ovr')
            classif.fit(X_train, y_train)
            list_10fold.append(1 - classif.score(X_test, y_test))
        # Calculate the average error of list_10fold and save it in ave_CV_errors list
        ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(10,40) # Indexes are (C, gamma)
# Take the indexes of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 10 different values of C
# and the 40 different values of gamma!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The best gamma is gamma=", gamma_range[ind[1]])
print("The CV error for those values is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best values 
# calculated from the 10-fold cross validation!
classif = SVC(C=C_range[ind[0]], kernel='rbf', gamma=gamma_range[ind[1]], decision_function_shape='ovr')
classif.fit(features_train, species_train)


print("For the label 'Species', we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, species_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Species' is: ", classif.score(features_test, species_test)*100, "%\n")

The minimum CV error happens for : C = 100.0
The best gamma is gamma= 2.3
The CV error for those values is:  0.008339586823110168
For the label 'Species', we have: 

The accuracy of the model on the training set is:  100.0 %
The cross-validation accuracy of the model is:  99.16604131768898 %
The test accuracy for the label 'Species' is:  98.98100972672533 %



In [247]:
# Question b -> ii (Continue)

# Classifier for label 'Family'
classif1 = SVC(C=10000, decision_function_shape='ovr', gamma=2.3, kernel='rbf')
classif1.fit(features_train, family_train)

# Classifier for label 'Genus'
classif2 = SVC(C=10000, decision_function_shape='ovr', gamma=2.2, kernel='rbf')
classif2.fit(features_train, genus_train)

# Classifier for label 'Species'
classif3 = SVC(C=100, decision_function_shape='ovr', gamma=2.3, kernel='rbf')
classif3.fit(features_train, species_train)

# Now, I will concatenate the 3 predicted labels into 1 numpy array
y_pred = np.concatenate((np.expand_dims(classif1.predict(features_test), axis=1), 
                         np.expand_dims(classif2.predict(features_test), axis=1),
                         np.expand_dims(classif3.predict(features_test), axis=1)), axis=1)

# Now, I will convert the true labels into a numpy array
y_true = np.concatenate((np.expand_dims(np.asarray(family_test), axis=1), 
                         np.expand_dims(np.asarray(genus_test), axis=1),
                         np.expand_dims(np.asarray(species_test), axis=1)), axis=1)

# Calculate the Exact Match score and the Hamming Loss
EMscore = 0
count_Ham = 0
for i in range(0,y_true.shape[0]):
    count_EM = 0
    for j in range(0,y_true.shape[1]):
        if y_true[i][j]==y_pred[i][j]:
            count_EM += 1
        else:
            count_Ham += 1
    if count_EM==3:
        EMscore += 1
EMscore = EMscore / y_true.shape[0]
Hamming_Loss = count_Ham / (y_true.shape[0] * y_true.shape[1])
            
print("So, using Binary Relevance with SVM with Gaussian Kernel, we have:","\n")
print("The Exact Match score is:", EMscore)
print("The Hamming Loss is:", Hamming_Loss)

So, using Binary Relevance with SVM with Gaussian Kernel, we have: 

The Exact Match score is: 0.9856415006947661
The Hamming Loss is: 0.00941794040450826


In [252]:
# Question b -> iii

# In this question, we will apply the L1-penalized SVM algorithm.
# Note that we will not normalize the attributes since they were
# already normalized!

from sklearn.svm import LinearSVC

# In this cell, we will train a SVM for the label "Family".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the L1 regularization coefficient. 
# Since I did not have a problem with the computational power, I created a grid 
# of 100 points and picked the value that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []

C_range = np.logspace(-3, 6, num=100) # C ranges from 10^-3 to 10^6 with a log increment 

for C1 in C_range: 
    # I will use 10-fold cross validation and I will shuffle
    # the data before each split!
    skf = StratifiedKFold(n_splits=10, shuffle=True) 
    skf.get_n_splits(features_train)
    # Initialize a list in order to get the errors from 10-fold CV
    list_10fold = []
    for train_index, test_index in skf.split(features_train,family_train):
        X_train, X_test = features_train.values[train_index], features_train.values[test_index]
        y_train, y_test = family_train.values[train_index], family_train.values[test_index]
            
        # Now, I will fit my SVM model in the k-1 folds!
        classif = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=C1, multi_class='ovr')
        classif.fit(X_train, y_train)
        list_10fold.append(1 - classif.score(X_test, y_test))
    # Calculate the average error of list_10fold and save it in ave_CV_errors list
    ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(100,1) # Indexes are (C)
# Take the index of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 100 different values of C!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The CV error for this value is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best value 
# calculated from the 10-fold cross validation!
classif = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=C_range[ind[0]], multi_class='ovr')
classif.fit(features_train, family_train)


print("For the label 'Family', we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, family_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Family' is: ", classif.score(features_test, family_test)*100, "%\n")

The minimum CV error happens for : C = 2.848035868435802
The CV error for this value is:  0.061545746729767446
For the label 'Family', we have: 

The accuracy of the model on the training set is:  93.86417791898332 %
The cross-validation accuracy of the model is:  93.84542532702326 %
The test accuracy for the label 'Family' is:  93.42288096340899 %



In [253]:
# Question b -> iii (Continue)

# In this cell, we will train a SVM for the label "Genus".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the L1 regularization coefficient. 
# Since I did not have a problem with the computational power, I created a grid 
# of 100 points and picked the value that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []

C_range = np.logspace(-3, 6, num=100) # C ranges from 10^-3 to 10^6 with a log increment 

for C1 in C_range: 
    # I will use 10-fold cross validation and I will shuffle
    # the data before each split!
    skf = StratifiedKFold(n_splits=10, shuffle=True) 
    skf.get_n_splits(features_train)
    # Initialize a list in order to get the errors from 10-fold CV
    list_10fold = []
    for train_index, test_index in skf.split(features_train,genus_train):
        X_train, X_test = features_train.values[train_index], features_train.values[test_index]
        y_train, y_test = genus_train.values[train_index], genus_train.values[test_index]
            
        # Now, I will fit my SVM model in the k-1 folds!
        classif = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=C1, multi_class='ovr')
        classif.fit(X_train, y_train)
        list_10fold.append(1 - classif.score(X_test, y_test))
    # Calculate the average error of list_10fold and save it in ave_CV_errors list
    ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(100,1) # Indexes are (C)
# Take the index of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 100 different values of C!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The CV error for this value is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best value 
# calculated from the 10-fold cross validation!
classif = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=C_range[ind[0]], multi_class='ovr')
classif.fit(features_train, genus_train)


print("For the label 'Genus', we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, genus_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Genus' is: ", classif.score(features_test, genus_test)*100, "%\n")

The minimum CV error happens for : C = 18738.174228603868
The CV error for this value is:  0.04605887310243938
For the label 'Genus', we have: 

The accuracy of the model on the training set is:  95.59173947577443 %
The cross-validation accuracy of the model is:  95.39411268975606 %
The test accuracy for the label 'Genus' is:  95.83140342751274 %



In [254]:
# Question b -> iii (Continue)

# In this cell, we will train a SVM for the label "Species".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the L1 regularization coefficient. 
# Since I did not have a problem with the computational power, I created a grid 
# of 100 points and picked the value that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []

C_range = np.logspace(-3, 6, num=100) # C ranges from 10^-3 to 10^6 with a log increment 

for C1 in C_range: 
    # I will use 10-fold cross validation and I will shuffle
    # the data before each split!
    skf = StratifiedKFold(n_splits=10, shuffle=True) 
    skf.get_n_splits(features_train)
    # Initialize a list in order to get the errors from 10-fold CV
    list_10fold = []
    for train_index, test_index in skf.split(features_train,species_train):
        X_train, X_test = features_train.values[train_index], features_train.values[test_index]
        y_train, y_test = species_train.values[train_index], species_train.values[test_index]
            
        # Now, I will fit my SVM model in the k-1 folds!
        classif = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=C1, multi_class='ovr')
        classif.fit(X_train, y_train)
        list_10fold.append(1 - classif.score(X_test, y_test))
    # Calculate the average error of list_10fold and save it in ave_CV_errors list
    ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(100,1) # Indexes are (C)
# Take the index of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 100 different values of C!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The CV error for this value is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best value 
# calculated from the 10-fold cross validation!
classif = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=C_range[ind[0]], multi_class='ovr')
classif.fit(features_train, species_train)


print("For the label 'Species', we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, species_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Species' is: ", classif.score(features_test, species_test)*100, "%\n")

The minimum CV error happens for : C = 1519.9110829529332
The CV error for this value is:  0.03931985181947569
For the label 'Species', we have: 

The accuracy of the model on the training set is:  96.44559173947577 %
The cross-validation accuracy of the model is:  96.06801481805243 %
The test accuracy for the label 'Species' is:  96.34089856415007 %



In [258]:
# Question b -> iii (Continue)

# Classifier for label 'Family'
classif4 = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=2.848, multi_class='ovr')
classif4.fit(features_train, family_train)

# Classifier for label 'Genus'
classif5 = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=18738.2, multi_class='ovr')
classif5.fit(features_train, genus_train)

# Classifier for label 'Species'
classif6 = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=1519.9, multi_class='ovr')
classif6.fit(features_train, species_train)

# Now, I will concatenate the 3 predicted labels into 1 numpy array
y_pred2 = np.concatenate((np.expand_dims(classif4.predict(features_test), axis=1), 
                         np.expand_dims(classif5.predict(features_test), axis=1),
                         np.expand_dims(classif6.predict(features_test), axis=1)), axis=1)

# Now, I will convert the true labels into a numpy array
y_true = np.concatenate((np.expand_dims(np.asarray(family_test), axis=1), 
                         np.expand_dims(np.asarray(genus_test), axis=1),
                         np.expand_dims(np.asarray(species_test), axis=1)), axis=1)

# Calculate the Exact Match score and the Hamming Loss
EMscore = 0
count_Ham = 0
for i in range(0,y_true.shape[0]):
    count_EM = 0
    for j in range(0,y_true.shape[1]):
        if y_true[i][j]==y_pred2[i][j]:
            count_EM += 1
        else:
            count_Ham += 1
    if count_EM==3:
        EMscore += 1
EMscore = EMscore / y_true.shape[0]
Hamming_Loss = count_Ham / (y_true.shape[0] * y_true.shape[1])
            
print("So, using Binary Relevance with L1-penalized SVM, we have:","\n")
print("The Exact Match score is:", EMscore)
print("The Hamming Loss is:", Hamming_Loss)

So, using Binary Relevance with L1-penalized SVM, we have: 

The Exact Match score is: 0.9170912459471978
The Hamming Loss is: 0.04801605681642736


In [255]:
# Question b -> iv

# In this question, we are asked to use SMOTE in order to remedy
# the class imbalance appearing in the dataset.

from imblearn.over_sampling import SMOTE

# In this question, we will apply the L1-penalized SVM algorithm.
# Note that we will not normalize the attributes since they were
# already normalized!

# In this cell, we will train a SVM for the label "Family".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the L1 regularization coefficient. 
# Since I did not have a problem with the computational power, I created a grid 
# of 100 points and picked the value that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []

C_range = np.logspace(-3, 6, num=100) # C ranges from 10^-3 to 10^6 with a log increment 

for C1 in C_range: 
    # I will use 10-fold cross validation and I will shuffle
    # the data before each split!
    skf = StratifiedKFold(n_splits=10, shuffle=True) 
    skf.get_n_splits(features_train)
    # Initialize a list in order to get the errors from 10-fold CV
    list_10fold = []
    for train_index, test_index in skf.split(features_train,family_train):
        X_train, X_test = features_train.values[train_index], features_train.values[test_index]
        y_train, y_test = family_train.values[train_index], family_train.values[test_index]
        
        # Use SMOTE on the training data
        sm = SMOTE(random_state=12, ratio = 'all')
        X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
            
        # Now, I will fit my SVM model in the k-1 folds!
        classif = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=C1, multi_class='ovr')
        classif.fit(X_train_res, y_train_res)
        list_10fold.append(1 - classif.score(X_test, y_test))
    # Calculate the average error of list_10fold and save it in ave_CV_errors list
    ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(100,1) # Indexes are (C)
# Take the index of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 100 different values of C!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The CV error for this value is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best value 
# calculated from the 10-fold cross validation!
classif = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=C_range[ind[0]], multi_class='ovr')
classif.fit(features_train, family_train)


print("For the label 'Family' using SMOTE, we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, family_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Family' is: ", classif.score(features_test, family_test)*100, "%\n")

The minimum CV error happens for : C = 18738.174228603868
The CV error for this value is:  0.07843023762711585
For the label 'Family' using SMOTE, we have: 

The accuracy of the model on the training set is:  93.88403494837172 %
The cross-validation accuracy of the model is:  92.15697623728842 %
The test accuracy for the label 'Family' is:  93.23761000463178 %



In [256]:
# Question b -> iv (Continue)

# In this question, we are asked to use SMOTE in order to remedy
# the class imbalance appearing in the dataset.

# In this cell, we will train a SVM for the label "Genus".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the L1 regularization coefficient. 
# Since I did not have a problem with the computational power, I created a grid 
# of 100 points and picked the value that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []

C_range = np.logspace(-3, 6, num=100) # C ranges from 10^-3 to 10^6 with a log increment 

for C1 in C_range: 
    # I will use 10-fold cross validation and I will shuffle
    # the data before each split!
    skf = StratifiedKFold(n_splits=10, shuffle=True) 
    skf.get_n_splits(features_train)
    # Initialize a list in order to get the errors from 10-fold CV
    list_10fold = []
    for train_index, test_index in skf.split(features_train,genus_train):
        X_train, X_test = features_train.values[train_index], features_train.values[test_index]
        y_train, y_test = genus_train.values[train_index], genus_train.values[test_index]
        
        # Use SMOTE on the training data
        sm = SMOTE(random_state=12, ratio = 'all')
        X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
            
        # Now, I will fit my SVM model in the k-1 folds!
        classif = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=C1, multi_class='ovr')
        classif.fit(X_train_res, y_train_res)
        list_10fold.append(1 - classif.score(X_test, y_test))
    # Calculate the average error of list_10fold and save it in ave_CV_errors list
    ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(100,1) # Indexes are (C)
# Take the index of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 100 different values of C!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The CV error for this value is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best value 
# calculated from the 10-fold cross validation!
classif = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=C_range[ind[0]], multi_class='ovr')
classif.fit(features_train, genus_train)


print("For the label 'Genus' using SMOTE, we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, genus_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Genus' is: ", classif.score(features_test, genus_test)*100, "%\n")

The minimum CV error happens for : C = 4.328761281083062
The CV error for this value is:  0.08538843580758301
For the label 'Genus' using SMOTE, we have: 

The accuracy of the model on the training set is:  95.35345512311359 %
The cross-validation accuracy of the model is:  91.46115641924169 %
The test accuracy for the label 'Genus' is:  95.78508568781844 %



In [257]:
# Question b -> iv (Continue)

# In this question, we are asked to use SMOTE in order to remedy
# the class imbalance appearing in the dataset.

# In this cell, we will train a SVM for the label "Species".
# As suggested, we will use 10-fold cross validation in order to choose the 
# best values for the L1 regularization coefficient. 
# Since I did not have a problem with the computational power, I created a grid 
# of 100 points and picked the value that resulted in the lowest CV error.

# In this list, I will save the different CV errors that I am going to compare.
ave_CV_errors = []

C_range = np.logspace(-3, 6, num=100) # C ranges from 10^-3 to 10^6 with a log increment 

for C1 in C_range: 
    # I will use 10-fold cross validation and I will shuffle
    # the data before each split!
    skf = StratifiedKFold(n_splits=10, shuffle=True) 
    skf.get_n_splits(features_train)
    # Initialize a list in order to get the errors from 10-fold CV
    list_10fold = []
    for train_index, test_index in skf.split(features_train,species_train):
        X_train, X_test = features_train.values[train_index], features_train.values[test_index]
        y_train, y_test = species_train.values[train_index], species_train.values[test_index]
        
        # Use SMOTE on the training data
        sm = SMOTE(random_state=12, ratio = 'all')
        X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
            
        # Now, I will fit my SVM model in the k-1 folds!
        classif = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, C=C1, multi_class='ovr')
        classif.fit(X_train_res, y_train_res)
        list_10fold.append(1 - classif.score(X_test, y_test))
    # Calculate the average error of list_10fold and save it in ave_CV_errors list
    ave_CV_errors.append(sum(list_10fold) / len(list_10fold))

# Convert the ave_CV_errors list into a numpy array
Arrray = np.asarray(ave_CV_errors)
Arrray = Arrray.reshape(100,1) # Indexes are (C)
# Take the index of the minimum element of the Arrray
ind = np.unravel_index(np.argmin(Arrray, axis=None), Arrray.shape)

# The list ave_CV_errors contains the average CV errors obtained by the 100 different values of C!
# In order to pick the best values, we are going to pick the lowest value out of this list!!
print("The minimum CV error happens for : C =", C_range[ind[0]])
print("The CV error for this value is: ", min(ave_CV_errors))

# At this point, we have to train our model using the best value 
# calculated from the 10-fold cross validation!
classif = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=C_range[ind[0]], multi_class='ovr')
classif.fit(features_train, species_train)


print("For the label 'Species' using SMOTE, we have:","\n")
print("The accuracy of the model on the training set is: ", classif.score(features_train, species_train)*100, "%")
print("The cross-validation accuracy of the model is: ", (1 - min(ave_CV_errors))*100, "%")
# And now, we will test it on the test set
print("The test accuracy for the label 'Species' is: ", classif.score(features_test, species_test)*100, "%\n")

The minimum CV error happens for : C = 6.5793322465756825
The CV error for this value is:  0.04227783612861772
For the label 'Species' using SMOTE, we have: 

The accuracy of the model on the training set is:  96.36616362192217 %
The cross-validation accuracy of the model is:  95.77221638713823 %
The test accuracy for the label 'Species' is:  96.29458082445576 %



In [259]:
# Question b -> iv (Continue)

# Use SMOTE
X_train_res7, y_train_res7 = sm.fit_sample(features_train, family_train)

# Classifier for label 'Family'
classif7 = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=18738.2, multi_class='ovr')
classif7.fit(X_train_res7, y_train_res7)

# Use SMOTE
X_train_res8, y_train_res8 = sm.fit_sample(features_train, genus_train)

# Classifier for label 'Genus'
classif8 = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=4.3, multi_class='ovr')
classif8.fit(X_train_res8, y_train_res8)

# Use SMOTE
X_train_res9, y_train_res9 = sm.fit_sample(features_train, species_train)

# Classifier for label 'Species'
classif9 = LinearSVC(penalty='l1',loss='squared_hinge', dual=False, C=6.6, multi_class='ovr')
classif9.fit(X_train_res9, y_train_res9)

# Now, I will concatenate the 3 predicted labels into 1 numpy array
y_pred3 = np.concatenate((np.expand_dims(classif7.predict(features_test), axis=1), 
                         np.expand_dims(classif8.predict(features_test), axis=1),
                         np.expand_dims(classif9.predict(features_test), axis=1)), axis=1)

# Now, I will convert the true labels into a numpy array
y_true = np.concatenate((np.expand_dims(np.asarray(family_test), axis=1), 
                         np.expand_dims(np.asarray(genus_test), axis=1),
                         np.expand_dims(np.asarray(species_test), axis=1)), axis=1)

# Calculate the Exact Match score and the Hamming Loss
EMscore = 0
count_Ham = 0
for i in range(0,y_true.shape[0]):
    count_EM = 0
    for j in range(0,y_true.shape[1]):
        if y_true[i][j]==y_pred3[i][j]:
            count_EM += 1
        else:
            count_Ham += 1
    if count_EM==3:
        EMscore += 1
EMscore = EMscore / y_true.shape[0]
Hamming_Loss = count_Ham / (y_true.shape[0] * y_true.shape[1])
            
print("So, using Binary Relevance with L1-penalized SVM and the SMOTE technique on the training dataset, we have:","\n")
print("The Exact Match score is:", EMscore)
print("The Hamming Loss is:", Hamming_Loss)

So, using Binary Relevance with L1-penalized SVM and the SMOTE technique on the training dataset, we have: 

The Exact Match score is: 0.8703103288559518
The Hamming Loss is: 0.06855025474756832
