# Assignment 1: Multi-label Classification

## Import Packages Etc

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import copy

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC
from sklearn import metrics

from skmultilearn.problem_transform import LabelPowerset
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# import other useful packages

Using TensorFlow backend.


## Task 0: Load the Yeast Dataset

In [2]:
# Write your code here
dataset = pd.read_csv('yeast.csv')
display(dataset.head())

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0


In [3]:
dataset.shape

(2417, 117)

In [4]:
# Check missing values, there is no missing values in this dataset
print("Missing Values")
dataset.isnull().sum()

Missing Values


Att1       0
Att2       0
Att3       0
Att4       0
Att5       0
          ..
Class10    0
Class11    0
Class12    0
Class13    0
Class14    0
Length: 117, dtype: int64

In [5]:
X = dataset[dataset.columns[0:103]]
Y = np.array(dataset[dataset.columns[103:]])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size = 0.30, train_size = 0.7)

## Task 1: Implement the Binary Relevance Algorithm

In [7]:
# Write your code here
class BinaryRelevance(BaseEstimator, ClassifierMixin):
    
    #constructor for the classifier object, default model is SVC
    def __init__(self, classifier = SVC()):
        self.classifier = classifier
     
    #train classifier, create a classifier for each model
    def fit(self, X, y):
        
        self.classifiers_= []
        self._label_count = y.shape[1]
        self.label_ = list(range(y.shape[1]))
        for i in range(self._label_count):
            classifier = copy.deepcopy(self.classifier)
            y_subset = y[:,i]
            classifier.fit(X,y_subset)
            self.classifiers_.append(classifier)
        
        return self
    
    #use the classifers that create during the train to make predition 
    def predict(self, X):
        
        predictions = np.zeros([X.shape[0],self._label_count],dtype=np.int)
        for label in range(self._label_count):
            predictions[:,label] = self.classifiers_[label].predict(X)
        return predictions
         
    def predict_proba(self, X):
        
        result = np.zeros([X.shape[0],self._label_count], dtype='float')
        for label_assignment, classifier in zip(self.label_, self.classifiers_):
            result[:, label_assignment] = classifier.predict_proba(X)[:, 1]
            
        return result

In [8]:
my_model = BinaryRelevance(tree.DecisionTreeClassifier(criterion='entropy',min_samples_leaf=10))
my_model.fit(X_train, y_train)

BinaryRelevance(classifier=DecisionTreeClassifier(class_weight=None,
                                                  criterion='entropy',
                                                  max_depth=None,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=10,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  presort=False,
                                                  random_state=None,
                                                  splitter='best'))

In [9]:
# 14 classifiers in total
my_model.classifiers_

[DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=10, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=10, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                        max_features=None, max_leaf_nodes=None,
                        

In [10]:
y_pred = my_model.predict(X_test)
y_pred

array([[1, 0, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 1, 1, 0],
       [1, 1, 0, ..., 1, 1, 0],
       ...,
       [0, 1, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
proba = my_model.predict_proba(X_test)
proba

array([[0.88235294, 0.        , 0.47368421, ..., 1.        , 0.84210526,
        0.        ],
       [0.        , 0.55555556, 0.1875    , ..., 0.9375    , 1.        ,
        0.        ],
       [0.86666667, 1.        , 0.30769231, ..., 1.        , 1.        ,
        0.1       ],
       ...,
       [0.5       , 1.        , 0.        , ..., 1.        , 0.75      ,
        0.        ],
       [0.        , 0.        , 0.8       , ..., 0.8       , 0.        ,
        0.07142857],
       [0.5       , 0.26315789, 0.        , ..., 0.05882353, 0.3125    ,
        0.        ]])

## Task 2: Implement the Binary Relevance Algorithm with Under-Sampling

In [12]:
# Write your code here
class BinaryRelevanceWithResampling(BaseEstimator, ClassifierMixin):
    
    #constructor for the classifier object
    def __init__(self, classifier = SVC(), under_sampling = True):
        self.classifier = classifier
        self.under_sampling = under_sampling
    
    #train classifier, create a classifier for each model,under-sampling all X and each y
    def fit(self, X, y):

        self.classifiers_= []
        self._label_count = y.shape[1]
        self.label_ = list(range(y.shape[1]))
        
        for i in range(self._label_count):
            classifier = copy.deepcopy(self.classifier)
            X_resampled = X
            y_resampled = y[:,i]
            if(self.under_sampling):
                rus = RandomUnderSampler(random_state=2020)
                X_resampled, y_resampled = rus.fit_sample(X_resampled, y_resampled)
            classifier.fit(X_resampled,y_resampled)
            self.classifiers_.append(classifier)
        return self
    
    def predict(self, X):
        
        predictions = np.zeros([X.shape[0],self._label_count],dtype=np.int)
        for label in range(self._label_count):
            predictions[:,label] = self.classifiers_[label].predict(X)
        return predictions
    
    def predict_proba(self, X):
        
        result = np.zeros([X.shape[0],self._label_count], dtype='float')
        for label_assignment, classifier in zip(self.label_, self.classifiers_):
            result[:, label_assignment] = classifier.predict_proba(X)[:, 1]
            
        return result

In [13]:
#set parameter probability=True for SVC predict_proba
my_model = BinaryRelevanceWithResampling(SVC(probability=True))
my_model.fit(X_train, y_train)

BinaryRelevanceWithResampling(classifier=SVC(C=1.0, cache_size=200,
                                             class_weight=None, coef0=0.0,
                                             decision_function_shape='ovr',
                                             degree=3, gamma='auto_deprecated',
                                             kernel='rbf', max_iter=-1,
                                             probability=True,
                                             random_state=None, shrinking=True,
                                             tol=0.001, verbose=False),
                              under_sampling=True)

In [14]:
y_pred = my_model.predict(X_test)
y_pred

array([[1, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [1, 0, 0, ..., 1, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 0],
       [0, 1, 1, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1]])

In [15]:
proba = my_model.predict_proba(X_test)
proba

array([[0.56595594, 0.44311136, 0.27509489, ..., 0.52749454, 0.52341669,
        0.53699457],
       [0.31418826, 0.2998366 , 0.56444155, ..., 0.53029676, 0.52201696,
        0.52959052],
       [0.55392834, 0.36275746, 0.18171631, ..., 0.52228667, 0.52030165,
        0.53434461],
       ...,
       [0.63545146, 0.5       , 0.0870271 , ..., 0.51347719, 0.51202427,
        0.54090636],
       [0.41596636, 0.60552626, 0.5       , ..., 0.5       , 0.5       ,
        0.52796494],
       [0.39760679, 0.50731808, 0.73223041, ..., 0.48464408, 0.48690314,
        0.52459955]])

## Task 3: Compare the Performance of Different Binary Relevance Approaches

SVC

In [16]:
# Write your code here
my_model = BinaryRelevance()
my_model.fit(X_train, y_train)

BinaryRelevance(classifier=SVC(C=1.0, cache_size=200, class_weight=None,
                               coef0=0.0, decision_function_shape='ovr',
                               degree=3, gamma='auto_deprecated', kernel='rbf',
                               max_iter=-1, probability=False,
                               random_state=None, shrinking=True, tol=0.001,
                               verbose=False))

In [17]:
y_pred = my_model.predict(X_test)
y_pred

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]])

In [18]:
metrics.f1_score(y_test, y_pred, average='macro')

0.1515313275511919

In [19]:
my_model = BinaryRelevanceWithResampling()
my_model.fit(X_train, y_train)

BinaryRelevanceWithResampling(classifier=SVC(C=1.0, cache_size=200,
                                             class_weight=None, coef0=0.0,
                                             decision_function_shape='ovr',
                                             degree=3, gamma='auto_deprecated',
                                             kernel='rbf', max_iter=-1,
                                             probability=False,
                                             random_state=None, shrinking=True,
                                             tol=0.001, verbose=False),
                              under_sampling=True)

In [20]:
y_pred = my_model.predict(X_test)
y_pred

array([[1, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [1, 0, 0, ..., 1, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 0],
       [0, 1, 1, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1]])

In [21]:
metrics.f1_score(y_test, y_pred, average='macro')

0.41199454814010406

DecisonTree

In [22]:
my_model = BinaryRelevance(tree.DecisionTreeClassifier())
my_model.fit(X_train, y_train)

BinaryRelevance(classifier=DecisionTreeClassifier(class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  presort=False,
                                                  random_state=None,
                                                  splitter='best'))

In [23]:
y_pred = my_model.predict(X_test)
y_pred

array([[0, 0, 1, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [24]:
metrics.f1_score(y_test, y_pred, average='macro')

0.3856360464409511

In [25]:
my_model = BinaryRelevanceWithResampling(tree.DecisionTreeClassifier())
my_model.fit(X_train, y_train)

BinaryRelevanceWithResampling(classifier=DecisionTreeClassifier(class_weight=None,
                                                                criterion='gini',
                                                                max_depth=None,
                                                                max_features=None,
                                                                max_leaf_nodes=None,
                                                                min_impurity_decrease=0.0,
                                                                min_impurity_split=None,
                                                                min_samples_leaf=1,
                                                                min_samples_split=2,
                                                                min_weight_fraction_leaf=0.0,
                                                                presort=False,
                                                                r

In [26]:
y_pred = my_model.predict(X_test)
y_pred

array([[0, 1, 0, ..., 0, 0, 1],
       [0, 1, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 1, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 1],
       [1, 0, 1, ..., 0, 0, 1]])

In [27]:
metrics.f1_score(y_test, y_pred, average='macro')

0.41289697153754007

LogisticRegression

In [28]:
my_model = BinaryRelevance(LogisticRegression(random_state=2020))
my_model.fit(X_train, y_train)

BinaryRelevance(classifier=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1,
                                              l1_ratio=None, max_iter=100,
                                              multi_class='warn', n_jobs=None,
                                              penalty='l2', random_state=2020,
                                              solver='warn', tol=0.0001,
                                              verbose=0, warm_start=False))

In [29]:
y_pred = my_model.predict(X_test)
y_pred

array([[1, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [30]:
metrics.f1_score(y_test, y_pred, average='macro')

0.3363784317668851

In [31]:
my_model = BinaryRelevanceWithResampling(LogisticRegression(random_state=2020))
my_model.fit(X_train, y_train)

BinaryRelevanceWithResampling(classifier=LogisticRegression(C=1.0,
                                                            class_weight=None,
                                                            dual=False,
                                                            fit_intercept=True,
                                                            intercept_scaling=1,
                                                            l1_ratio=None,
                                                            max_iter=100,
                                                            multi_class='warn',
                                                            n_jobs=None,
                                                            penalty='l2',
                                                            random_state=2020,
                                                            solver='warn',
                                                            tol=0.0001,
                   

In [32]:
y_pred = my_model.predict(X_test)
y_pred

array([[1, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [1, 0, 0, ..., 1, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 1],
       [0, 0, 1, ..., 0, 0, 1]])

In [33]:
metrics.f1_score(y_test, y_pred, average='macro')

0.44053920793095835

## Task 4: Implement the Classifier Chains Algorithm

In [34]:
# Write your code here
class ClassifierChain(BaseEstimator, ClassifierMixin):
    
    #constructor for the classifier object
    def __init__(self, classifier = SVC(), order = None):
        self.classifier = classifier
        self.order = order
    
    #train classifier, after each y be used combine it with X, use new X dataset and next y train   
    def fit(self, X, y, order = None):
        
        X_temp = X
        self._label_count = y.shape[1]
        self.classifiers_ = [None for x in range(self._label_count)]
        
        if(self.order is None):
            self.order_ = list(range(self._label_count))
        else:
            self.order_ = self.order
        
        for label in self.order_:
            self.classifier = copy.deepcopy(self.classifier)
            y_subset = y[:,label]
            self.classifiers_[label] = self.classifier.fit(X_temp, y_subset)
            X_temp = np.column_stack((X_temp, y_subset))
        
        return self

    #use the classifers that create during the train to make predition
    #the last _label_count columns in x are the labels that predicted
    def predict(self, X):
        
        X_temp = X
        
        for label in self.order_:
            prediction = self.classifiers_[label].predict(X_temp)
            X_temp = np.column_stack((X_temp, prediction))
            
        return X_temp[:,-self._label_count:].astype(int)
    
    def predict_proba(self, X):
        
        X_temp = X
        proba = []
        
        for label in self.order_:
            prediction = self.classifiers_[label].predict(X_temp)
            prediction_proba = self.classifiers_[label].predict_proba(X_temp)[:, 1]
            X_temp = np.column_stack((X_temp, prediction))
            
        proba.append(prediction_proba)
        
        return proba

In [35]:
my_model = ClassifierChain(SVC(probability=True))
my_model.fit(X_train, y_train)

ClassifierChain(classifier=SVC(C=1.0, cache_size=200, class_weight=None,
                               coef0=0.0, decision_function_shape='ovr',
                               degree=3, gamma='auto_deprecated', kernel='rbf',
                               max_iter=-1, probability=True, random_state=None,
                               shrinking=True, tol=0.001, verbose=False),
                order=None)

In [36]:
y_pred = my_model.predict(X_test)
y_pred

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]])

In [37]:
proba = my_model.predict_proba(X_test)
proba

[array([0.00554066, 0.0062078 , 0.00411866, 0.00992003, 0.00174294,
        0.00425465, 0.00696259, 0.00348581, 0.00441725, 0.012424  ,
        0.00389347, 0.00513095, 0.00490546, 0.002363  , 0.00354636,
        0.00428867, 0.00205866, 0.00409301, 0.0038408 , 0.00749712,
        0.00356184, 0.00404392, 0.00540702, 0.00322106, 0.00667238,
        0.00510923, 0.00872554, 0.00383181, 0.01429821, 0.00679696,
        0.0026579 , 0.0054603 , 0.00470578, 0.00562098, 0.00603201,
        0.00784397, 0.00498093, 0.02125224, 0.00878044, 0.00611914,
        0.00477153, 0.00291235, 0.00664552, 0.00314877, 0.00497616,
        0.01140059, 0.00629231, 0.00406809, 0.00260006, 0.00713641,
        0.00638752, 0.00408564, 0.00474677, 0.01494761, 0.00551679,
        0.00355627, 0.00246327, 0.0122709 , 0.00809326, 0.00327685,
        0.00752649, 0.00302533, 0.00444142, 0.00606553, 0.00646647,
        0.01512597, 0.00635295, 0.00327263, 0.00282627, 0.00287178,
        0.00592062, 0.00420672, 0.00305759, 0.00

## Task 5: Evaluate the Performance of the Classifier Chains Algorithm

SVC

In [38]:
# Write your code here
my_model = ClassifierChain()
my_model.fit(X_train, y_train)

ClassifierChain(classifier=SVC(C=1.0, cache_size=200, class_weight=None,
                               coef0=0.0, decision_function_shape='ovr',
                               degree=3, gamma='auto_deprecated', kernel='rbf',
                               max_iter=-1, probability=False,
                               random_state=None, shrinking=True, tol=0.001,
                               verbose=False),
                order=None)

In [39]:
y_pred = my_model.predict(X_test)
y_pred

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]])

In [40]:
metrics.f1_score(y_test, y_pred,average='macro')

0.14486146718765816

DecisionTree

In [41]:
my_model = ClassifierChain(tree.DecisionTreeClassifier())
my_model.fit(X_train, y_train)

ClassifierChain(classifier=DecisionTreeClassifier(class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  presort=False,
                                                  random_state=None,
                                                  splitter='best'),
                order=None)

In [42]:
y_pred = my_model.predict(X_test)
y_pred

array([[0, 1, 1, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [43]:
metrics.f1_score(y_test, y_pred, average='macro')

0.3871600644877598

LogisticRegression

In [44]:
my_model = ClassifierChain(LogisticRegression(random_state=2020))
my_model.fit(X_train, y_train)

ClassifierChain(classifier=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1,
                                              l1_ratio=None, max_iter=100,
                                              multi_class='warn', n_jobs=None,
                                              penalty='l2', random_state=2020,
                                              solver='warn', tol=0.0001,
                                              verbose=0, warm_start=False),
                order=None)

In [45]:
y_pred = my_model.predict(X_test)
y_pred

array([[1, 1, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [1, 1, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [46]:
metrics.f1_score(y_test, y_pred, average='macro')

0.3842887858153895

Try to find a label order to Classifier Chain to improve accuracy

In [47]:
my_model = BinaryRelevance(LogisticRegression(random_state=2020))
my_model.fit(X_train, y_train)
y_pred = my_model.predict(X_test)
target_names = ['class 0', 'class 1', 'class 2', 'class 3', 'class 4', 'class 5','class 6', 'class 7', 'class 8',
                'class 9', 'class 10', 'class 11','class 12', 'class 13']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.75      0.48      0.59       238
     class 1       0.58      0.41      0.48       321
     class 2       0.58      0.58      0.58       283
     class 3       0.62      0.56      0.59       244
     class 4       0.67      0.39      0.49       217
     class 5       0.50      0.16      0.25       176
     class 6       0.67      0.02      0.03       115
     class 7       0.25      0.01      0.01       139
     class 8       0.00      0.00      0.00        58
     class 9       0.00      0.00      0.00        75
    class 10       0.50      0.01      0.02        80
    class 11       0.73      0.96      0.83       532
    class 12       0.73      0.95      0.82       527
    class 13       0.00      0.00      0.00        10

   micro avg       0.68      0.56      0.61      3015
   macro avg       0.47      0.32      0.34      3015
weighted avg       0.61      0.56      0.54      3015
 samples avg       0.68   

In [48]:
order = [11,12,2,3,0,4,1,5,6,7,8,9,10,13]
my_model = ClassifierChain(LogisticRegression(random_state=2020),order)
my_model.fit(X_train, y_train)

ClassifierChain(classifier=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1,
                                              l1_ratio=None, max_iter=100,
                                              multi_class='warn', n_jobs=None,
                                              penalty='l2', random_state=2020,
                                              solver='warn', tol=0.0001,
                                              verbose=0, warm_start=False),
                order=[11, 12, 2, 3, 0, 4, 1, 5, 6, 7, 8, 9, 10, 13])

In [49]:
y_pred = my_model.predict(X_test)
metrics.f1_score(y_test, y_pred, average='macro')

0.24885631450336673

In [50]:
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.33      0.96      0.49       238
     class 1       0.44      0.96      0.60       321
     class 2       0.59      0.63      0.61       283
     class 3       0.55      0.62      0.58       244
     class 4       0.12      0.10      0.11       217
     class 5       0.30      0.34      0.32       176
     class 6       0.19      0.41      0.26       115
     class 7       0.28      0.28      0.28       139
     class 8       0.11      0.12      0.12        58
     class 9       0.10      0.08      0.09        75
    class 10       0.00      0.00      0.00        80
    class 11       0.80      0.01      0.01       532
    class 12       0.80      0.01      0.02       527
    class 13       0.00      0.00      0.00        10

   micro avg       0.37      0.35      0.36      3015
   macro avg       0.33      0.32      0.25      3015
weighted avg       0.50      0.35      0.27      3015
 samples avg       0.37   

## Task 6: Reflect on the Performance of the Different Models Evaluated

*Write your reflection here (max 300 words)*

According to Daniels, Z.A. and Metaxas, D.N.(2017), the class imbalance is a serious issue in multilabel classification. For this issue macro averaged F-score evaluation methods be used. Macro averaged F-score is the arithmetic mean of F-score and F-score is for balance between precision and recall. 

Compare BinaryRelevance and BinaryRelevanceWithResampling:

|          SVC                   | Macro averaged F1  |
|  ----------------------------  | -----------------  |
| BinaryRelevance                |      0.151         |
| BinaryRelevanceWithResampling  |      0.411         |



|        Decision Tree           | Macro averaged F1  |
|  ----------------------------  | -----------------  |
| BinaryRelevance                |      0.385         |
| BinaryRelevanceWithResampling  |      0.409         |


|        LogisticRegression      | Macro averaged F1  |
|  ----------------------------  | -----------------  |
| BinaryRelevance                |      0.336         |
| BinaryRelevanceWithResampling  |      0.440         |


The Macro averaged F1 of BinaryRelevanceWithResampling obviously higher than BinaryRelevance. We expect high macro averaged F1 score. I use:
```
start = time.time()
      ...
end = time.time()
print(end-start)
```
measured the training time complexity of two approaches. BinaryRevance used 2.549s and BinaryRelevanceWithResampling used 1.297s. BinaryRelevanceWithResampling is much faster than the BinaryRevance method. In my opinion, this is because of the resampled dataset smaller than the whole dataset.

Compare BinaryRelevance, BinaryRelevanceWithResampling and ClassifierChain:

|          SVC                   | Macro averaged F1  | 
|  ----------------------------  | -----------------  |
| BinaryRelevance                |      0.151         |
| BinaryRelevanceWithResampling  |      0.411         |
| ClassifierChain                |      0.144         |

|        Decision Tree           | Macro averaged F1  |
|  ----------------------------  | -----------------  |
| BinaryRelevance                |      0.385         |
| BinaryRelevanceWithResampling  |      0.409         |
| ClassifierChain                |      0.389         |

|        LogisticRegression      | Macro averaged F1  |
|  ----------------------------  | -----------------  |
| BinaryRelevance                |      0.336         |
| BinaryRelevanceWithResampling  |      0.440         |
| ClassifierChain                |      0.384         |

Use the decision tree model. Sometimes I got the score of ClassifierChain higher than BinaryRelevance. I find decision tree model macro averaged f1-score always change, maybe because every time training the dataset has a slight change, decision tree a little bit unstable. 

ClassifierChain performance should be better than BinaryRelevance. Classifier chains considering the relevance of labels, using predictions of some labels as inputs in later label predictions. It worse than the under-sampling BinaryRelevance. The training time of ClassifierChain is 2.477s shorter than BinaryRelevance and longer than under-sampling BinaryRelevance.

In the experiment, I use the default label order. Compare with this order or random order some specific chain order may improve the accuracy of ClassifierChain. I used classification_report to get f1-score for each label and use the order of f1-score from high to low but as a result, I got lower macro averaged f1-score. Maybe when that label independent, it has high accuracy. When associating them, each label be influenced by the previous label, the accuracy decrease. I may do some research about the label order next.

Reference:
Daniels, Z.A. and Metaxas, D.N., 2017, February. Addressing imbalance in multi-label classification using structured hellinger forests. In Thirty-First AAAI Conference on Artificial Intelligence.