In [24]:
# reference: https://www.analyticsvidhya.com/blog/2017/08/introduction-to-multi-label-classification/

In [67]:
import scipy 
from scipy.io import arff
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 


from sklearn.model_selection import train_test_split

## Prepare data, 2 ways for illustration. The second way is used throughout the tutorial

### 1 

In [5]:
# Option 1: read the yeast training set from its ARFF file.
# loadarff returns the records together with the file's metadata.
data, meta = arff.loadarff('dataset_yeast/yeast-train.arff')

# Wrap the loaded records in a DataFrame.
df = pd.DataFrame(data=data)

### 2

In [68]:
from sklearn.datasets import make_multilabel_classification

# Option 2: generate a synthetic multi-label dataset.
#   sparse=True       -> X is returned as a sparse matrix, which has many zero elements;
#                        zero-valued entries of a sparse matrix are not displayed.
#   n_labels          -> the average number of labels for each instance.
#   return_indicator  -> if "sparse", y is returned in the sparse binary indicator format.
#   allow_unlabeled   -> if True, some instances might not belong to any class.
#   random_state      -> fixed seed so the generated data is identical on every run;
#                        without it the dataset changes each time the cell runs.
X, y = make_multilabel_classification(
    sparse=True,
    n_labels=20,
    return_indicator="sparse",
    allow_unlabeled=False,
    random_state=42,
)

In [69]:
# Hold out 33% of the samples for testing; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Techniques for Solving a Multi-Label classification problem, 3 methods

### 1, Problem Transformation

In [25]:
# In this method, we will try to transform our multi-label problem into single-label problem(s).

# This method can be carried out in three different ways:
# Binary Relevance
# Classifier Chains
# Label Powerset


###### <font color="red"> Binary Relevance </font>

In [84]:
# Binary relevance treats each label as a separate single-class classification problem.
# It does not consider correlations between labels because it treats every target variable independently.

In [95]:
# Binary relevance with a Gaussian naive Bayes base classifier.
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance

# Build the multi-label classifier around the base estimator.
classifier = BinaryRelevance(classifier=GaussianNB())

# Fit on the training split.
classifier.fit(X_train, y_train)

# Predict the label sets of the held-out split.
predictions = classifier.predict(X_test)

In [96]:
from sklearn.metrics import accuracy_score

# Subset accuracy: the predicted set of labels must exactly match
# the true set of labels.
accuracy_score(y_true=y_test, y_pred=predictions)

0.5757575757575758

###### <font color="red">  Classifier Chains </font>

In [97]:
# The first classifier is trained on the input data alone; each subsequent classifier
# is trained on the input space extended with the labels of all the previous classifiers in the chain.

In [98]:
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain

# Classifier chain with a Gaussian naive Bayes base classifier.
classifier = ClassifierChain(classifier=GaussianNB())

# Fit on the training split, then predict the held-out split.
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Subset accuracy of the chain on the test split.
accuracy_score(y_true=y_test, y_pred=predictions)

0.45454545454545453

###### <font color="red">   Label Powerset </font>

In [100]:
# label powerset gives a unique class to every possible label combination that is present in the training set.

In [103]:
# Label powerset with a Gaussian naive Bayes base classifier.
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import LabelPowerset

# Build the multi-label classifier around the base estimator.
classifier = LabelPowerset(classifier=GaussianNB())

# Fit on the training split, then predict the held-out split.
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Subset accuracy of the label powerset model on the test split.
accuracy_score(y_true=y_test, y_pred=predictions)

0.60606060606060608

### 2, Adapted Algorithm

In [106]:
# An adapted algorithm, as the name suggests, adapts the algorithm itself to perform multi-label classification directly,
# rather than transforming the problem into different subsets of problems as before.

In [110]:
# Scikit-learn provides built-in support for multi-label classification in some of its algorithms, such as Random Forest and Ridge regression.
# So you can call them directly and predict the output.

# for now, we will use MLkNN from skmultilearn

In [111]:
from skmultilearn.adapt import MLkNN

# Multi-label k-nearest-neighbours, constructed with k=20.
classifier = MLkNN(k=20)

# Fit on the training split, then predict the held-out split.
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Subset accuracy of MLkNN on the test split.
accuracy_score(y_true=y_test, y_pred=predictions)

0.63636363636363635

### 3, Ensemble methods

In [112]:
# Check out the skmultilearn documentation.