Most of this code is from this very helpful blog that you should checkout! <br>
https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff

# Simple Approach to Multi-Label Classification

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

## Data Pre-Processing

In [3]:
data = pd.read_csv('lunchHateLove.csv')
data.head()

Unnamed: 0,comment_text,lunch_talk,love_talk,hate_talk
0,"I hate that kind of food, let’s not have it fo...",1,0,1
1,"I hate that you love that kind of food, okay, ...",1,0,1
2,I kind of hate that I love you. Let’s get lunch.,1,1,1
3,"That food hates me, but I love it. I’m getting...",1,1,1


### Cleaning Data

In [5]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)
data.head()

Unnamed: 0,comment_text,lunch_talk,love_talk,hate_talk
0,i hate that kind of food let s not have it for...,1,0,1
1,i hate that you love that kind of food okay we...,1,0,1
2,i kind of hate that i love you let s get lunch,1,1,1
3,that food hates me but i love it i m getting i...,1,1,1


### Removing Stop Words

In [6]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['comment_text'] = data['comment_text'].apply(removeStopWords)
data.head()

Unnamed: 0,comment_text,lunch_talk,love_talk,hate_talk
0,hate kind food let lunch,1,0,1
1,hate love kind food okay lunch,1,0,1
2,kind hate love let get lunch,1,1,1
3,food hates love getting lunch,1,1,1


### Stemming

In [7]:
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['comment_text'] = data['comment_text'].apply(stemming)
data.head()

Unnamed: 0,comment_text,lunch_talk,love_talk,hate_talk
0,hate kind food let lunch,1,0,1
1,hate love kind food okay lunch,1,0,1
2,kind hate love let get lunch,1,1,1
3,food hate love get lunch,1,1,1


### Train-Test Split

In [8]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)

print(train.shape)
print(test.shape)

(2, 4)
(2, 4)


In [9]:
train_text = train['comment_text']
test_text = test['comment_text']

### TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['comment_text'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['comment_text'], axis=1)

In [12]:
print(train_text, x_train)

0        hate kind food let lunch
2    kind hate love let get lunch
Name: comment_text, dtype: object   (0, 19)	0.3793034928087496
  (0, 12)	0.5330978245262535
  (0, 11)	0.5330978245262535
  (0, 7)	0.3793034928087496
  (0, 0)	0.3793034928087496
  (1, 19)	0.3174043985887566
  (1, 14)	0.3174043985887566
  (1, 11)	0.4461008073765536
  (1, 8)	0.3174043985887566
  (1, 7)	0.3174043985887566
  (1, 6)	0.4461008073765536
  (1, 5)	0.4461008073765536


### Binary Relevance

In [14]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

print(data.comment_text[1], '\n',
      data.comment_text[3], '\n',
      y_test, '\n',
      predictions, '\n')


hate love kind food okay lunch 
 food hate love get lunch 
    lunch_talk  love_talk  hate_talk
1           1          0          1
3           1          1          1 
   (0, 0)	1
  (1, 0)	1
  (1, 1)	1
  (0, 2)	1
  (1, 2)	1 

