# MOUNTING THE DRIVE

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset path used below can be read.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# READING THE DATA

In [None]:
import pandas as pd

# Read the training CSV from the mounted Drive.
data = pd.read_csv('/content/drive/My Drive/Datasets/ToxicCommentProject/train.csv/train.csv')

# The session was crashing on the full dataset, which is why only a very small
# sample (1% of the rows) is used to try the algorithms.
# A fixed random_state makes the sample, and therefore every number reported
# further down, reproducible across runs.
data = data.sample(frac = 0.01, random_state = 1).reset_index(drop = True)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,bd46839761b63064,I removed the speedy tag on the strength of th...,0,0,0,0,0,0
1,6774bca4d6dbf97f,Please stop. If you continue to blank out or d...,0,0,0,0,0,0
2,02f516a2a8f57506,Effects under £3000 - dated 3rd June (with one...,0,0,0,0,0,0
3,925f325f0a056907,"""\n\nI don't know which """"personal attacks"""" y...",0,0,0,0,0,0
4,3a16fababd9bf78e,If you expect everyone here to agree with you ...,0,0,0,0,0,0


In [None]:
# Show the (rows, columns) dimensions of the sampled frame.
data.shape

(1596, 8)

# IMPORTING LIBRARIES 

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

# TEXT PROCESSING

#### Converting upper-case to lower-case

In [None]:
# Split each comment on whitespace, lower-case every token and glue the
# tokens back together with single spaces.
data['comment_text'] = data['comment_text'].map(
    lambda comment: " ".join(token.lower() for token in comment.split())
)
data['comment_text'].head()

0    i removed the speedy tag on the strength of th...
1    please stop. if you continue to blank out or d...
2    effects under £3000 - dated 3rd june (with one...
3    " i don't know which ""personal attacks"" you ...
4    if you expect everyone here to agree with you ...
Name: comment_text, dtype: object

#### Removing stopwords

In [None]:
# Download the NLTK stop-word corpus and load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')

# `stop` is a list, so `token not in stop` scans it linearly for every token.
# A set gives the same membership answers in constant time.
stop_set = set(stop)

# Keep only the tokens that are not English stop words.
data['comment_text'] = data['comment_text'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_set)
)
data['comment_text'].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0       removed speedy tag strength times story alone.
1    please stop. continue blank delete portions pa...
2    effects £3000 - dated 3rd june (with one codic...
3    " know ""personal attacks"" referring to. sinc...
4                expect everyone agree terribly wrong.
Name: comment_text, dtype: object

#### Removing the very common words

In [None]:
# Count every whitespace-separated token across all comments; value_counts()
# orders them from most to least frequent, so the first 10 are the most common.
frequent_counts = pd.Series(' '.join(data['comment_text']).split()).value_counts()
common_words = frequent_counts.head(10).index.tolist()

# Rebuild each comment without those tokens.
data['comment_text'] = data['comment_text'].apply(
    lambda comment: " ".join(token for token in comment.split() if token not in common_words)
)

#### Removing the rare words

In [None]:
# Count every whitespace-separated token across all comments; value_counts()
# orders them from most to least frequent, so the last 10 are the rarest.
rare_counts = pd.Series(' '.join(data['comment_text']).split()).value_counts()
rare_words = rare_counts.tail(10).index.tolist()

# Rebuild each comment without those tokens.
data['comment_text'] = data['comment_text'].apply(
    lambda comment: " ".join(token for token in comment.split() if token not in rare_words)
)

#### Removing tags, punctuation

In [None]:
def tagremoval(text):
    """Return `text` with every shortest `<...>` span deleted.

    The match is non-greedy and `.` does not cross newlines, so a `<` and `>`
    on different lines are left untouched.
    """
    return re.sub('<.*?>', '', text)

def Puncremoval(text): #function to clean the word of any punctuation or special characters
    """Strip punctuation and special characters from `text`.

    The characters ? ! ' " # | are deleted outright; . , ) ( \\ / are each
    replaced by a single space. Leading/trailing whitespace is then trimmed
    and any remaining newline is turned into a space.

    Parameters
    ----------
    text : str
        The raw comment text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Inside a character class `|` is a literal, not an alternation separator,
    # so the characters are simply listed. `|` stays in this class because the
    # previous pattern deleted it as well.
    cl_text = re.sub(r'[?!\'"#|]', r'', text)
    # The previous class contained `\|`, which is just an escaped pipe, so a
    # backslash was never matched; `\\` matches a literal backslash.
    cl_text = re.sub(r'[.,)(\\/]', r' ', cl_text)
    cl_text = cl_text.strip()
    cl_text = cl_text.replace("\n", " ")
    return cl_text

def processed(text):
    """Replace every run of non-letter characters in each word with a space.

    `text` is split on whitespace, each word is cleaned on its own, the
    cleaned words are joined with single spaces and the result is trimmed.
    A word that contained non-letters therefore contributes extra spaces
    inside the returned string.
    """
    cleaned_words = [re.sub('[^a-z A-Z]+', ' ', word) for word in text.split()]
    return " ".join(cleaned_words).strip()


# Run the three cleaning steps in order on every comment:
# tag removal, punctuation removal, then non-letter removal.
data['comment_text'] = (
    data['comment_text']
    .apply(tagremoval)
    .apply(Puncremoval)
    .apply(processed)
)

#### Displaying the text after processing

In [None]:
# Preview the first five comments after the cleaning steps.
data['comment_text'][:5]

0        removed speedy tag strength times story alone
1    stop continue blank delete portions content te...
2    effects   dated  rd june with codicil charlott...
3    know personal attacks referring to since blank...
4                 expect everyone agree terribly wrong
Name: comment_text, dtype: object

#### Applying stemmer
What is stemming?

Stemming is a simpler version of lemmatization in which we strip the suffix from the end of a word. When stemming, we are interested in reducing an inflected or derived word to its base form.

What is inflection ?

inflections: adding a suffix to a word, that doesn't change its grammatical category, such as tenses in verbs (-ing, -ed, -s), plural in nouns (s).

What is derivation ?

derivations - adding a suffix to a word, that changes its grammatical category, such as nation (noun) => national (adjective) => nationalize (verb).

In [None]:
stemmer = SnowballStemmer("english")


def stemming(sentence):
    """Stem each whitespace-separated word of `sentence` with the Snowball
    stemmer and return the stems joined by single spaces."""
    stems = [stemmer.stem(word) for word in sentence.split()]
    return " ".join(stems).strip()


data['comment_text'] = data['comment_text'].apply(stemming)

# SPLITTING THE DATA

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split; random_state pins the shuffle.
train, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=1)

# Text column of each split, used as vectorizer input.
tr_text, te_text = train['comment_text'], test['comment_text']

# VECTORIZING THE DATA
#### TF-IDF Vectorizer

Some words are insignificant because they appear in almost every document, so we need to reduce their influence. To do so, we use TF-IDF:

<strong>TF - Term Frequency : </strong>

(How many times a term occurs in a document)

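<em><strong>tf(i, j) = n(i, j) / Σ<sub>k</sub> n(k, j)</strong></em>

where n(i, j) is the number of times term i occurs in document j, and the sum runs over all terms k in document j.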

<strong>IDF - Inverse Document frequency :</strong>

How common a word is across all documents,

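<em><strong>IDF(i) = log ( N / DF(i) )</strong></em>

where N is the total number of documents and DF(i) is the number of documents that contain term i.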

In [None]:
# Importing library
from sklearn.feature_extraction.text import TfidfVectorizer

# Defining the vectorizer: word-level 1- to 3-grams, accents stripped, rows L2-normalised
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# Fit on the training text ONLY.
# Calling fit() a second time on the test text does not extend the first fit:
# it replaces it, so the vocabulary and IDF weights would come from the test
# split alone (and leak test information into the features).
vectorizer.fit(tr_text)

# Transforming the training data; the labels are every column except id and text
x_train = vectorizer.transform(tr_text)
y_train = train.drop(labels = ['id','comment_text'], axis=1)

# Transforming the test data with the vocabulary learned from the training text
x_test = vectorizer.transform(te_text)
y_test = test.drop(labels = ['id','comment_text'], axis=1)

#### COUNT VECTORIZER
CountVectorizer takes the approach of bag of words.

This works as follows :

<strong>
1] Each word inside the document will be separated into tokens.

2] Assigning a weight to each token proportional to the frequency with which it shows up in the document and/or corpus.

3] Creating a document-term matrix with each row representing a document and each column addressing a token.
</strong>

<em>Example : doc = ['This is Count vectorizer']</em>

1st token = 'This'

2nd token = 'is' and so on....

With CountVectorizer, the number of times each token occurs in a document is then counted.

In [None]:
# Importing library
from sklearn.feature_extraction.text import CountVectorizer

# Defining the vectorizer: word-level 1- to 3-grams, accents stripped
cvectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3))

# Fit on the training text ONLY.
# Calling fit() a second time on the test text does not extend the first fit:
# it replaces it, so the vocabulary would come from the test split alone
# (and leak test information into the features).
cvectorizer.fit(tr_text)

# Transforming the training data; the labels are every column except id and text
cx_train = cvectorizer.transform(tr_text)
cy_train = train.drop(labels = ['id','comment_text'], axis=1)

# Transforming the test data with the vocabulary learned from the training text
cx_test = cvectorizer.transform(te_text)
cy_test = test.drop(labels = ['id','comment_text'], axis=1)

# Using scikit-multilearn library for multi-label classification

In [None]:
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |███▊                            | 10kB 15.5MB/s eta 0:00:01[K     |███████▍                        | 20kB 1.7MB/s eta 0:00:01[K     |███████████                     | 30kB 2.1MB/s eta 0:00:01[K     |██████████████▊                 | 40kB 2.5MB/s eta 0:00:01[K     |██████████████████▍             | 51kB 2.0MB/s eta 0:00:01[K     |██████████████████████          | 61kB 2.3MB/s eta 0:00:01[K     |█████████████████████████▊      | 71kB 2.4MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81kB 2.7MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 2.4MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
from sklearn.metrics import accuracy_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

# MODEL
#### Applying Tfidf in Binary Relevance(GaussianNB)
What is Binary Relevance ?

The data is split up into L data sets, where L is the number of labels. Each data set has a column in which either a 0 or a 1 is assigned to each instance, indicating the absence or presence of that label on that instance. A separate classifier is trained on each data set.

In [None]:
%%time

# using binary relevance
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
BR_clf_tfidf = BinaryRelevance(GaussianNB())

# train
BR_clf_tfidf.fit(x_train, y_train)

# predict
BR_clf_tfidf_predictions = BR_clf_tfidf.predict(x_test)

# accuracy
print(f" Accuracy : {accuracy_score(y_test, BR_clf_tfidf_predictions)*100:.2f} %")
print("\n")

 Accuracy : 90.31 %


CPU times: user 3.14 s, sys: 414 ms, total: 3.55 s
Wall time: 3.56 s


#### Applying Tfidf ClassifierChain(Logistic regression)

In [None]:
%%time

# using classifier chains
# initialize classifier chains multi-label classifier
CC_clf_tfidf = ClassifierChain(LogisticRegression(penalty = 'l2', C = 0.01))

# Training logistic regression model on train data
CC_clf_tfidf.fit(x_train, y_train)

# predict
CC_clf_tfidf_predictions = CC_clf_tfidf.predict(x_test)

# accuracy
print(f" Accuracy : {accuracy_score(y_test,CC_clf_tfidf_predictions)*100:.2f} %")
print("\n")

 Accuracy : 91.56 %


CPU times: user 9.13 s, sys: 813 ms, total: 9.94 s
Wall time: 6.31 s


In [None]:
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
%%time
BR_clf_bow = BinaryRelevance(GaussianNB())

# train
BR_clf_bow.fit(cx_train, cy_train)

# predict
BR_clf_bow_predictions = BR_clf_bow.predict(cx_test)

# accuracy
print(f" Accuracy : {accuracy_score(cy_test, BR_clf_bow_predictions)*100:.2f} %")
print("\n")

 Accuracy : 90.31 %


CPU times: user 4.22 s, sys: 33 ms, total: 4.25 s
Wall time: 4.26 s


#### Applying CountVectorizer in ClassifierChain(LogisticRegression)

In [None]:
%%time
CC_clf_bow = ClassifierChain(LogisticRegression())

# Training logistic regression model on train data
CC_clf_bow.fit(cx_train, cy_train)

# predict
CC_clf_bow_predictions = CC_clf_bow.predict(cx_test)

# accuracy
print(f" Accuracy : {accuracy_score(cy_test, CC_clf_bow_predictions)*100:.2f} %")
print("\n")

 Accuracy : 92.50 %


CPU times: user 23.6 s, sys: 1.81 s, total: 25.4 s
Wall time: 14.1 s


#### LOSS
The ‘hamming loss’ value ranges from 0 to 1. Because it is a loss metric, it is read the opposite way to accuracy: a lower hamming loss indicates a better classifier.

In [None]:
import sklearn.metrics as metrics

# Report the Hamming loss of each model on its test labels (lower is better).
hamming = metrics.hamming_loss

print(f" hamming loss for BR-TFIDF: {hamming(y_test, BR_clf_tfidf_predictions):.2f}")
print(f" hamming loss for CC-TFIDF: {hamming(y_test, CC_clf_tfidf_predictions):.2f}")
print(f" hamming loss for BR-BoW: {hamming(cy_test, BR_clf_bow_predictions):.2f}")
print(f" hamming loss for CC-BoW: {hamming(cy_test, CC_clf_bow_predictions):.2f}")

 hamming loss for BR-TFIDF: 0.04
 hamming loss for CC-TFIDF: 0.04
 hamming loss for BR-BoW: 0.04
 hamming loss for CC-BoW: 0.02


In [None]:
from prettytable import PrettyTable
x = PrettyTable()

x.field_names = ['Model', 'Vectorizer', 'Accuracy', 'Hamming loss']

# Compute the figures from the predictions instead of typing them in by hand,
# so the table always matches the models that were actually trained in this run.
# Each entry: (model label, vectorizer label, true labels, predicted labels).
results = [
    ('BinaryRelevance(GaussianNB)', 'Tfidf', y_test, BR_clf_tfidf_predictions),
    ('ClassifierChain(LogisticRegression)', 'Tfidf', y_test, CC_clf_tfidf_predictions),
    ('BinaryRelevance(GaussianNB)', 'BoW', cy_test, BR_clf_bow_predictions),
    ('ClassifierChain(LogisticRegression)', 'BoW', cy_test, CC_clf_bow_predictions),
]

for model_name, vectorizer_name, true_labels, predicted_labels in results:
    x.add_row([
        model_name,
        vectorizer_name,
        f"{accuracy_score(true_labels, predicted_labels)*100:.2f}%",
        f"{metrics.hamming_loss(true_labels, predicted_labels):.2f}",
    ])

x.padding_width = 5
print(x)

+---------------------------------------------+--------------------+------------------+----------------------+
|                    Model                    |     Vectorizer     |     Accuracy     |     Hamming loss     |
+---------------------------------------------+--------------------+------------------+----------------------+
|         BinaryRelevance(GaussianNB)         |       Tfidf        |      90.31%      |         0.04         |
|     ClassifierChain(LogisticRegression)     |       Tfidf        |      91.56%      |         0.04         |
|         BinaryRelevance(GaussianNB)         |        BoW         |      90.31%      |         0.04         |
|     ClassifierChain(LogisticRegression)     |        BoW         |      92.50%      |         0.02         |
+---------------------------------------------+--------------------+------------------+----------------------+
