# Simple Approach to Multi-Label Classification

## 1. EDA

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
#printmd('**bold**')

In [3]:
data_path = "./coding/with_multi_tags/leetcode.csv"

In [4]:
data_raw = pd.read_csv(data_path)
#data_raw = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data_raw.shape

(2114, 4)

In [5]:
print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
printmd("**Sample data:**")
data_raw.head()

Number of rows in data = 2114
Number of columns in data = 4




**Sample data:**

Unnamed: 0,problem_statement,titleslug,title,tags
0,Given an array of integers nums and an integer...,two-sum,Two Sum,"Array, Hash Table"
1,You are given two non-empty linked lists repre...,add-two-numbers,Add Two Numbers,"Linked List, Math, Recursion"
2,"Given a string s, find the length of the longe...",longest-substring-without-repeating-characters,Longest Substring Without Repeating Characters,"Hash Table, String, Sliding Window"
3,Given two sorted arrays nums1 and nums2 of siz...,median-of-two-sorted-arrays,Median of Two Sorted Arrays,"Array, Binary Search, Divide and Conquer"
4,"Given a string s, return the longest palindrom...",longest-palindromic-substring,Longest Palindromic Substring,"String, Dynamic Programming"


In [6]:
data_raw['tags'] = data_raw['tags'].str.split(', ')

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# Create an instance of MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Apply one-hot encoding on the tags column
tags_encoded = mlb.fit_transform(data_raw['tags'])

In [8]:
print(tags_encoded.shape)

(2114, 71)


In [9]:
# Get the unique list of tags in the same order filled by the MultiLabelBinarizer
unique_tags = list(mlb.classes_)

# Print the unique list of tags
print(unique_tags)


['Array', 'Backtracking', 'Biconnected Component', 'Binary Indexed Tree', 'Binary Search', 'Binary Search Tree', 'Binary Tree', 'Bit Manipulation', 'Bitmask', 'Brainteaser', 'Breadth-First Search', 'Bucket Sort', 'Combinatorics', 'Concurrency', 'Counting', 'Counting Sort', 'Data Stream', 'Database', 'Depth-First Search', 'Design', 'Divide and Conquer', 'Doubly-Linked List', 'Dynamic Programming', 'Enumeration', 'Eulerian Circuit', 'Game Theory', 'Geometry', 'Graph', 'Greedy', 'Hash Function', 'Hash Table', 'Heap (Priority Queue)', 'Interactive', 'Iterator', 'Line Sweep', 'Linked List', 'Math', 'Matrix', 'Memoization', 'Merge Sort', 'Minimum Spanning Tree', 'Monotonic Queue', 'Monotonic Stack', 'Number Theory', 'Ordered Set', 'Prefix Sum', 'Probability and Statistics', 'Queue', 'Quickselect', 'Radix Sort', 'Randomized', 'Recursion', 'Rejection Sampling', 'Reservoir Sampling', 'Rolling Hash', 'Segment Tree', 'Shell', 'Shortest Path', 'Simulation', 'Sliding Window', 'Sorting', 'Stack', 'S

In [10]:
tags_encoded


array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1]])

In [11]:
# Define the list of lists
# list_of_lists = data_raw['tags']

# # Flatten the list of lists into a single list using list comprehension
# flat_list = [item for sublist in list_of_lists for item in sublist]

# # Use set() to get unique entries
# unique_entries = set(flat_list)
# ue = list(unique_entries)
# unique_entries = ue

unique_entries = unique_tags
print(unique_entries)


['Array', 'Backtracking', 'Biconnected Component', 'Binary Indexed Tree', 'Binary Search', 'Binary Search Tree', 'Binary Tree', 'Bit Manipulation', 'Bitmask', 'Brainteaser', 'Breadth-First Search', 'Bucket Sort', 'Combinatorics', 'Concurrency', 'Counting', 'Counting Sort', 'Data Stream', 'Database', 'Depth-First Search', 'Design', 'Divide and Conquer', 'Doubly-Linked List', 'Dynamic Programming', 'Enumeration', 'Eulerian Circuit', 'Game Theory', 'Geometry', 'Graph', 'Greedy', 'Hash Function', 'Hash Table', 'Heap (Priority Queue)', 'Interactive', 'Iterator', 'Line Sweep', 'Linked List', 'Math', 'Matrix', 'Memoization', 'Merge Sort', 'Minimum Spanning Tree', 'Monotonic Queue', 'Monotonic Stack', 'Number Theory', 'Ordered Set', 'Prefix Sum', 'Probability and Statistics', 'Queue', 'Quickselect', 'Radix Sort', 'Randomized', 'Recursion', 'Rejection Sampling', 'Reservoir Sampling', 'Rolling Hash', 'Segment Tree', 'Shell', 'Shortest Path', 'Simulation', 'Sliding Window', 'Sorting', 'Stack', 'S

In [12]:
import pandas as pd

# Load the CSV file into a pandas DataFrame


# Define the 2D matrix with the new data

# Define the list of column names for the new data


# Create a new pandas DataFrame from the 2D matrix
new_df = pd.DataFrame(tags_encoded, columns=unique_entries)

# Concatenate the two DataFrames along axis 1 (columns)
merged_df = pd.concat([data_raw, new_df], axis=1)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('leetcode_mod.csv', index=False)




In [13]:
data_raw = merged_df

### 1.1. Checking for missing values

In [14]:
missing_values_check = data_raw.isnull().sum()
print(missing_values_check)

problem_statement    0
titleslug            0
title                0
tags                 0
Array                0
                    ..
Topological Sort     0
Tree                 0
Trie                 0
Two Pointers         0
Union Find           0
Length: 75, dtype: int64


### 1.2. Calculating number of comments under each label

In [15]:
# Comments with no label are considered to be clean comments.
# Creating seperate column in dataframe to identify clean comments.

# We use axis=1 to count row-wise and axis=0 to count column wise

rowSums = data_raw.iloc[:,2:].sum(axis=1)
clean_comments_count = (rowSums==0).sum(axis=0)

print("Total number of comments = ",len(data_raw))
print("Number of clean comments = ",clean_comments_count)
print("Number of comments with labels =",(len(data_raw)-clean_comments_count))

Total number of comments =  2114
Number of clean comments =  0
Number of comments with labels = 2114


  rowSums = data_raw.iloc[:,2:].sum(axis=1)


In [16]:
categories = list(data_raw.columns.values)
categories = categories[4:]
print(categories)

['Array', 'Backtracking', 'Biconnected Component', 'Binary Indexed Tree', 'Binary Search', 'Binary Search Tree', 'Binary Tree', 'Bit Manipulation', 'Bitmask', 'Brainteaser', 'Breadth-First Search', 'Bucket Sort', 'Combinatorics', 'Concurrency', 'Counting', 'Counting Sort', 'Data Stream', 'Database', 'Depth-First Search', 'Design', 'Divide and Conquer', 'Doubly-Linked List', 'Dynamic Programming', 'Enumeration', 'Eulerian Circuit', 'Game Theory', 'Geometry', 'Graph', 'Greedy', 'Hash Function', 'Hash Table', 'Heap (Priority Queue)', 'Interactive', 'Iterator', 'Line Sweep', 'Linked List', 'Math', 'Matrix', 'Memoization', 'Merge Sort', 'Minimum Spanning Tree', 'Monotonic Queue', 'Monotonic Stack', 'Number Theory', 'Ordered Set', 'Prefix Sum', 'Probability and Statistics', 'Queue', 'Quickselect', 'Radix Sort', 'Randomized', 'Recursion', 'Rejection Sampling', 'Reservoir Sampling', 'Rolling Hash', 'Segment Tree', 'Shell', 'Shortest Path', 'Simulation', 'Sliding Window', 'Sorting', 'Stack', 'S

In [17]:
# Calculating number of comments in each category

counts = []
for category in categories:
    counts.append((category, data_raw[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of comments'])
df_stats

Unnamed: 0,category,number of comments
0,Array,1150
1,Backtracking,78
2,Biconnected Component,1
3,Binary Indexed Tree,22
4,Binary Search,191
...,...,...
66,Topological Sort,25
67,Tree,157
68,Trie,32
69,Two Pointers,142


In [18]:
sns.set(font_scale = 2)
plt.figure(figsize=(15,8))

ax= sns.barplot(categories, data_raw.iloc[:,2:].sum().values)

plt.title("Comments in each category", fontsize=24)
plt.ylabel('Number of comments', fontsize=18)
plt.xlabel('Comment Type ', fontsize=18)

#adding the text labels
rects = ax.patches
labels = data_raw.iloc[:,2:].sum().values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)

plt.show()

TypeError: barplot() takes from 0 to 1 positional arguments but 2 were given

<Figure size 1500x800 with 0 Axes>

### 1.3. Calculating number of comments having multiple labels

In [19]:
rowSums = data_raw.iloc[:,2:].sum(axis=1)
multiLabel_counts = rowSums.value_counts()
multiLabel_counts = multiLabel_counts.iloc[1:]

sns.set(font_scale = 2)
plt.figure(figsize=(15,8))

ax = sns.barplot(multiLabel_counts.index, multiLabel_counts.values)

plt.title("Comments having multiple labels ")
plt.ylabel('Number of comments', fontsize=18)
plt.xlabel('Number of labels', fontsize=18)

#adding the text labels
rects = ax.patches
labels = multiLabel_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()

  rowSums = data_raw.iloc[:,2:].sum(axis=1)


TypeError: barplot() takes from 0 to 1 positional arguments but 2 were given

<Figure size 1500x800 with 0 Axes>

### 1.4. WordCloud representation of most used words in each category of comments

In [20]:
from wordcloud import WordCloud,STOPWORDS

plt.figure(figsize=(40,25))

# toxic
subset = data_raw[data_raw.toxic==1]
text = subset.comment_text.values
cloud_toxic = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          collocations=False,
                          width=2500,
                          height=1800
                         ).generate(" ".join(text))

plt.subplot(2, 3, 1)
plt.axis('off')
plt.title("Toxic",fontsize=40)
plt.imshow(cloud_toxic)


# severe_toxic
subset = data_raw[data_raw.severe_toxic==1]
text = subset.comment_text.values
cloud_severe_toxic = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          collocations=False,
                          width=2500,
                          height=1800
                         ).generate(" ".join(text))

plt.subplot(2, 3, 2)
plt.axis('off')
plt.title("Severe Toxic",fontsize=40)
plt.imshow(cloud_severe_toxic)


# obscene
subset = data_raw[data_raw.obscene==1]
text = subset.comment_text.values
cloud_obscene = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          collocations=False,
                          width=2500,
                          height=1800
                         ).generate(" ".join(text))

plt.subplot(2, 3, 3)
plt.axis('off')
plt.title("Obscene",fontsize=40)
plt.imshow(cloud_obscene)


# threat
subset = data_raw[data_raw.threat==1]
text = subset.comment_text.values
cloud_threat = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          collocations=False,
                          width=2500,
                          height=1800
                         ).generate(" ".join(text))

plt.subplot(2, 3, 4)
plt.axis('off')
plt.title("Threat",fontsize=40)
plt.imshow(cloud_threat)


# insult
subset = data_raw[data_raw.insult==1]
text = subset.comment_text.values
cloud_insult = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          collocations=False,
                          width=2500,
                          height=1800
                         ).generate(" ".join(text))

plt.subplot(2, 3, 5)
plt.axis('off')
plt.title("Insult",fontsize=40)
plt.imshow(cloud_insult)


# identity_hate
subset = data_raw[data_raw.identity_hate==1]
text = subset.comment_text.values
cloud_identity_hate = WordCloud(
                          stopwords=STOPWORDS,
                          background_color='black',
                          collocations=False,
                          width=2500,
                          height=1800
                         ).generate(" ".join(text))

plt.subplot(2, 3, 6)
plt.axis('off')
plt.title("Identity Hate",fontsize=40)
plt.imshow(cloud_identity_hate)

plt.show()

AttributeError: 'DataFrame' object has no attribute 'toxic'

<Figure size 4000x2500 with 0 Axes>

## 2. Data Pre-Processing

In [21]:
data = data_raw
# data = data_raw.loc[np.random.choice(data_raw.index, size=2100)]
data.shape

(2114, 75)

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

### 2.1. Cleaning Data

In [23]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [24]:
data['problem_statement'] = data['problem_statement'].str.lower()
data['problem_statement'] = data['problem_statement'].apply(cleanHtml)
data['problem_statement'] = data['problem_statement'].apply(cleanPunc)
data['problem_statement'] = data['problem_statement'].apply(keepAlpha)
data.head()

Unnamed: 0,problem_statement,titleslug,title,tags,Array,Backtracking,Biconnected Component,Binary Indexed Tree,Binary Search,Binary Search Tree,...,Stack,String,String Matching,Strongly Connected Component,Suffix Array,Topological Sort,Tree,Trie,Two Pointers,Union Find
0,given an array of integers nums and an integer...,two-sum,Two Sum,"[Array, Hash Table]",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,you are given two non empty linked lists repre...,add-two-numbers,Add Two Numbers,"[Linked List, Math, Recursion]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,given a string s find the length of the longes...,longest-substring-without-repeating-characters,Longest Substring Without Repeating Characters,"[Hash Table, String, Sliding Window]",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,given two sorted arrays nums and nums of siz...,median-of-two-sorted-arrays,Median of Two Sorted Arrays,"[Array, Binary Search, Divide and Conquer]",1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,given a string s return the longest palindromi...,longest-palindromic-substring,Longest Palindromic Substring,"[String, Dynamic Programming]",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### 2.2. Removing Stop Words

In [25]:
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['problem_statement'] = data['problem_statement'].apply(removeStopWords)
data.head()

Unnamed: 0,problem_statement,titleslug,title,tags,Array,Backtracking,Biconnected Component,Binary Indexed Tree,Binary Search,Binary Search Tree,...,Stack,String,String Matching,Strongly Connected Component,Suffix Array,Topological Sort,Tree,Trie,Two Pointers,Union Find
0,given array integers nums integer target r...,two-sum,Two Sum,"[Array, Hash Table]",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,given non empty linked lists representing ...,add-two-numbers,Add Two Numbers,"[Linked List, Math, Recursion]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,given string find length longest substrin...,longest-substring-without-repeating-characters,Longest Substring Without Repeating Characters,"[Hash Table, String, Sliding Window]",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,given sorted arrays nums nums size n re...,median-of-two-sorted-arrays,Median of Two Sorted Arrays,"[Array, Binary Search, Divide and Conquer]",1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,given string return longest palindromic sub...,longest-palindromic-substring,Longest Palindromic Substring,"[String, Dynamic Programming]",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### 2.3. Stemming

In [26]:
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['problem_statement'] = data['problem_statement'].apply(stemming)
data.head()

Unnamed: 0,problem_statement,titleslug,title,tags,Array,Backtracking,Biconnected Component,Binary Indexed Tree,Binary Search,Binary Search Tree,...,Stack,String,String Matching,Strongly Connected Component,Suffix Array,Topological Sort,Tree,Trie,Two Pointers,Union Find
0,given array integ num integ target return indi...,two-sum,Two Sum,"[Array, Hash Table]",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,given non empti link list repres non negat int...,add-two-numbers,Add Two Numbers,"[Linked List, Math, Recursion]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,given string find length longest substr withou...,longest-substring-without-repeating-characters,Longest Substring Without Repeating Characters,"[Hash Table, String, Sliding Window]",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,given sort array num num size n respect return...,median-of-two-sorted-arrays,Median of Two Sorted Arrays,"[Array, Binary Search, Divide and Conquer]",1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,given string return longest palindrom substr s,longest-palindromic-substring,Longest Palindromic Substring,"[String, Dynamic Programming]",0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### 2.4. Train-Test Split

In [27]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.2, shuffle=True)

print(train.shape)
print(test.shape)

(1691, 75)
(423, 75)


In [28]:
train_text = train['problem_statement']
test_text = test['problem_statement']

### 2.5. TF-IDF

In [29]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)
with open('./coding/code_vector.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [82]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['problem_statement','titleslug','title','tags'], axis=1)
# print(x_train)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['problem_statement','titleslug','title','tags'], axis=1)

## 3. Multi-Label Classification

### 3.1. Multiple Binary Classifications - (One Vs Rest Classifier)

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [84]:
%%time
import joblib
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),])

# Define a dictionary to store the models
# models = {}

for category in unique_entries:
    printmd('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    
    # Save the trained model for the current category
#     models[category] = LogReg_pipeline
    
#     for i in range(len(prediction)):
#         print("Question : ", test_text.iloc[i])
#         print("Predicted : ", prediction[i])
#         print("Actual : ",test[category].iloc[i], "\n")
#         print()
    print("\n")
    
# joblib.dump(models, './coding/code_model.joblib')

**Processing Array comments...**

Test accuracy is 0.8605200945626478




**Processing Backtracking comments...**

Test accuracy is 0.966903073286052




**Processing Biconnected Component comments...**

Test accuracy is 1.0




**Processing Binary Indexed Tree comments...**

Test accuracy is 0.9905437352245863




**Processing Binary Search comments...**

Test accuracy is 0.9054373522458629




**Processing Binary Search Tree comments...**

Test accuracy is 0.9905437352245863




**Processing Binary Tree comments...**

Test accuracy is 0.9456264775413712




**Processing Bit Manipulation comments...**

Test accuracy is 0.9456264775413712




**Processing Bitmask comments...**

Test accuracy is 0.9763593380614657




**Processing Brainteaser comments...**

Test accuracy is 0.9905437352245863




**Processing Breadth-First Search comments...**

Test accuracy is 0.9101654846335697




**Processing Bucket Sort comments...**

Test accuracy is 0.9929078014184397




**Processing Combinatorics comments...**

Test accuracy is 0.9881796690307328




**Processing Concurrency comments...**

Test accuracy is 0.9952718676122931




**Processing Counting comments...**

Test accuracy is 0.9598108747044918




**Processing Counting Sort comments...**

Test accuracy is 0.9976359338061466




**Processing Data Stream comments...**

Test accuracy is 0.9976359338061466




**Processing Database comments...**

Test accuracy is 0.9716312056737588




**Processing Depth-First Search comments...**

Test accuracy is 0.9196217494089834




**Processing Design comments...**

Test accuracy is 0.9645390070921985




**Processing Divide and Conquer comments...**

Test accuracy is 0.9810874704491725




**Processing Doubly-Linked List comments...**

Test accuracy is 1.0




**Processing Dynamic Programming comments...**

Test accuracy is 0.8156028368794326




**Processing Enumeration comments...**

Test accuracy is 0.983451536643026




**Processing Eulerian Circuit comments...**

Test accuracy is 0.9976359338061466




**Processing Game Theory comments...**

Test accuracy is 0.9858156028368794




**Processing Geometry comments...**

Test accuracy is 0.9858156028368794




**Processing Graph comments...**

Test accuracy is 0.9645390070921985




**Processing Greedy comments...**

Test accuracy is 0.8534278959810875




**Processing Hash Function comments...**

Test accuracy is 0.9905437352245863




**Processing Hash Table comments...**

Test accuracy is 0.8203309692671394




**Processing Heap (Priority Queue) comments...**

Test accuracy is 0.9479905437352246




**Processing Interactive comments...**

Test accuracy is 0.9976359338061466




**Processing Iterator comments...**

Test accuracy is 1.0




**Processing Line Sweep comments...**

Test accuracy is 0.9976359338061466




**Processing Linked List comments...**

Test accuracy is 0.983451536643026




**Processing Math comments...**

Test accuracy is 0.806146572104019




**Processing Matrix comments...**

Test accuracy is 0.9290780141843972




**Processing Memoization comments...**

Test accuracy is 0.9881796690307328




**Processing Merge Sort comments...**

Test accuracy is 0.9976359338061466




**Processing Minimum Spanning Tree comments...**

Test accuracy is 0.9976359338061466




**Processing Monotonic Queue comments...**

Test accuracy is 0.9952718676122931




**Processing Monotonic Stack comments...**

Test accuracy is 0.9810874704491725




**Processing Number Theory comments...**

Test accuracy is 0.983451536643026




**Processing Ordered Set comments...**

Test accuracy is 0.9763593380614657




**Processing Prefix Sum comments...**

Test accuracy is 0.9314420803782506




**Processing Probability and Statistics comments...**

Test accuracy is 0.9952718676122931




**Processing Queue comments...**

Test accuracy is 0.9905437352245863




**Processing Quickselect comments...**

Test accuracy is 0.9952718676122931




**Processing Radix Sort comments...**

Test accuracy is 0.9976359338061466




**Processing Randomized comments...**

Test accuracy is 0.9905437352245863




**Processing Recursion comments...**

Test accuracy is 0.9858156028368794




**Processing Rejection Sampling comments...**

Test accuracy is 0.9976359338061466




**Processing Reservoir Sampling comments...**

Test accuracy is 0.9952718676122931




**Processing Rolling Hash comments...**

Test accuracy is 0.9905437352245863




**Processing Segment Tree comments...**

Test accuracy is 0.9905437352245863




**Processing Shell comments...**

Test accuracy is 0.9976359338061466




**Processing Shortest Path comments...**

Test accuracy is 0.9976359338061466




**Processing Simulation comments...**

Test accuracy is 0.9621749408983451




**Processing Sliding Window comments...**

Test accuracy is 0.9739952718676123




**Processing Sorting comments...**

Test accuracy is 0.8747044917257684




**Processing Stack comments...**

Test accuracy is 0.9408983451536643




**Processing String comments...**

Test accuracy is 0.8936170212765957




**Processing String Matching comments...**

Test accuracy is 0.9881796690307328




**Processing Strongly Connected Component comments...**

Test accuracy is 0.9976359338061466




**Processing Suffix Array comments...**

Test accuracy is 0.9976359338061466




**Processing Topological Sort comments...**

Test accuracy is 0.9929078014184397




**Processing Tree comments...**

Test accuracy is 0.9361702127659575




**Processing Trie comments...**

Test accuracy is 0.9787234042553191




**Processing Two Pointers comments...**

Test accuracy is 0.9243498817966903




**Processing Union Find comments...**

Test accuracy is 0.9787234042553191


CPU times: total: 766 ms
Wall time: 6.19 s


In [None]:
# for loading joblib model which is trained for the catgory 
# but for that the data should be preprocessed and vectorized first using

vectorizer = ''
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

test_text = ['This is a new document to be transformed.']
x_test = vectorizer.transform(test_text)

# now this x_test can go under models[category].predict(x_test)

# Load the saved dictionary of models
models = joblib.load('LogReg_models.joblib')

# Make predictions using the model for a specific category
category = 'some_category'
prediction = models[category].predict(new_data)

# Make predictions for all categories
for category, model in models.items():
    prediction = model.predict(new_data)


### 3.2. Multiple Binary Classifications - (Binary Relevance)

In [37]:
%%time

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(classifier=MultinomialNB(alpha=1.0, class_prior=None,
                                         fit_prior=True),
                require_dense=[True, True])

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.07328605200945626


CPU times: total: 1min 1s
Wall time: 49.8 s


### 3.3. Classifier Chains

In [48]:
pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [38]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

In [39]:
%%time

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.07328605200945626


CPU times: total: 6min 13s
Wall time: 2min 59s


### 3.4. Label Powerset

In [40]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset

In [41]:
%%time

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.09929078014184398


CPU times: total: 9min 37s
Wall time: 11min 22s


### 3.5. Adapted Algorithm

In [42]:
# http://scikit.ml/api/api/skmultilearn.adapt.html#skmultilearn.adapt.MLkNN

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

In [57]:
%%time

classifier_new = MLkNN(k=10, s=0.5)
# print(classifier_new.get_params().keys())
# Note that this classifier can throw up errors when handling sparse matrices.

# x_train = lil_matrix(x_train).toarray()
# y_train = lil_matrix(y_train).toarray()
# x_test = lil_matrix(x_test).toarray()

# classifier_new.set_params(n_neighbors=10, algorithm='auto', metric='hamming')
# train
classifier_new.fit(x_train, y_train)

# predict
predictions_new = classifier_new.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))
print("\n")

TypeError: NearestNeighbors.__init__() takes 1 positional argument but 2 were given