In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import accuracy_score,hamming_loss
from sklearn.model_selection import train_test_split

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

In [None]:
# Load the pre-split train/test arrays from disk.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# these files if they come from a trusted source.
train_set_np, test_set_np = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ("./training_set.npy", "./testing_set.npy")
)

In [None]:
# Keep only the leading SAMPLE_FRACTION of each split to make experiments fast.
# NOTE(review): this takes the *first* rows, not a random sample — confirm the
# source arrays are already shuffled.
SAMPLE_FRACTION = 0.05

# .copy() makes each subsample an independent frame, so the column assignments
# performed in later cells do not operate on a slice of a temporary DataFrame
# (which triggers pandas' SettingWithCopyWarning).
train_set_pd = pd.DataFrame(train_set_np).iloc[:int(len(train_set_np) * SAMPLE_FRACTION)].copy()
test_set_pd = pd.DataFrame(test_set_np).iloc[:int(len(test_set_np) * SAMPLE_FRACTION)].copy()

In [None]:
# Sanity check: (rows, columns) of the training subsample.
train_set_pd.shape

(3357, 8)

In [None]:
# Sanity check: (rows, columns) of the test subsample.
test_set_pd.shape

(839, 8)

In [None]:
# Columns 5, 6 and 7 are used as the target columns further down; cast them to
# float in both splits so the estimators receive numeric labels.
for frame in (train_set_pd, test_set_pd):
    for label_col in (5, 6, 7):
        frame[label_col] = frame[label_col].astype(float)

In [None]:
# NLTK resources are built once on first use and then shared by every call.
# Loading the stop-word corpus and constructing a lemmatizer per review is
# wasteful when this function is applied row-by-row to a whole column.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on the first call."""
    if not _PREPROCESSING_CACHE:
        # frozenset gives O(1) membership tests instead of scanning a list.
        _PREPROCESSING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESSING_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['lemmatizer']


def data_preprocessing(review):
    """Clean a single review string for bag-of-words style vectorization.

    Steps, in order: strip HTML tags, replace every run of non-alphanumeric
    characters with a single space, lowercase, tokenize with NLTK, drop
    English stop words, lemmatize each remaining token, and re-join the
    tokens with single spaces.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, space-separated review.
    """
    stop_words, lemmatizer = _preprocessing_resources()

    # data cleaning
    review = re.sub('<.*?>', '', review)  # removing html tags
    review = re.sub('[^A-Za-z0-9]+', ' ', review)  # keep letters and digits only

    # lowercase (must happen before stop-word lookup: the stop list is lowercase)
    review = review.lower()

    # tokenization
    tokens = nltk.word_tokenize(review)

    # stop-word removal followed by lemmatization
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # join words in preprocessed review
    return ' '.join(kept)

In [None]:
# Column 0 holds the raw text; store its cleaned version next to it in both splits.
for frame in (train_set_pd, test_set_pd):
    frame['clean_text'] = frame[0].apply(data_preprocessing)

In [None]:
# TF-IDF features over the cleaned text. min_df=10 drops terms that appear in
# fewer than 10 training documents.
vectorizer = TfidfVectorizer(min_df=10)
# Fit the vocabulary/IDF weights on the training split only...
X_train = vectorizer.fit_transform(train_set_pd['clean_text'])
# ...and reuse them unchanged for the test split (no test-set leakage).
X_test = vectorizer.transform(test_set_pd['clean_text'])

In [None]:
# The three label columns (5, 6, 7) form the multi-label target matrix.
y_train, y_test = (frame[[5, 6, 7]] for frame in (train_set_pd, test_set_pd))

In [None]:
def build_model(model_L,mlb_estimator_L,xtrain,ytrain,xtest,ytest,random_state=None):
    """Train and evaluate every (base model, multi-label strategy) combination.

    For each pair, a fresh base classifier is wrapped in the chosen
    scikit-multilearn problem transformation, fitted on the training data,
    and scored on the test data. One result dict per combination is printed.

    Parameters
    ----------
    model_L : iterable of str
        Base model names. Supported: "lr" (logistic regression), "nb"
        (multinomial naive Bayes), "rf" (random forest), "svm" (linear SVC).
    mlb_estimator_L : iterable of str
        Multi-label strategy names. Supported: "binary_relevance",
        "classifier_chains", "labelpowerset".
    xtrain, ytrain : array-like
        Training features and multi-label targets.
    xtest, ytest : array-like
        Test features and multi-label targets.
    random_state : int or None, optional
        Seed passed to the random forest so its runs are repeatable. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If a model or strategy name is not one of the supported values.
    """
    # Factories (not instances) so every combination gets a fresh, unfitted model.
    model_factories = {
        "lr": lambda: LogisticRegression(penalty='l2',max_iter=1000),
        "nb": lambda: MultinomialNB(),
        "rf": lambda: RandomForestClassifier(random_state=random_state),
        "svm": lambda: svm.SVC(kernel='linear'),
    }
    mlb_estimators = {
        "binary_relevance": BinaryRelevance,
        # Previously mapped to BinaryRelevance, which made the
        # "classifier_chains" rows a duplicate of "binary_relevance".
        "classifier_chains": ClassifierChain,
        "labelpowerset": LabelPowerset,
    }

    # Validate all names before any (potentially slow) training starts, rather
    # than failing midway or silently reusing the previous iteration's model.
    model_L = list(model_L)
    mlb_estimator_L = list(mlb_estimator_L)
    unknown_models = [name for name in model_L if name not in model_factories]
    if unknown_models:
        raise ValueError(
            "Unknown model name(s) %r; expected one of %r"
            % (unknown_models, sorted(model_factories))
        )
    unknown_estimators = [name for name in mlb_estimator_L if name not in mlb_estimators]
    if unknown_estimators:
        raise ValueError(
            "Unknown multi-label estimator name(s) %r; expected one of %r"
            % (unknown_estimators, sorted(mlb_estimators))
        )

    for model_name in model_L:
        for mlb_estimator_name in mlb_estimator_L:
            model = model_factories[model_name]()
            clf = mlb_estimators[mlb_estimator_name](model)
            clf.fit(xtrain,ytrain)
            clf_predictions = clf.predict(xtest)

            acc = accuracy_score(ytest,clf_predictions)
            ham = hamming_loss(ytest,clf_predictions)
            # Keys are kept exactly as before (including the stray colons) so the
            # printed output format is unchanged. Note that "hamming_score" holds
            # the Hamming *loss*, i.e. lower is better.
            result = {"model_name:":model_name,"mlb_estimator":mlb_estimator_name,"accuracy:":acc,"hamming_score":ham}

            print (result)

    return

In [None]:
# Base classifiers to evaluate. "nb" is also understood by build_model but is
# not included in this run.
model_L = [
    "lr",
    "rf",
    "svm",
]
# Multi-label problem transformations to evaluate.
mlb_estimator_L = [
    "binary_relevance",
    "classifier_chains",
    "labelpowerset",
]

In [None]:
# Evaluate every model/strategy combination on the TF-IDF features.
build_model(model_L,mlb_estimator_L,X_train,y_train,X_test,y_test)

{'model_name:': 'lr', 'mlb_estimator': 'binary_relevance', 'accuracy:': 0.3909415971394517, 'hamming_score': 0.27691696464044496}
{'model_name:': 'lr', 'mlb_estimator': 'classifier_chains', 'accuracy:': 0.3909415971394517, 'hamming_score': 0.27691696464044496}
{'model_name:': 'lr', 'mlb_estimator': 'labelpowerset', 'accuracy:': 0.42193087008343266, 'hamming_score': 0.28843861740166865}
{'model_name:': 'rf', 'mlb_estimator': 'binary_relevance', 'accuracy:': 0.38498212157330153, 'hamming_score': 0.278108859753675}
{'model_name:': 'rf', 'mlb_estimator': 'classifier_chains', 'accuracy:': 0.4028605482717521, 'hamming_score': 0.2709574890742948}
{'model_name:': 'rf', 'mlb_estimator': 'labelpowerset', 'accuracy:': 0.42073897497020263, 'hamming_score': 0.29360349622566545}
{'model_name:': 'svm', 'mlb_estimator': 'binary_relevance', 'accuracy:': 0.38974970202622167, 'hamming_score': 0.288041319030592}
{'model_name:': 'svm', 'mlb_estimator': 'classifier_chains', 'accuracy:': 0.38974970202622167,

In [None]:
import gensim
import gensim.downloader as gensim_api

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors through the gensim downloader.
# NOTE(review): presumably this downloads a large model on first use and caches
# it afterwards — confirm against the gensim downloader docs before re-running.
embeddings = gensim_api.load("word2vec-google-news-300")

In [None]:
def word2vec(df,embeddings):
    """Embed each document in ``df[0]`` as the mean of its word vectors.

    Each document is lowercased, stripped of every character other than
    ``a-z`` and space, and split on whitespace. English stop words and words
    missing from ``embeddings`` are skipped; the remaining word vectors are
    averaged component-wise.

    Note that this reads the raw text in column 0, not a pre-cleaned column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``0`` holds the document strings.
    embeddings : mapping
        Word -> vector lookup supporting ``embeddings[word]`` and raising
        ``KeyError`` for unknown words (e.g. gensim ``KeyedVectors``).

    Returns
    -------
    pandas.DataFrame
        One row per document (fresh 0..n-1 index), one column per embedding
        dimension. A document with no known, non-stop-word tokens yields a
        row of NaN; callers must impute or drop those rows before fitting
        estimators that reject NaN.
    """
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))

    # regex=True is required: on pandas >= 2.0 str.replace treats the pattern
    # as a literal string by default, which would silently skip the cleaning.
    cleaned_docs = df[0].str.lower().str.replace('[^a-z ]', '', regex=True)

    # Collect plain arrays and build the DataFrame once at the end.
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    doc_vectors = []
    for doc in cleaned_docs:
        # Missing text (NaN) has no tokens; treat it like an empty document.
        tokens = doc.split() if isinstance(doc, str) else []
        word_vectors = []
        for word in tokens:
            if word in stop_words:
                continue
            try:
                word_vectors.append(embeddings[word])
            except KeyError:
                # Out-of-vocabulary word: skip it. Any other error propagates.
                continue
        doc_vectors.append(np.mean(word_vectors, axis=0) if word_vectors else None)

    if not doc_vectors:
        return pd.DataFrame()

    # Infer the embedding width from the first embedded document; fall back to
    # the embeddings' own vector_size attribute when no document had a hit.
    dim = next((len(vec) for vec in doc_vectors if vec is not None), None)
    if dim is None:
        dim = getattr(embeddings, 'vector_size', 0)

    rows = [vec if vec is not None else np.full(dim, np.nan) for vec in doc_vectors]
    return pd.DataFrame(np.vstack(rows))

In [None]:
# Build averaged word-vector features for both splits.
# Note: word2vec() reads the raw text in column 0, not the 'clean_text' column.
word2vec_train=word2vec(train_set_pd,embeddings)
word2vec_test=word2vec(test_set_pd,embeddings)

In [None]:
# Evaluate every model/strategy combination on the averaged word2vec features.
build_model(model_L,mlb_estimator_L,word2vec_train,y_train,word2vec_test,y_test)

{'model_name:': 'lr', 'mlb_estimator': 'binary_relevance', 'accuracy:': 0.3766388557806913, 'hamming_score': 0.2789034564958284}
{'model_name:': 'lr', 'mlb_estimator': 'classifier_chains', 'accuracy:': 0.3766388557806913, 'hamming_score': 0.2789034564958284}
{'model_name:': 'lr', 'mlb_estimator': 'labelpowerset', 'accuracy:': 0.4195470798569726, 'hamming_score': 0.2912197059992054}
{'model_name:': 'rf', 'mlb_estimator': 'binary_relevance', 'accuracy:': 0.4052443384982122, 'hamming_score': 0.2689709972189114}
{'model_name:': 'rf', 'mlb_estimator': 'classifier_chains', 'accuracy:': 0.3909415971394517, 'hamming_score': 0.27771156138259834}
{'model_name:': 'rf', 'mlb_estimator': 'labelpowerset', 'accuracy:': 0.43146603098927294, 'hamming_score': 0.28684942391736196}
{'model_name:': 'svm', 'mlb_estimator': 'binary_relevance', 'accuracy:': 0.39451728247914186, 'hamming_score': 0.2765196662693683}
{'model_name:': 'svm', 'mlb_estimator': 'classifier_chains', 'accuracy:': 0.39451728247914186, '