In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import string

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [None]:
# Load the serialized train/test arrays from the working directory.
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code -- only use it on files from a trusted source.
train_set_np, test_set_np = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ("./training_set.npy", "./testing_set.npy")
)

In [None]:
# Wrap each array in a DataFrame and keep only the first half of its rows.
train_set_pd = pd.DataFrame(train_set_np).iloc[:len(train_set_np) // 2]
test_set_pd = pd.DataFrame(test_set_np).iloc[:len(test_set_np) // 2]

In [None]:
# Sanity check: (rows, columns) of the training subset.
train_set_pd.shape

(33570, 8)

In [None]:
# Sanity check: (rows, columns) of the test subset.
test_set_pd.shape

(8393, 8)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

In [None]:
import neattext as nt
import neattext.functions as nfx

In [None]:
def preprocess(df):
    """Turn the text in column ``0`` of ``df`` into a dense TF-IDF matrix.

    Stop words are stripped from every document with neattext before the
    text is vectorized. A new ``TfidfVectorizer(min_df=10)`` is fitted on
    every call, so matrices returned by separate calls (e.g. one for train
    and one for test) have independent vocabularies and are not
    column-compatible with each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``0`` holds the documents
        (presumably plain strings -- confirm against the data loader).

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document.
    """
    # Series.apply returns a NEW Series; the result must be kept, otherwise
    # the stop-word removal is silently thrown away and TF-IDF runs on the
    # raw text.
    corpus = df[0].apply(nfx.remove_stopwords)
    tfidf = TfidfVectorizer(min_df=10)
    Xfeatures = tfidf.fit_transform(corpus).toarray()
    return Xfeatures

In [None]:
# Patterns compiled once and shared by every call to data_preprocessing.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9]+')

# Lazily filled cache so the stop-word corpus is read and the lemmatizer is
# built only once instead of once per review.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLP_RESOURCES:
        # frozenset gives O(1) membership tests instead of scanning a list.
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def data_preprocessing(review):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: strip HTML tags, replace every run of non-alphanumeric
    characters with a space, lowercase, tokenize with NLTK, drop English
    stop words, lemmatize each remaining token, and join with single spaces.

    NLTK's stop-word, WordNet and tokenizer data must already be available
    (presumably downloaded elsewhere -- not shown in this notebook).

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; empty if no token survives the filtering.
    """
    stop_words, lemmatizer = _get_nlp_resources()

    # Replace tags with a space, not '': otherwise "good<br />movie" would
    # be glued into the single bogus token "goodmovie".
    review = _HTML_TAG_RE.sub(' ', review)
    review = _NON_ALNUM_RE.sub(' ', review)

    review = review.lower()

    tokens = nltk.word_tokenize(review)

    # Filter stop words first, then lemmatize what is left.
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(lemmas)

In [None]:
# Add a 'clean_text' column with the normalized form of the raw text in column 0.
train_set_pd['clean_text'] = train_set_pd[0].apply(data_preprocessing)
test_set_pd['clean_text'] = test_set_pd[0].apply(data_preprocessing)

In [None]:
# TF-IDF features over the cleaned text; min_df=10 drops terms that appear in
# fewer than 10 training documents.
vectorizer = TfidfVectorizer(min_df=10)
# The vocabulary and IDF weights are learned from the training split only...
X_train = vectorizer.fit_transform(train_set_pd['clean_text'])
# ...and reused unchanged for the test split, so both matrices share columns.
X_test = vectorizer.transform(test_set_pd['clean_text'])

In [None]:
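# Disabled alternative: preprocess() fits a separate TfidfVectorizer on every
# call, so the train and test matrices would not share a vocabulary.
#X_train=preprocess(train_set_pd)
#X_test=preprocess(test_set_pd)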

In [None]:
# Cast label columns 5, 6 and 7 to float in both splits.
for _split_frame in (train_set_pd, test_set_pd):
    for _label_col in (5, 6, 7):
        _split_frame[_label_col] = _split_frame[_label_col].astype(float)

In [None]:
# Multi-label targets: columns 5, 6 and 7, one column per label
# (what each label represents is not shown in this notebook).
y_train = train_set_pd[[5,6,7]]
y_test = test_set_pd[[5,6,7]]

In [None]:
# Baseline: binary relevance with MultinomialNB as the base classifier.
binary_rel_clf = BinaryRelevance(MultinomialNB())

In [None]:
# Train the baseline on the TF-IDF features and the three label columns.
binary_rel_clf.fit(X_train,y_train)

BinaryRelevance(classifier=MultinomialNB(), require_dense=[True, True])

In [None]:
# Predict the label columns for the held-out test features.
br_prediction = binary_rel_clf.predict(X_test)

In [None]:
# For multi-label targets accuracy_score is subset accuracy: a sample counts
# as correct only when every one of its labels is predicted correctly.
accuracy_score(y_test,br_prediction)

0.41010365781007985

In [None]:
# Hamming loss: fraction of individual labels predicted wrongly (lower is better).
hamming_loss(y_test,br_prediction)

0.27066205965288537

In [None]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    """Fit a multi-label wrapper around ``model`` and score it on a test split.

    Parameters
    ----------
    model : estimator
        Base classifier handed to ``mlb_estimator``.
    mlb_estimator : callable
        Called as ``mlb_estimator(model)`` to build the multi-label
        classifier (e.g. ``ClassifierChain`` or ``LabelPowerset``).
    xtrain, ytrain
        Training features and label matrix passed to ``fit``.
    xtest, ytest
        Test features passed to ``predict`` and the true labels to score against.

    Returns
    -------
    dict
        ``{"accuracy:": <accuracy_score>, "hamming_score": <hamming_loss>}``.
        The first key really does end with a colon, and ``"hamming_score"``
        holds the Hamming *loss* (lower is better).
    """
    wrapped_clf = mlb_estimator(model)
    wrapped_clf.fit(xtrain, ytrain)
    predicted = wrapped_clf.predict(xtest)
    return {
        "accuracy:": accuracy_score(ytest, predicted),
        "hamming_score": hamming_loss(ytest, predicted),
    }

In [None]:
# ClassifierChain + MultinomialNB. build_model returns a metrics dict, so this
# variable holds scores rather than a fitted model.
clf_chain_model = build_model(MultinomialNB(),ClassifierChain,X_train,y_train,X_test,y_test)

In [None]:
# Metrics for ClassifierChain + MultinomialNB.
clf_chain_model

{'accuracy:': 0.4105802454426308, 'hamming_score': 0.2746336232574765}

In [None]:
# LabelPowerset + MultinomialNB; the result is a metrics dict, not a fitted model.
clf_labelP_model = build_model(MultinomialNB(),LabelPowerset,X_train,y_train,X_test,y_test)

In [None]:
# Metrics for LabelPowerset + MultinomialNB.
clf_labelP_model

{'accuracy:': 0.4321458358155606, 'hamming_score': 0.28575400135033163}

In [None]:
# ClassifierChain + L2-regularized LogisticRegression.
# Note: this rebinds clf_chain_model, replacing the MultinomialNB metrics
# previously stored under the same name.
clf_chain_model = build_model(LogisticRegression(penalty='l2',max_iter=1000),ClassifierChain,X_train,y_train,X_test,y_test)

In [None]:
# Metrics for ClassifierChain + LogisticRegression.
clf_chain_model

{'accuracy:': 0.45657095198379605, 'hamming_score': 0.25787362484610193}

In [None]:
# LabelPowerset + L2-regularized LogisticRegression.
# Note: this rebinds clf_labelP_model, replacing the MultinomialNB metrics
# previously stored under the same name.
clf_labelP_model = build_model(LogisticRegression(penalty='l2',max_iter=1000),LabelPowerset,X_train,y_train,X_test,y_test)

In [None]:
# Metrics for LabelPowerset + LogisticRegression. This cell used to display
# clf_chain_model, repeating the ClassifierChain numbers; re-run it to refresh
# the stored output.
clf_labelP_model

{'accuracy:': 0.45657095198379605, 'hamming_score': 0.25787362484610193}