### Multi label Baseline Tests
This notebook contains the baseline classifiers against which the performance of the deep learning-based classifier will be compared in the multi-label scenario.

Multi label classification is being considered for the following reasons:

- It is one of the main ways to perform a classification task

- It is hypothesised that a multi-label classifier will require drastically less training time (training 1 multi-label classifier instead of 7 binary classifiers) for only a slight drop in the classifier's performance (a multi-label task is more complex - but how much more complex can it be than a binary classification task?)

### Classifiers:

1) Naive Bayes

2) Support Vector Machines (SVM)

### Performance Measures:

1) Storage requirements of the classifier and feature representation used

2) Training time of the classifier

3) Speed of the classifier

4) Accuracy of the classifier

In [2]:
# Importing the libraries
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.test.utils import datapath
from gensim.models import Word2Vec
import os

# Helpful variables
# Root folders for data kept outside the notebook directory.
# NOTE(review): these are machine-specific absolute Windows paths.
EXT_DATA_FOLDER = "C:\\Users\\Admin\\Projects\\thesis\\data\\"
EXT_DATA_FOLDER2 = "B:\\Datasets\\"

# Sub-folder of EXT_DATA_FOLDER holding the annotated sample spreadsheets.
ANALYSIS_SAMPLES = os.path.join(
    EXT_DATA_FOLDER,
    "Credibility_Analysis_Samples\\September_25\\",
)

# 19 column names; presumably the layout of the merged dataset - TODO confirm.
dataset_columns = [
    'Identifier', 'Type', 'Category', 'URL',
    'Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5', 'Cat6', 'Cat7',
    'Score',
    'First date_time', 'Tweets', 'Likes', 'Retweets', 'Potential exposure',
    'HTML', 'TEXT',
]



In [3]:
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
# nltk.download('punkt') #uncomment if running on new machine

In [None]:
def create_dataset(corpus_path, annotated_samples):
    """
    Build a labelled-article dataframe by joining annotations with article text.

    Input:
    corpus_path: Path of a CSV file whose three columns are relabelled to
                 "URL", "HTML" and "TEXT" (in that order) after loading.
    annotated_samples: Path of an Excel file with at least a "URL" column and
                       a "Cat7" column.

    Method:
    Keeps only the annotated rows whose "Cat7" value equals 0 or 1, then
    left-joins them with the corpus on "URL". Annotated rows without a
    matching corpus URL are kept, with missing "HTML"/"TEXT" values.

    Output:
    A pandas dataframe with the annotation columns followed by "HTML" and "TEXT".
    """
    corpus_frame = pd.read_csv(corpus_path)
    annotation_frame = pd.read_excel(annotated_samples)

    # Positional relabelling of the corpus columns.
    corpus_frame.columns = ["URL", "HTML", "TEXT"]

    # Boolean mask selecting rows where Cat7 is exactly 0 or 1.
    cat7_values = annotation_frame["Cat7"]
    has_cat7_label = cat7_values.eq(0) | cat7_values.eq(1)
    labelled_rows = annotation_frame.loc[has_cat7_label]

    return pd.merge(labelled_rows, corpus_frame, how='left', on='URL')


In [None]:
# Article corpus (URL -> text) and the three annotators' spreadsheets.
corpus_path = os.path.join(EXT_DATA_FOLDER, "url_text.csv")
excel_files = [
    "sample_third_adam_new.xlsx",
    "sample_third_amalie_new.xlsx",
    "sample_third_maryke_new.xlsx",
]

# One labelled dataframe per spreadsheet, in the order listed above.
df_files = [
    create_dataset(corpus_path, os.path.join(ANALYSIS_SAMPLES, workbook_name))
    for workbook_name in excel_files
]

# Stack the per-spreadsheet frames row-wise; each frame keeps its own index.
dataset = pd.concat(df_files)

print(dataset.columns.values)
print(dataset.shape)

In [None]:
# Save dataset locally
# The context manager writes and closes the workbook on exit, even if
# to_excel raises. ExcelWriter.save() was removed in pandas 2.0, so it is not
# called explicitly. sheet_name is passed by keyword because positional use is
# deprecated in recent pandas.
with pd.ExcelWriter("multi_dataset.xlsx") as writer:
    dataset.to_excel(writer, sheet_name="Sheet1")

In [4]:
#pre-processing
from collections import defaultdict


def _clean_article_text(article):
    """Return the piece of `article` after the first "TITLE: " marker (up to
    the next such marker, if any), with every "TEXT: " marker removed and
    leading/trailing spaces stripped.

    Raises IndexError when `article` contains no "TITLE: " marker.
    """
    return article.split("TITLE: ")[1].replace("TEXT: ","").strip(" ")


labelled_articles = pd.read_excel("dataset3.xlsx")
# Articles without text cannot be tokenised, so drop them up front.
labelled_articles = labelled_articles.dropna(subset=['TEXT'])
print(labelled_articles.shape)
criterias = ["Cat1", "Cat2", "Cat3", "Cat4", "Cat5", "Cat6", "Cat7"]

# Clean each article once and reuse the result for both tokenisations.
cleaned_articles = [_clean_article_text(article) for article in labelled_articles["TEXT"]]

# The token lists differ in length per article, so the arrays are ragged.
# dtype=object is required: NumPy >= 1.24 raises ValueError when asked to
# build an array from ragged nested sequences without it.
art_text_sent = np.array([sent_tokenize(text) for text in cleaned_articles], dtype=object)
art_text_word = np.array([word_tokenize(text) for text in cleaned_articles], dtype=object)
art_text_sent_word = np.array(
    [[word_tokenize(sent) for sent in article] for article in art_text_sent],
    dtype=object,
)

(208, 19)


In [22]:
# Stack the criterion columns (Cat1..Cat7, as listed in `criterias`) and
# transpose so that each row holds the labels of one article.
labels = np.array([labelled_articles[criterion] for criterion in criterias]).T
# Convert every label to a plain Python int, one list per article.
multi_labels = [list(map(int, article_labels)) for article_labels in labels]
print(multi_labels[0])

[1, 1, 0, 1, 0, 0, 1]


In [None]:
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Two class names; presumably display labels for the 0/1 targets - TODO confirm.
categories = ['Not Satisfied', 'Satisfied']

# Six independent empty lists. Presumably result holders per classifier
# (nb / svm) and feature representation (bow / tfidf / w2v) - confirm against
# the cells that fill them.
nb_bow, nb_tfidf, nb_w2v = [], [], []
svm_bow, svm_tfidf, svm_w2v = [], [], []