## Baseline Tests

This notebook includes the classifiers that will be used to determine the performance of the deep learning-based classifier.

### Classifiers:

1) Naive Bayes

2) Support Vector Machines (SVM)

### Performance Measures:

1) Storage requirements of the classifier and feature representation used

2) Training time of the classifier

3) Speed of the classifier

4) Accuracy of the classifier

In [1]:
# nltk.download('punkt') #uncomment if running on new machine

In [2]:
# Importing the libraries
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.test.utils import datapath
from gensim.models import Word2Vec
import os
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize

# Helpful variables
EXT_DATA_FOLDER = "C:\\Users\\Admin\\Projects\\thesis\\data\\"
EXT_DATA_FOLDER2 = "B:\\Datasets\\"

ANALYSIS_SAMPLES = os.path.join(EXT_DATA_FOLDER, "Credibility_Analysis_Samples\\September_25\\")
dataset_columns = ['Identifier', 'Type', 'Category', 'URL', 'Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5',
 'Cat6', 'Cat7', 'Score', 'First date_time', 'Tweets', 'Likes', 'Retweets',
 'Potential exposure', 'HTML', 'TEXT']
criterias = ["Cat1", "Cat2", "Cat3", "Cat4", "Cat5", "Cat6", "Cat7"]




### Reading in csv and excel data

In [3]:
def create_dataset(corpus_path, annotated_samples):
    """
    Input: 
    corpus_path: Path for a CSV file containing a list of article URLs and its article text
    annotated_samples: Path of the excel file containing articles and its associated URL along with its labels
    
    Method:
    Retrieves the article text by matching the URLs within the corpus_path and annotated_samples and creates a dataframe 
    containing the URL, article text and the article's corresponding labels.
    
    Output:
    A pandas dataframe
    """
    article_corpus = pd.read_csv(corpus_path)
    annotated_corpus = pd.read_excel((annotated_samples))
    article_corpus.columns = ["URL", "HTML", "TEXT"]
    annotated_articles = annotated_corpus.loc[(annotated_corpus["Cat7"] == 0) | (annotated_corpus["Cat7"] == 1)]
    dataset = pd.merge(annotated_articles, article_corpus, how='left', on='URL')
    return dataset


In [128]:
article_corpus = pd.read_csv(corpus_path)
article_corpus.columns = ["URL", "HTML", "TEXT"]
print(article_corpus.shape)

print(article_corpus.loc[article_corpus["URL"] == "https://www.thestar.com/news/world/2017/05/07/anti-vaccine-activists-just-sparked-a-us-states-worst-measles-outbreak-in-decades.html"]["TEXT"])
print(article_corpus.loc[article_corpus["URL"] == "https://www.newscientist.com/article/mg23531335-800-cancer-vaccines-could-prime-our-own-bodies-to-fight-tumours/?utm_campaign=RSS%7CNSNS&utm_source=NSNS&utm_medium=RSS&campaign_id=RSS%7CNSNS-"])

(1116, 3)
65    TITLE: Anti-vaccine activists just sparked a U...
Name: TEXT, dtype: object
Empty DataFrame
Columns: [URL, HTML, TEXT]
Index: []


In [141]:
corpus_path = os.path.join(EXT_DATA_FOLDER, "url_text.csv")
excel_files = ["sample_third_adam_new.xlsx", "sample_third_amalie_new.xlsx", "sample_third_maryke_new.xlsx"]

df_files = []

for filename in excel_files:
    annotated_path = os.path.join(ANALYSIS_SAMPLES, filename)
    data = create_dataset(corpus_path, annotated_path)
    df_files.append(data)
    
dataset = pd.concat(df_files)

print(dataset.columns.values)
print(dataset.shape)

['Identifier' 'Type' 'Category' 'URL' 'Cat1' 'Cat2' 'Cat3' 'Cat4' 'Cat5'
 'Cat6' 'Cat7' 'Score' 'First date_time' 'Tweets' 'Likes' 'Retweets'
 'Potential exposure' 'HTML' 'TEXT']
(447, 19)


In [102]:
for filename in excel_files[1:]:
    print(filename)

September_13\sample_third_amalie_new.xlsx
September_13\sample_third_maryke_new.xlsx


In [5]:
#Example of article with missing text
print(dataset.head()["URL"][3])
print(dataset.head()["TEXT"][3])  
print(dataset.head()["HTML"])

http://triblive.com/news/healthnow/12759008-74/stronger-flu-vaccine-for-elderly-could-help-younger-adults-with-chronic-conditions
TITLE: Stronger flu vaccine for elderly could help younger adults with chronic conditions | TribLIVE
TEXT:     “Persons who have these conditions have a much greater risk of the flu being more severe to the point of needing to be hospitalized,” Dr. Ken Smith, professor of medicine at Pitt and co-author of the paper, told the Tribune-Review on Thursday. “If you are hospitalized with the flu, your risk of dying is certainly something that is a possibility.”  The high-dose vaccine is recommended for the elderly population because their immune response to the standard-dose vaccine lessens as they age. However, the price for a standard dose is about $11, while the stronger vaccine is about $31 per dose, Smith said. He said the dose for the elderly is about 24 percent stronger than a standard vaccine.   “The growing proportion of middle-aged adults with chronic he

In [143]:
#Save dataset locally
writer = pd.ExcelWriter("dataset3.xlsx")
dataset.to_excel(writer, "Sheet1")
writer.save()

In [3]:
#pre-processing
from collections import defaultdict

labelled_articles = pd.read_excel("dataset5.xlsx")
labelled_articles = labelled_articles.dropna(subset=['TEXT'])
labelled_articles = labelled_articles[pd.to_numeric(labelled_articles['Cat1'], errors='coerce').notnull()]

for criteria in criterias:
    labelled_articles = labelled_articles.dropna(subset=[criteria])

print(labelled_articles.shape)
art_text_sent = np.array([sent_tokenize(article.split("TITLE: ")[1].replace("TEXT: ","").strip(" ")) for article in labelled_articles["TEXT"]])
art_text_word = np.array([word_tokenize(article.split("TITLE: ")[1].replace("TEXT: ","").strip(" ")) for article in labelled_articles["TEXT"]])
art_text_sent_word = np.array([[word_tokenize(sent) for sent in article] for article in art_text_sent])
labels = [labelled_articles["Cat1"], labelled_articles["Cat2"], labelled_articles["Cat3"], labelled_articles["Cat4"], labelled_articles["Cat5"], labelled_articles["Cat6"], labelled_articles["Cat7"]]
labels = np.array(labels).transpose()
multi_labels = np.array([[int(x) for x in row] for row in labels])

(470, 21)


### Baseline classifier performance

Performance of classifier is measured using an f1_micro score:

'f1_micro': Calculate metrics globally by counting the total true positives, false negatives and false positives and accounts for class imbalance. [Source](https://stackoverflow.com/questions/43421456/computing-macro-f1-score-using-sklearn)

f1_micro scores are calculated using stratified k-fold cross validation for k = 10



In [17]:
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

import warnings
warnings.filterwarnings('ignore')

categories = ['Not Satisfied', 'Satisfied']
nb_bow = []
nb_tfidf = []
nb_w2v = []
svm_bow = []
svm_tfidf = []
svm_w2v = []

nb_optimal = {
    "Cat1": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')), 
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
    "Cat2": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict', 
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')), 
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
    "Cat3": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict', 
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')), 
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
    "Cat4": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')), 
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
    "Cat5": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')),
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
    "Cat6": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')), 
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
    "Cat7": Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english')), 
        ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
}

svm_optimal = {
    "Cat1": None,
    "Cat2": None,
    "Cat3": None,
    "Cat4": None,
    "Cat5": None,
    "Cat6": None,
    "Cat7": None
}

### NB Performance using gridsearch results

In [18]:
for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    nb_clf = nb_optimal[criteria]
    nb_clf.fit(X_train, y_train)
    nb_predicted = list(nb_clf.predict(X_test))
    #print("Actual vs. NB Predicted labels:\n" + str(y_test) + "\n" + str(nb_predicted))

    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))

    cv_scores = cross_val_score(nb_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("For " + criteria + ":")
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))
    nb_bow.append((cv_scores.mean(), cv_scores.std()))
"""
    svm_clf = svm_optimal[criteria]

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. SVM Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    #print(metrics.classification_report(y_test, svm_predicted, target_names=categories))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_bow.append((svm_cv_scores.mean(), svm_cv_scores.std()))

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),}
"""

#print(nb_f1_scores)
#print("====")
#print(svm_f1_scores)

For Cat1:
NB Average micro f1-score: 0.85 (+/- 0.05)
For Cat2:
NB Average micro f1-score: 0.77 (+/- 0.05)
For Cat3:
NB Average micro f1-score: 0.86 (+/- 0.03)
For Cat4:
NB Average micro f1-score: 0.88 (+/- 0.04)
For Cat5:
NB Average micro f1-score: 0.65 (+/- 0.09)
For Cat6:
NB Average micro f1-score: 0.78 (+/- 0.06)
For Cat7:
NB Average micro f1-score: 0.83 (+/- 0.03)


'\n    svm_clf = svm_optimal[criteria]\n\n    svm_clf.fit(X_train, y_train)\n    svm_predicted = list(svm_clf.predict(X_test))\n    #print("Actual vs. SVM Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))\n\n    #print(metrics.classification_report(y_test, svm_predicted, target_names=categories))\n\n    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring=\'f1_micro\')\n    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))\n    svm_bow.append((svm_cv_scores.mean(), svm_cv_scores.std()))\n\nparameters = {\'vect__ngram_range\': [(1, 1), (1, 2)],\n              \'tfidf__use_idf\': (True, False),\n              \'clf__alpha\': (1e-2, 1e-3),\n}\n'

### BoW performance with stopwords

In [4]:
for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    nb_clf = Pipeline([('vect', CountVectorizer()),
                         #('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),])
    nb_clf.fit(X_train, y_train)
    nb_predicted = list(nb_clf.predict(X_test))
    #print("Actual vs. NB Predicted labels:\n" + str(y_test) + "\n" + str(nb_predicted))

    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))

    cv_scores = cross_val_score(nb_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("For " + criteria + ":")
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))
    nb_bow.append((cv_scores.mean(), cv_scores.std()))

    svm_clf = Pipeline([('vect', CountVectorizer()),
                         #('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=69,
                                               max_iter=5, tol=None)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. SVM Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    #print(metrics.classification_report(y_test, svm_predicted, target_names=categories))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_bow.append((svm_cv_scores.mean(), svm_cv_scores.std()))

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

#print(nb_f1_scores)
#print("====")
#print(svm_f1_scores)

For Cat1:
NB Average micro f1-score: 0.84 (+/- 0.04)
SVM Average micro f1-score: 0.69 (+/- 0.20)
For Cat2:
NB Average micro f1-score: 0.76 (+/- 0.03)
SVM Average micro f1-score: 0.76 (+/- 0.08)
For Cat3:
NB Average micro f1-score: 0.85 (+/- 0.02)
SVM Average micro f1-score: 0.83 (+/- 0.05)
For Cat4:
NB Average micro f1-score: 0.88 (+/- 0.05)
SVM Average micro f1-score: 0.81 (+/- 0.10)
For Cat5:
NB Average micro f1-score: 0.63 (+/- 0.09)
SVM Average micro f1-score: 0.58 (+/- 0.08)
For Cat6:
NB Average micro f1-score: 0.78 (+/- 0.08)
SVM Average micro f1-score: 0.63 (+/- 0.17)
For Cat7:
NB Average micro f1-score: 0.82 (+/- 0.04)
SVM Average micro f1-score: 0.78 (+/- 0.05)


### BoW Performance without stopwords

In [5]:
for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    nb_clf = Pipeline([('vect', CountVectorizer(stop_words='english', analyzer='word')),
                         #('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),])
    nb_clf.fit(X_train, y_train)
    nb_predicted = list(nb_clf.predict(X_test))
    #print("Actual vs. NB Predicted labels:\n" + str(y_test) + "\n" + str(nb_predicted))

    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))

    cv_scores = cross_val_score(nb_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("For " + criteria + ":")
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))
    nb_bow.append((cv_scores.mean(), cv_scores.std()))

    svm_clf = Pipeline([('vect', CountVectorizer(stop_words='english', analyzer='word')),
                         #('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=69,
                                               max_iter=5, tol=None)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. SVM Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    #print(metrics.classification_report(y_test, svm_predicted, target_names=categories))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_bow.append((svm_cv_scores.mean(), svm_cv_scores.std()))

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

#print(nb_f1_scores)
#print("====")
#print(svm_f1_scores)

For Cat1:
NB Average micro f1-score: 0.86 (+/- 0.06)
SVM Average micro f1-score: 0.83 (+/- 0.04)
For Cat2:
NB Average micro f1-score: 0.76 (+/- 0.04)
SVM Average micro f1-score: 0.74 (+/- 0.05)
For Cat3:
NB Average micro f1-score: 0.86 (+/- 0.02)
SVM Average micro f1-score: 0.82 (+/- 0.06)
For Cat4:
NB Average micro f1-score: 0.89 (+/- 0.04)
SVM Average micro f1-score: 0.89 (+/- 0.06)
For Cat5:
NB Average micro f1-score: 0.67 (+/- 0.07)
SVM Average micro f1-score: 0.60 (+/- 0.06)
For Cat6:
NB Average micro f1-score: 0.78 (+/- 0.02)
SVM Average micro f1-score: 0.75 (+/- 0.05)
For Cat7:
NB Average micro f1-score: 0.81 (+/- 0.05)
SVM Average micro f1-score: 0.80 (+/- 0.06)


### TF-IDF Performance

In [6]:
for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    nb_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),])
    nb_clf.fit(X_train, y_train)
    nb_predicted = list(nb_clf.predict(X_test))
    #print("Actual vs. NB Predicted labels:\n" + str(y_test) + "\n" + str(nb_predicted))


    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))

    cv_scores = cross_val_score(nb_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("For " + criteria + ":")
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))
    nb_tfidf.append((cv_scores.mean(), cv_scores.std()))

    svm_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=69,
                                               max_iter=5, tol=None)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. SVM Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    #print(metrics.classification_report(y_test, svm_predicted, target_names=categories))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_tfidf.append((svm_cv_scores.mean(), svm_cv_scores.std()))

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

#print(nb_f1_scores)
#print("====")
#print(svm_f1_scores)

For Cat1:
NB Average micro f1-score: 0.74 (+/- 0.01)
SVM Average micro f1-score: 0.82 (+/- 0.05)
For Cat2:
NB Average micro f1-score: 0.64 (+/- 0.00)
SVM Average micro f1-score: 0.81 (+/- 0.03)
For Cat3:
NB Average micro f1-score: 0.87 (+/- 0.00)
SVM Average micro f1-score: 0.86 (+/- 0.03)
For Cat4:
NB Average micro f1-score: 0.79 (+/- 0.01)
SVM Average micro f1-score: 0.89 (+/- 0.03)
For Cat5:
NB Average micro f1-score: 0.54 (+/- 0.04)
SVM Average micro f1-score: 0.61 (+/- 0.06)
For Cat6:
NB Average micro f1-score: 0.76 (+/- 0.01)
SVM Average micro f1-score: 0.79 (+/- 0.05)
For Cat7:
NB Average micro f1-score: 0.84 (+/- 0.01)
SVM Average micro f1-score: 0.83 (+/- 0.03)


### TF-IDF Performance without stopwords

In [7]:
for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    nb_clf = Pipeline([('vect', CountVectorizer(stop_words='english', analyzer='word')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),])
    nb_clf.fit(X_train, y_train)
    nb_predicted = list(nb_clf.predict(X_test))
    #print("Actual vs. NB Predicted labels:\n" + str(y_test) + "\n" + str(nb_predicted))

    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))

    cv_scores = cross_val_score(nb_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("For " + criteria + ":")
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))
    nb_bow.append((cv_scores.mean(), cv_scores.std()))

    svm_clf = Pipeline([('vect', CountVectorizer(stop_words='english', analyzer='word')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=69,
                                               max_iter=5, tol=None)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. SVM Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    #print(metrics.classification_report(y_test, svm_predicted, target_names=categories))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_bow.append((svm_cv_scores.mean(), svm_cv_scores.std()))

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

#print(nb_f1_scores)
#print("====")
#print(svm_f1_scores)

For Cat1:
NB Average micro f1-score: 0.74 (+/- 0.01)
SVM Average micro f1-score: 0.84 (+/- 0.04)
For Cat2:
NB Average micro f1-score: 0.65 (+/- 0.02)
SVM Average micro f1-score: 0.78 (+/- 0.04)
For Cat3:
NB Average micro f1-score: 0.86 (+/- 0.01)
SVM Average micro f1-score: 0.86 (+/- 0.03)
For Cat4:
NB Average micro f1-score: 0.79 (+/- 0.01)
SVM Average micro f1-score: 0.90 (+/- 0.04)
For Cat5:
NB Average micro f1-score: 0.59 (+/- 0.04)
SVM Average micro f1-score: 0.61 (+/- 0.06)
For Cat6:
NB Average micro f1-score: 0.76 (+/- 0.01)
SVM Average micro f1-score: 0.77 (+/- 0.03)
For Cat7:
NB Average micro f1-score: 0.84 (+/- 0.01)
SVM Average micro f1-score: 0.82 (+/- 0.04)


### LinearSVC performance using BoW/TFIDF with and without stopwords

#### LinearSVC + BoW with stopwords

In [25]:
from sklearn.svm import LinearSVC

for criteria in criterias:
    print("For " + criteria + ":")
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    svm_clf = Pipeline([('vect', CountVectorizer()),
                         #('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. LinearSVC Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("LinearSVC Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))

For Cat1:
LinearSVC Average micro f1-score: 0.80 (+/- 0.09)
For Cat2:
LinearSVC Average micro f1-score: 0.76 (+/- 0.06)
For Cat3:
LinearSVC Average micro f1-score: 0.88 (+/- 0.03)
For Cat4:
LinearSVC Average micro f1-score: 0.88 (+/- 0.07)
For Cat5:
LinearSVC Average micro f1-score: 0.76 (+/- 0.10)
For Cat6:
LinearSVC Average micro f1-score: 0.76 (+/- 0.10)
For Cat7:
LinearSVC Average micro f1-score: 0.82 (+/- 0.11)


#### LinearSVC + TF-IDF with stopwords

In [26]:
for criteria in criterias:
    print("For " + criteria + ":")
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    svm_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. LinearSVC Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("LinearSVC Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))

For Cat1:
LinearSVC Average micro f1-score: 0.84 (+/- 0.06)
For Cat2:
LinearSVC Average micro f1-score: 0.84 (+/- 0.06)
For Cat3:
LinearSVC Average micro f1-score: 0.89 (+/- 0.02)
For Cat4:
LinearSVC Average micro f1-score: 0.88 (+/- 0.04)
For Cat5:
LinearSVC Average micro f1-score: 0.78 (+/- 0.07)
For Cat6:
LinearSVC Average micro f1-score: 0.76 (+/- 0.07)
For Cat7:
LinearSVC Average micro f1-score: 0.82 (+/- 0.04)


#### LinearSVC + BoW without stopwords

In [27]:
for criteria in criterias:
    print("For " + criteria + ":")
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    svm_clf = Pipeline([('vect', CountVectorizer(stop_words='english', analyzer='word')),
                         #('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. LinearSVC Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("LinearSVC Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))

For Cat1:
LinearSVC Average micro f1-score: 0.83 (+/- 0.08)
For Cat2:
LinearSVC Average micro f1-score: 0.83 (+/- 0.09)
For Cat3:
LinearSVC Average micro f1-score: 0.90 (+/- 0.04)
For Cat4:
LinearSVC Average micro f1-score: 0.87 (+/- 0.06)
For Cat5:
LinearSVC Average micro f1-score: 0.80 (+/- 0.09)
For Cat6:
LinearSVC Average micro f1-score: 0.80 (+/- 0.11)
For Cat7:
LinearSVC Average micro f1-score: 0.79 (+/- 0.06)


#### LinearSVC + TF-IDF without stopwords

In [28]:
for criteria in criterias:
    print("For " + criteria + ":")
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    svm_clf = Pipeline([('vect', CountVectorizer(stop_words='english', analyzer='word')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)),
    ])

    svm_clf.fit(X_train, y_train)
    svm_predicted = list(svm_clf.predict(X_test))
    #print("Actual vs. LinearSVC Predicted labels:\n" + str(y_test) + "\n" + str(svm_predicted))

    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, cv=10, scoring='f1_micro')
    print("LinearSVC Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))

For Cat1:
LinearSVC Average micro f1-score: 0.80 (+/- 0.07)
For Cat2:
LinearSVC Average micro f1-score: 0.82 (+/- 0.03)
For Cat3:
LinearSVC Average micro f1-score: 0.90 (+/- 0.02)
For Cat4:
LinearSVC Average micro f1-score: 0.86 (+/- 0.04)
For Cat5:
LinearSVC Average micro f1-score: 0.76 (+/- 0.02)
For Cat6:
LinearSVC Average micro f1-score: 0.75 (+/- 0.09)
For Cat7:
LinearSVC Average micro f1-score: 0.81 (+/- 0.04)


In [8]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.items())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [9]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.items())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [10]:
with open(os.path.join(EXT_DATA_FOLDER2, "glove.6B.300d.txt"), "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}

In [11]:
w2v_df = pd.DataFrame.from_dict(w2v, orient='index')
w2v_df.head()[0]

b'the'    <map object at 0x00000127D8974390>
b','      <map object at 0x00000127D8974C18>
b'.'      <map object at 0x00000127D8974B00>
b'of'     <map object at 0x00000127D8974A20>
b'to'     <map object at 0x00000127D8974EB8>
Name: 0, dtype: object

### Word2Vec with Mean Embedding Vectorizer

In [None]:
from gensim.sklearn_api import W2VTransformer

for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    print("For " + criteria + ":")
    
    
    nb_clf = Pipeline([('w2v', MeanEmbeddingVectorizer(w2v)),
                         ('clf', MultinomialNB()),])
    nb_clf.fit(X_train, y_train)

    nb_predicted = nb_clf.predict(X_test)
    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))
    nb_cv_scores = cross_val_score(nb_clf, X_train, y_train, scoring='f1_micro')
    
    
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (nb_cv_scores.mean(), nb_cv_scores.std()))
    nb_w2v.append((nb_cv_scores.mean(), nb_cv_scores.std()))
    
    
    svm_clf = Pipeline([('w2v', MeanEmbeddingVectorizer(w2v)),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=69,
                                               max_iter=5, tol=None)),
    ])
    
    svm_clf.fit(X_train, y_train)
    svm_predicted = svm_clf.predict(X_test)
    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_w2v.append((svm_cv_scores.mean(), svm_cv_scores.std()))
    


For Cat1:
NB Average micro f1-score: 0.79 (+/- 0.00)
SVM Average micro f1-score: 0.60 (+/- 0.28)
For Cat2:
NB Average micro f1-score: 0.79 (+/- 0.00)
SVM Average micro f1-score: 0.60 (+/- 0.27)
For Cat3:
NB Average micro f1-score: 0.92 (+/- 0.00)
SVM Average micro f1-score: 0.92 (+/- 0.00)
For Cat4:
NB Average micro f1-score: 0.81 (+/- 0.00)
SVM Average micro f1-score: 0.60 (+/- 0.29)
For Cat5:
NB Average micro f1-score: 0.76 (+/- 0.00)
SVM Average micro f1-score: 0.76 (+/- 0.00)
For Cat6:
NB Average micro f1-score: 0.66 (+/- 0.00)
SVM Average micro f1-score: 0.56 (+/- 0.15)
For Cat7:
NB Average micro f1-score: 0.81 (+/- 0.00)
SVM Average micro f1-score: 0.60 (+/- 0.29)


### Word2Vec with Tf-idf Weighted Vectorizer

In [12]:
from gensim.sklearn_api import W2VTransformer

for criteria in criterias:
    
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    print("For " + criteria + ":")
    
    
    nb_clf = Pipeline([('w2v', TfidfEmbeddingVectorizer(w2v)),
                         ('clf', MultinomialNB()),])
    nb_clf.fit(X_train, y_train)

    nb_predicted = nb_clf.predict(X_test)
    #print(metrics.classification_report(y_test, nb_predicted, target_names=categories))
    nb_cv_scores = cross_val_score(nb_clf, X_train, y_train, scoring='f1_micro')
    
    
    print("NB Average micro f1-score: %0.2f (+/- %0.2f)" % (nb_cv_scores.mean(), nb_cv_scores.std()))
    nb_w2v.append((nb_cv_scores.mean(), nb_cv_scores.std()))

    
    svm_clf = Pipeline([('w2v', TfidfEmbeddingVectorizer(w2v)),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, random_state=69,
                                               max_iter=5, tol=None)),
    ])
    
    svm_clf.fit(X_train, y_train)
    svm_predicted = svm_clf.predict(X_test)
    svm_cv_scores = cross_val_score(svm_clf, X_train, y_train, scoring='f1_micro')
    print("SVM Average micro f1-score: %0.2f (+/- %0.2f)" % (svm_cv_scores.mean(), svm_cv_scores.std()))
    svm_w2v.append((svm_cv_scores.mean(), svm_cv_scores.std()))
    


For Cat1:
NB Average micro f1-score: 0.80 (+/- 0.00)
SVM Average micro f1-score: 0.61 (+/- 0.28)
For Cat2:
NB Average micro f1-score: 0.78 (+/- 0.01)
SVM Average micro f1-score: 0.60 (+/- 0.26)
For Cat3:
NB Average micro f1-score: 0.89 (+/- 0.00)
SVM Average micro f1-score: 0.89 (+/- 0.00)
For Cat4:
NB Average micro f1-score: 0.81 (+/- 0.00)
SVM Average micro f1-score: 0.60 (+/- 0.29)
For Cat5:
NB Average micro f1-score: 0.79 (+/- 0.00)
SVM Average micro f1-score: 0.79 (+/- 0.00)
For Cat6:
NB Average micro f1-score: 0.69 (+/- 0.01)
SVM Average micro f1-score: 0.56 (+/- 0.18)
For Cat7:
NB Average micro f1-score: 0.83 (+/- 0.01)
SVM Average micro f1-score: 0.83 (+/- 0.01)


In [68]:
print("type of twent_test.data: ", type(twenty_train.target))
print(len(art_text))
print(twenty_test.target)

type of twent_test.data:  <class 'numpy.ndarray'>
66
[2 2 2 ... 2 2 1]


In [389]:
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            items = line.rstrip().split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            values = np.array([float(x) for x in items[1:]], 'f')
            rows.append(values)
    
    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

In [366]:
#Loading pre-trained word2vec
pre_word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(datapath(os.path.join(EXT_DATA_FOLDER, "GoogleNews-vectors-negative300.bin")), binary=True)
pre_word2vec_model.save("pre_word2vec.model")

In [73]:
print("Similarity of 'woman' and 'man': ", pre_word2vec_model.similarity('woman', 'man'))
print("Similarity of 'woman' and 'woman': ", pre_word2vec_model.similarity('woman', 'woman'))
print("Similarity of 'dog' and 'hotdog': ", pre_word2vec_model.similarity('dog', 'hotdog'))

Similarity of 'woman' and 'man':  0.76640123
Similarity of 'woman' and 'woman':  1.0
Similarity of 'dog' and 'hotdog':  0.38931656


  if np.issubdtype(vec.dtype, np.int):


### Using Gridsearch to find optimal hyperparameters

In [32]:
#NB Parameters

nb_parameters = {  
    'vec__max_df': (0.5, 0.625, 0.75, 0.875, 1.0),  
    'vec__max_features': (None, 5000, 10000, 20000),  
    'vec__min_df': (1, 5, 10, 20, 50),  
    'tfidf__use_idf': (True, False),  
    'tfidf__sublinear_tf': (True, False),  
    'vec__binary': (True, False),  
    'tfidf__norm': ('l1', 'l2'),  
    'clf__alpha': [10 ** x for x in range(-5, 1)]  
    }

#SVM Parameters 

svm_parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

best_model = defaultdict(str)

In [None]:
for criteria in criterias:
    print("Currently finding best model for: " + criteria)
    X_train, X_test, y_train, y_test = train_test_split(list(labelled_articles["TEXT"]), list(labelled_articles[criteria]), test_size=int(20))

    nb_pipeline = Pipeline([('vec', CountVectorizer(stop_words='english', analyzer='word')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),])
    nb_grid_search = GridSearchCV(estimator=nb_pipeline, param_grid=nb_parameters, cv=10, n_jobs=-1, verbose=2)  
    nb_grid_search.fit(X_train, y_train)
    best_model[criteria] = nb_grid_search
    #nb_predicted = list(nb_grid_search.predict(X_test))
   

Currently finding best model for: Cat1
Fitting 10 folds for each of 9600 candidates, totalling 96000 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 9097 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 10352 tasks      | elapsed: 10.7