## Baseline Tests

This notebook contains the baseline classifiers whose results serve as the reference point for evaluating the performance of the deep learning-based classifier.

### Classifiers:

1) Naive Bayes

2) Support Vector Machines (SVM)

### Performance Measures:

1) Storage requirements of the classifier and feature representation used

2) Training time of the classifier

3) Speed of the classifier

4) Accuracy of the classifier

In [2]:
# Importing the libraries
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.test.utils import datapath
from gensim.models import Word2Vec
import os


# Helpful variables
# Root folder that holds the external data files read by this notebook.
# It can be overridden through the THESIS_DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the original
# hard-coded Windows folder is used unchanged.
EXT_DATA_FOLDER = os.environ.get("THESIS_DATA_DIR", "C:\\Users\\Admin\\Projects\\thesis\\data\\")



In [None]:
##Testing cell

### Using gensim for word2vec

#### Inputs
Requires a sequence of sentences, where each sentence is a list of words:
E.g. "Hi there. Goodbye there" -> [["Hi", "there"], ["Goodbye", "there"]]

In [24]:
# Load the pre-trained Google News word2vec vectors (binary word2vec format).
# The path is built directly from EXT_DATA_FOLDER. gensim's `datapath` helper
# resolves names inside gensim's own bundled test-data folder and only worked
# here because the data folder happened to be an absolute path.
word2vec_path = os.path.join(EXT_DATA_FOLDER, "GoogleNews-vectors-negative300.bin")
pre_word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [13]:
# Sanity check of the loaded vectors: print the similarity score for a few word pairs.
similarity_checks = [
    ("Similarity of 'woman' and 'man': ", "woman", "man"),
    ("Similarity of 'woman' and 'woman': ", "woman", "woman"),
    ("Similarity of 'dog' and 'hotdog': ", "dog", "hotdog"),
]
for label, left_word, right_word in similarity_checks:
    print(label, pre_word2vec_model.similarity(left_word, right_word))

Similarity of 'woman' and 'man':  0.76640123
Similarity of 'woman' and 'woman':  1.0
Similarity of 'dog' and 'hotdog':  0.38931656


  if np.issubdtype(vec.dtype, np.int):


### Reading in csv and excel data

In [2]:
# Read the article corpus (CSV) and the annotated sample (Excel) from EXT_DATA_FOLDER.
article_corpus = pd.read_csv(os.path.join(EXT_DATA_FOLDER, "url_text.csv"))
# Each path component is passed separately so os.path.join chooses the separator,
# instead of embedding Windows-only backslashes inside one component.
annotated_corpus = pd.read_excel(
    os.path.join(EXT_DATA_FOLDER, "Credibility_Analysis_Samples", "September_13", "sample_third_adam_new.xlsx"),
    header=1,  # column names are taken from the second row of the sheet
)
# Give the three corpus columns explicit names; "URL" is the key shared with the annotations.
article_corpus.columns = ["URL", "HTML", "TEXT"]
print("Column names for article_corpus: ", article_corpus.columns.values)
print("==========")
print("Column names for annotated_corpus: ", annotated_corpus.columns.values)

# Keep only the rows whose "Cat7" value is 0 or 1; any other value (or a blank) is dropped.
annotated_articles = annotated_corpus.loc[(annotated_corpus["Cat7"] == 0) | (annotated_corpus["Cat7"] == 1)]
print(annotated_articles.shape)


Column names for article_corpus:  ['URL' 'HTML' 'TEXT']
Column names for annotated_corpus:  ['Identifier' 'Type' 'Category' 'URL' 'Cat 1' 'Cat2' 'Cat3' 'Cat4' 'Cat5'
 'Cat6' 'Cat7' 'Score' 'First date_time' 'Tweets' 'Likes' 'Retweets'
 'Potential exposure' 'Comment']
(67, 18)


In [7]:
from pandas import ExcelWriter

# Exploratory check on the first article: cut its raw text at the "TEXT:" marker.
# For text laid out as "TITLE: <title> TEXT: <body>", title_text[0] is the
# "TITLE:"-prefixed part and title_text[1] is everything after the marker.
first_article_raw = article_corpus["TEXT"][0]
title_text = first_article_raw.split("TEXT:")

(67, 20)


In [4]:
def create_dataset(corpus_path, annotated_samples):
    """Join the annotated sample with the article corpus on the "URL" column.

    Args:
        corpus_path: Path of the CSV file with the article corpus. Its three
            columns are renamed to "URL", "HTML" and "TEXT".
        annotated_samples: Path of the Excel file with the annotations; the
            column names are read from the second row (``header=1``).

    Returns:
        A DataFrame with one row per annotated article whose "Cat7" value is
        0 or 1, extended with the "HTML" and "TEXT" columns of the corpus.
        Because this is a left join, annotated URLs without a corpus entry are
        kept and get missing values in those two columns.
    """
    corpus = pd.read_csv(corpus_path)
    corpus.columns = ["URL", "HTML", "TEXT"]

    annotations = pd.read_excel(annotated_samples, header=1)
    cat7 = annotations["Cat7"]
    has_binary_label = (cat7 == 0) | (cat7 == 1)
    labelled = annotations.loc[has_binary_label]

    return pd.merge(labelled, corpus, how='left', on='URL')


In [5]:
# Build the merged dataset (annotations + article text) from the files in EXT_DATA_FOLDER.
corpus_path = os.path.join(EXT_DATA_FOLDER, "url_text.csv")
# Path components are passed separately so os.path.join chooses the separator,
# instead of embedding Windows-only backslashes inside one component.
annotated_path = os.path.join(
    EXT_DATA_FOLDER, "Credibility_Analysis_Samples", "September_13", "sample_third_adam_new.xlsx"
)
dataset = create_dataset(corpus_path, annotated_path)
print(dataset.columns.values)

['Identifier' 'Type' 'Category' 'URL' 'Cat 1' 'Cat2' 'Cat3' 'Cat4' 'Cat5'
 'Cat6' 'Cat7' 'Score' 'First date_time' 'Tweets' 'Likes' 'Retweets'
 'Potential exposure' 'Comment' 'HTML' 'TEXT']


In [68]:
# Inspect one sample article: show the URL and text of the row labelled 3,
# followed by the first rows of the merged dataset.
dataset_preview = dataset.head()
print(dataset_preview.loc[3, "URL"])
print(dataset_preview.loc[3, "TEXT"])
print(dataset_preview)

http://triblive.com/news/healthnow/12759008-74/stronger-flu-vaccine-for-elderly-could-help-younger-adults-with-chronic-conditions
TITLE: Stronger flu vaccine for elderly could help younger adults with chronic conditions | TribLIVE
TEXT:     “Persons who have these conditions have a much greater risk of the flu being more severe to the point of needing to be hospitalized,” Dr. Ken Smith, professor of medicine at Pitt and co-author of the paper, told the Tribune-Review on Thursday. “If you are hospitalized with the flu, your risk of dying is certainly something that is a possibility.”  The high-dose vaccine is recommended for the elderly population because their immune response to the standard-dose vaccine lessens as they age. However, the price for a standard dose is about $11, while the stronger vaccine is about $31 per dose, Smith said. He said the dose for the elderly is about 24 percent stronger than a standard vaccine.   “The growing proportion of middle-aged adults with chronic he

In [None]:
#Save dataset locally
# The writer is used as a context manager so the file is written and closed even
# if to_excel raises; ExcelWriter.save() no longer exists in pandas 2.0+.
# sheet_name is passed by keyword because the positional form is deprecated.
with pd.ExcelWriter("dataset.xlsx") as writer:
    dataset.to_excel(writer, sheet_name="Sheet1")

In [25]:
#pre-processing
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize

In [63]:

def _as_object_array(items):
    """Pack `items` into a 1-D NumPy object array, one Python object per slot.

    Articles have different numbers of sentences and sentences have different
    numbers of tokens, so the nested lists are ragged. Calling ``np.array`` on
    ragged nested lists raises on recent NumPy versions; filling a pre-allocated
    object array keeps each nested list intact and always yields a 1-D array.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


# Collect the text and label of every article that has text.
# NOTE(review): the labels are read from "Cat7" although the variable is called
# cat4_scores; the name is kept because later cells reference it.
dataset2 = dataset
count = 0
cat4_scores = []
art_text = []

for index, row in dataset2.iterrows():
    raw_text = row["TEXT"]
    # Detect missing text by value (NaN/None) instead of comparing against the string 'nan'.
    if pd.isna(raw_text):
        print("No text article found for the following url: \n", row["URL"])
        continue
    title_text = str(raw_text)
    # Drop everything up to and including the first "TITLE: " marker. If the marker
    # is absent the whole text is kept (indexing the split result would raise), and a
    # second "TITLE: " inside the article no longer cuts the text short.
    _, marker, remainder = title_text.partition("TITLE: ")
    article_body = remainder if marker else title_text
    count = count + 1
    art_text.append(article_body.replace("TEXT: ", "").strip(" "))
    cat4_scores.append(int(row["Cat7"]))
print(count)

cat4_scores = np.array(cat4_scores)
art_text = np.array(art_text)
# art_text_sent[i] is the list of sentences of article i;
# art_text_word[i] is the list of token lists (one per sentence) of article i.
art_text_sent = _as_object_array([sent_tokenize(article) for article in art_text])
art_text_word = _as_object_array([[word_tokenize(sent) for sent in article] for article in art_text_sent])


No text article found for the following url: 
 http://vaccineinjurynews.com/2017-08-26-the-national-meningitis-association-are-a-front-for-the-vaccine-industry.html
66
Scientists


In [8]:
#dividing into training and testing set

# Text paired with its score in one array. Because NumPy picks a single dtype for
# the pairs, the integer scores are stored as strings here.
cat4_dataset = np.array(list(zip(art_text, cat4_scores)))

# The first `split` fraction of the articles (in dataset order) is the training set
# and the remainder is the testing set.
# NOTE(review): the split is neither shuffled nor stratified; the recorded output
# shows 13 positive articles in the training set but only 1 in the testing set.
split = 0.8
split_index = int(len(art_text) * split)  # computed once so texts and labels are cut at the same place

training_articles = art_text[:split_index]
training_preds = cat4_scores[:split_index]

testing_articles = art_text[split_index:]
testing_preds = cat4_scores[split_index:]

print("===== Training set size ====")
# This line used to be labelled "testing set" although it reports the training set.
print("# of articles in training set: {}".format(len(training_preds)))
print("Number of articles that satisfy this category (=1): {}\n".format(len(training_preds[training_preds == 1])))

print("===== Testing set size ====")
print("# of articles in testing set: {}".format(len(testing_preds)))
print("Number of articles that satisfy this category (=1): {}\n".format(len(testing_preds[testing_preds == 1])))

===== Training set size ====
# of articles in testing set: 52
Number of articles that satisfy this category (=1): 13

===== Testing set size ====
# of articles in testing set: 14
Number of articles that satisfy this category (=1): 1



In [9]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

# scikit-learn no longer ships joblib under sklearn.externals; import the
# standalone package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import numpy as np

random_state = 42

# Display names for the two classes, in label order: 0 -> 'Not Satisfies', 1 -> 'Satisfies'.
categories = ['Not Satisfies', 'Satisfies']
# Passing the labels explicitly keeps the reports aligned with `categories`
# even when one class is absent from the test split or from the predictions.
class_labels = [0, 1]

print("Number of articles: {}".format(len(training_articles)))

docs_test = testing_articles

# Naive Bayes classifier: bag-of-words counts -> tf-idf weights -> multinomial NB
bayes_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())
                      ])
bayes_clf.fit(training_articles, training_preds)
# Persist the fitted pipeline (vectoriser + classifier) to disk.
joblib.dump(bayes_clf, "naive_bayes.pkl", compress=9)

# Predict the test dataset using Naive Bayes
predicted = bayes_clf.predict(docs_test)
print('Naive Bayes correct prediction: {:4.2f}'.format(np.mean(predicted == testing_preds)))
print(metrics.classification_report(testing_preds, predicted, labels=class_labels, target_names=categories))

# Support Vector Machine (SVM) classifier: linear SVM trained with SGD (hinge loss).
# `n_iter` has been removed from SGDClassifier; max_iter=5 with tol=None runs
# exactly five epochs, which is what n_iter=5 did. The shared `random_state`
# variable is used instead of a second literal 42.
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                          max_iter=5, tol=None, random_state=random_state)),
                    ])
svm_clf.fit(training_articles, training_preds)
joblib.dump(svm_clf, "svm.pkl", compress=9)
# Predict the test dataset using SVM
predicted = svm_clf.predict(docs_test)
print('SVM correct prediction: {:4.2f}'.format(np.mean(predicted == testing_preds)))
print(metrics.classification_report(testing_preds, predicted, labels=class_labels, target_names=categories))

# Confusion matrix of the SVM predictions (rows: true label, columns: predicted label).
print(metrics.confusion_matrix(testing_preds, predicted, labels=class_labels))

Number of articles: 52
Naive Bayes correct prediction: 0.93
               precision    recall  f1-score   support

Not Satisfies       0.93      1.00      0.96        13
    Satisfies       0.00      0.00      0.00         1

  avg / total       0.86      0.93      0.89        14



  'precision', 'predicted', average, warn_for)


SVM correct prediction: 0.93
               precision    recall  f1-score   support

Not Satisfies       0.93      1.00      0.96        13
    Satisfies       0.00      0.00      0.00         1

  avg / total       0.86      0.93      0.89        14

[[13  0]
 [ 1  0]]


  'precision', 'predicted', average, warn_for)


In [68]:
# Leftover debugging cell.
# NOTE(review): `twenty_train` and `twenty_test` are not defined in any cell visible
# in this notebook, so this cell presumably fails on a fresh kernel. They look like
# remnants of a 20-newsgroups example (fetch_20newsgroups is imported above) -- confirm
# and remove or define them.
# NOTE(review): the label below says "twent_test.data" but the value printed is the
# type of `twenty_train.target`.
print("type of twent_test.data: ", type(twenty_train.target))
print(len(art_text))
print(twenty_test.target)

type of twent_test.data:  <class 'numpy.ndarray'>
66
[2 2 2 ... 2 2 1]
