Machine Learning Test 

# Imports

In [1]:
import matplotlib
%matplotlib inline
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate

# Datasets

## Full Dataset

In [2]:
df = pd.read_csv('~/Desktop/fake_news_data/dataset.csv')

In [3]:
df.head()
del df['Unnamed: 0']

In [4]:
df.head()

Unnamed: 0,title,text,label
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,0
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",0
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,0
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,0
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",0


## Sample Dataset

In [5]:
df_sample = df.sample(frac=0.02, random_state=3)

In [6]:
df_sample.shape

(1299, 3)

In [7]:
df_sample['label'].value_counts()

0    711
1    588
Name: label, dtype: int64

In [8]:
df_sample = df_sample.reset_index(drop=True)

# X and y

In [9]:
y = df_sample['label']

In [10]:
y.head()

0    1
1    0
2    0
3    0
4    1
Name: label, dtype: int64

In [11]:
X = df_sample.drop('label',axis=1)

In [12]:
X.head()

Unnamed: 0,title,text
0,"An open letter from Vladislav Krasnov, Ph.D. a...","Mon, 24 Oct 2016 00:00 UTC © Fitzgerald Griffi..."
1,U.S. preparing plan to draw down embassy staff...,WASHINGTON (Reuters) - The United States is cr...
2,Forgotten No Longer: Pennsylvania Breitbart Re...,Several Breitbart News Daily SiriusXM listener...
3,German minister warns on Washington trip again...,WASHINGTON (Reuters) - Germany’s foreign minis...
4,HOLY MOLY! TRUMP GIVES EPIC News Conference…SL...,"During Trump s press conference today, Jim Aco..."


In [13]:
!tree

[01;34m.[00m
└── mltest.ipynb

0 directories, 1 file


# Preprocessing

In [14]:
def clean(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')  # Remove Punctuation
    lowercased = text.lower()  # Lower Case
    tokenized = word_tokenize(lowercased)  # Tokenize
    words_only = [word for word in tokenized if word.isalpha()
                  ]  # Remove numbers
    stop_words = set(stopwords.words('english'))  # Make stopword list
    # Remove Stop Words
    without_stopwords = [word for word in words_only if not word in stop_words]
    lemma = WordNetLemmatizer()  # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word)
                  for word in without_stopwords]  # Lemmatize
    return lemmatized

In [15]:
df_sample_text = df_sample['text'].apply(lambda text: clean(text))

In [16]:
df_sample_text.head(20)

0     [mon, oct, utc, fitzgerald, griffin, foundatio...
1     [washington, reuters, united, state, crafting,...
2     [several, breitbart, news, daily, siriusxm, li...
3     [washington, reuters, germany, foreign, minist...
4     [trump, press, conference, today, jim, acosta,...
5     [drive, southwest, interstate, eating, barbecu...
6     [two, week, ago, middlebury, college, charles,...
7     [liz, heron, executive, editor, huffington, po...
8     [dems, hammer, republican, planned, vote, allo...
9     [madrid, reuters, spanish, supreme, court, jud...
10    [washington, reuters, u, congress, friday, cer...
11    [washington, reuters, japanese, first, lady, a...
12    [jeb, bush, suspends, campaign, honorable, thi...
13    [united, nation, reuters, chinese, foreign, mi...
14    [posted, october, frank, scott, greatest, obje...
15    [un, allegedly, nixed, report, predicted, rohi...
16    [reuters, oklahoma, republican, governor, mary...
17    [place, ever, overseas, place, overseas, m

In [17]:
df_sample_text_joined = df_sample_text.apply(lambda x: " ".join(x))

In [18]:
df_sample_text_joined.head()

0    mon oct utc fitzgerald griffin foundation russ...
1    washington reuters united state crafting plan ...
2    several breitbart news daily siriusxm listener...
3    washington reuters germany foreign minister wa...
4    trump press conference today jim acosta cnn bo...
Name: text, dtype: object

# Vectorizers

## Count Vectorizer

In [19]:
cv = CountVectorizer(max_features=10000)

In [20]:
X_cv = cv.fit_transform(df_sample_text_joined).todense()

In [21]:
X_cv.shape

(1299, 10000)

## TFidf Vectorizer

In [22]:
## TFidf Vectorizer
tfidf_vec=TfidfVectorizer(max_features=10000,ngram_range=(1,3))

In [23]:
X_tfidf = tfidf_vec.fit_transform(df_sample_text_joined).toarray()

In [24]:
X_tfidf.shape

(1299, 10000)

## HashingVectorizer

In [25]:
hs_vectorizer = HashingVectorizer(n_features=10000,non_negative=True)

In [26]:
X_hs = hs_vectorizer.fit_transform(df_sample_text_joined).toarray()



In [27]:
X_hs.shape

(1299, 10000)

# Train Test Set

In [28]:
## Divide the dataset into Train and Test CountVectorizer
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, y, test_size=0.3, random_state=0)

In [29]:
## Divide the dataset into Train and Test TFidf Vectorizer
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.3, random_state=0)

In [30]:
## Divide the dataset into Train and Test Hashing Vecotrizer
X_train_hs, X_test_hs, y_train_hs, y_test_hs = train_test_split(X_hs, y, test_size=0.3, random_state=0)

# Models

## Baseline Model

In [31]:

# Baseline model
baseline_model = DummyClassifier(strategy="most_frequent")

# Cross validate model
baseline_cv = cross_validate(baseline_model, X, y, cv=10)

# Baseline Accuracy
score_baseline = baseline_cv['test_score'].mean()
score_baseline

0.5473470437486059

## DecsisionTreeClassifier with CountVectorizer

In [32]:
dtc = DecisionTreeClassifier(criterion='entropy')
dtc.fit(X_train_cv, y_train_cv)
y_pred_cv = dtc.predict(X_test_cv)

In [33]:
confusion_matrix(y_test_cv, y_pred_cv)

array([[170,  45],
       [ 29, 146]])

In [34]:
score_decisionTree = metrics.accuracy_score(y_test_cv, y_pred_cv)
print("accuracy:   %0.3f" % score_decisionTree)

accuracy:   0.810


## MultinomialNB with Tfidf Vectorizer

In [35]:
classifier = MultinomialNB()

In [36]:
classifier.fit(X_train_tfidf, y_train_tfidf)
pred_tfidf = classifier.predict(X_test_tfidf)

In [37]:
confusion_matrix(y_test_tfidf, pred_tfidf)

array([[188,  27],
       [ 49, 126]])

In [38]:
score_MNB_tfidf = metrics.accuracy_score(y_test_tfidf, pred_tfidf)
print("accuracy:   %0.3f" % score_MNB_tfidf)

accuracy:   0.805


## Passive Aggressive Classifier with Tfidf Vectorizer

In [39]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)

In [40]:
linear_clf.fit(X_train_tfidf, y_train_tfidf)
pred_tfidf = linear_clf.predict(X_test_tfidf)



In [41]:
confusion_matrix(y_test_tfidf, pred_tfidf)

array([[188,  27],
       [ 23, 152]])

In [42]:
score_pac = metrics.accuracy_score(y_test_tfidf, pred_tfidf)
print("accuracy:   %0.3f" % score_pac)

accuracy:   0.872


## Multinomial Classifier (Hyperparameter) with Tfidf Vecotrizer

In [43]:
classifier=MultinomialNB(alpha=0.1)

In [44]:
previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train_tfidf,y_train_tfidf)
    y_pred=sub_classifier.predict(X_test_tfidf)
    score = metrics.accuracy_score(y_test_tfidf, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))

Alpha: 0.0, Score : 0.8205128205128205
Alpha: 0.1, Score : 0.823076923076923
Alpha: 0.2, Score : 0.8282051282051283
Alpha: 0.30000000000000004, Score : 0.8205128205128205
Alpha: 0.4, Score : 0.823076923076923
Alpha: 0.5, Score : 0.8307692307692308
Alpha: 0.6000000000000001, Score : 0.8256410256410256


  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.7000000000000001, Score : 0.823076923076923
Alpha: 0.8, Score : 0.8128205128205128
Alpha: 0.9, Score : 0.8025641025641026


## MultinomialNB with Hashing Vectorizer

In [45]:
classifier = MultinomialNB()

In [46]:
classifier.fit(X_train_tfidf, y_train_tfidf)
pred = classifier.predict(X_test_tfidf)
score_mnb_hv = metrics.accuracy_score(y_test_tfidf, pred)
print("accuracy:   %0.3f" % score_mnb_hv)

accuracy:   0.805


# Overview

In [54]:
print(f'Baseline score is {score_baseline}')
print(f'DecisionTreeClassifier with CountVectorizer score is {score_decisionTree}')
print(f'MultinomialNB with Tfidf Vectorizer score is {score_MNB_tfidf}')
print(f'Passive Agressive Classifier with Tfidf Vecotizer score is {score_pac}')
print("Multinomial Classifier Hyperparameter with Tfidf Vectorizer best score is with alpha: 0.5, 0.8307692307692308")
print(f'MultinomialNB with Hashing Vecotrizer score is {score_mnb_hv}')

Baseline score is 0.5473470437486059
DecisionTreeClassifier with CountVectorizer score is 0.8102564102564103
MultinomialNB with Tfidf Vectorizer score is 0.8051282051282052
Passive Agressive Classifier with Tfidf Vecotizer score is 0.8717948717948718
Multinomial Classifier Hyperparameter with Tfidf Vectorizer best score is with alpha: 0.5, 0.8307692307692308
MultinomialNB with Hashing Vecotrizer score is 0.8051282051282052
