# Logistic Regression test

In [3]:
#setup
import pandas as pd
import numpy as np
import math
import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings
import random
from string import punctuation
import seaborn as sns

from PIL import Image

# warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score
%matplotlib inline

# Set default plot size
plt.rcParams["figure.figsize"] = (15,8)

## Loading data
The data is made up of the title of the article, its content as well as subject and date. I will also be adding an authenticity column which would act as the label of the datapoint.

In [6]:
real = pd.read_csv("./News_dataset/True.csv")
fake = pd.read_csv("./News_dataset/Fake.csv")
fake['Authenticity'] = 'Fake'
real['Authenticity'] = 'Real'
news_data = fake.append(real)
news_data.head()

Unnamed: 0,title,text,subject,date,Authenticity
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",Fake


## Data Preprocessing 
We first have to preprocess the articles by tokenizing the text and removing stopwords (which are words that occur in abundance and thus providing no real information or pattern).


In [7]:
sw = stopwords.words('english')

# Adding any form of inverted commas into the stop words list

new_words=('’','“', '”')
for i in new_words:
    sw.append(i)
    
# Conversion of text in the article to lower case
news_data['text'] = news_data['text'].str.lower()

# Tokenizing 
news_data['tokenized_text'] = news_data['text'].apply(word_tokenize)

# Remove stopwords
news_data['filtered_text'] = news_data['tokenized_text'].apply(lambda x: [item for item in x if item not in sw])

# Remove punctuations
news_data['filtered_text'] = news_data['filtered_text'].apply(lambda x: [item for item in x if item not in punctuation])

# Check results
print(len(news_data['text'].iloc[0]),
      len(news_data['tokenized_text'].iloc[0]),
      len(news_data['filtered_text'].iloc[0]))

2893 599 287


In [12]:
news_data.head()

Unnamed: 0,title,text,subject,date,Authenticity,tokenized_text,filtered_text
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump just couldn t wish all americans ...,News,"December 31, 2017",Fake,"[donald, trump, just, couldn, t, wish, all, am...","[donald, trump, wish, americans, happy, new, y..."
1,Drunk Bragging Trump Staffer Started Russian ...,house intelligence committee chairman devin nu...,News,"December 31, 2017",Fake,"[house, intelligence, committee, chairman, dev...","[house, intelligence, committee, chairman, dev..."
2,Sheriff David Clarke Becomes An Internet Joke...,"on friday, it was revealed that former milwauk...",News,"December 30, 2017",Fake,"[on, friday, ,, it, was, revealed, that, forme...","[friday, revealed, former, milwaukee, sheriff,..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"on christmas day, donald trump announced that ...",News,"December 29, 2017",Fake,"[on, christmas, day, ,, donald, trump, announc...","[christmas, day, donald, trump, announced, wou..."
4,Pope Francis Just Called Out Donald Trump Dur...,pope francis used his annual christmas day mes...,News,"December 25, 2017",Fake,"[pope, francis, used, his, annual, christmas, ...","[pope, francis, used, annual, christmas, day, ..."


## Training
For training, we'll now use the TfidVectorizer module from sklearn in order to obtain the term frequency of each relevant word in the article. The term frequency is the ratio of the number of times a word appears in the article compared to the length of words of the same article.

In [8]:
vectorizer = TfidfVectorizer(stop_words=sw, lowercase=True)
X = vectorizer.fit_transform(news_data.text)
Y = news_data.Authenticity

In [34]:
print(X.shape)
print(Y.shape)

(44898, 121858)
(44898,)


We can now then split the data 75 - 25 for training and testing purposes.

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25)
print(X_train.shape)
print(Y_train.shape)
print(X_train)

(33673, 121858)
(33673,)
  (0, 115421)	0.03976084339600614
  (0, 28642)	0.11670481898365798
  (0, 70816)	0.03890160632788599
  (0, 59293)	0.03976084339600614
  (0, 48211)	0.03286149029326648
  (0, 20961)	0.11281795669619066
  (0, 40312)	0.03819955852506085
  (0, 91247)	0.03396923953821837
  (0, 38965)	0.03709180928010896
  (0, 44521)	0.07418361856021792
  (0, 89339)	0.034671287341043515
  (0, 90685)	0.03553052440916366
  (0, 12933)	0.0325532776603638
  (0, 638)	0.0349369514494997
  (0, 319)	0.03553052440916366
  (0, 114908)	0.03396923953821837
  (0, 82313)	0.037605985565396884
  (0, 1439)	0.06884555032842357
  (0, 70989)	0.034422775164211786
  (0, 71429)	0.034671287341043515
  (0, 77726)	0.10566693532878293
  (0, 56655)	0.03418933385704132
  (0, 117507)	0.03663827365411555
  (0, 89337)	0.02896621751330449
  (0, 82103)	0.030991992789418506
  :	:
  (33672, 109097)	0.05264895224820117
  (33672, 78196)	0.05403476680100724
  (33672, 108252)	0.051945747698567135
  (33672, 28118)	0.2047403616

With the training data obtained, we then train our Logistic Regression Model

In [23]:
model = LogisticRegression()
model.fit(X_train, Y_train)

LogisticRegression()

In [24]:
model.score(X_test, Y_test)

0.9861915367483296

We can also experiment with a KNN model instead

In [36]:
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

KNeighborsClassifier()

In [37]:
knn.score(X_test, Y_test)

0.6291314031180401

In [40]:
svm = SVC()
svm.fit(X_train, Y_train)

SVC()

In [41]:
svm.score(X_test, Y_test)

0.9927839643652562

## Using Sublinear TF
Alternatively, we can also use sublineaer TF (replacing tf with 1 + log(tf)) as the weights for training. This can be easily done by configuring the vectorizer to obtain an idf weighting instead.

In [26]:
sub_vectorizer = vectorizer = TfidfVectorizer(stop_words=sw, lowercase=True, sublinear_tf=True)
X_sub = vectorizer.fit_transform(news_data.text)
Y_sub = news_data.Authenticity

In [27]:
print(X_sub.shape)
print(Y_sub.shape)

(44898, 121858)
(44898,)


In [28]:
X_sub_train, X_sub_test, Y_sub_train, Y_sub_test = train_test_split(X,Y, test_size=0.25)
print(X_sub_train.shape)
print(Y_sub_train.shape)

(33673, 121858)
(33673,)


In [30]:
model_sub = LogisticRegression()
model_sub.fit(X_sub_train, Y_sub_train)
model_sub.score(X_sub_test, Y_sub_test)

0.9895768374164811

In [42]:
knn_sub = KNeighborsClassifier()
knn_sub.fit(X_sub_train, Y_sub_train)
knn_sub.score(X_sub_test, Y_sub_test)

0.5371937639198219

In [43]:
svm_sub = SVC()
svm_sub.fit(X_sub_train, Y_sub_train)
svm_sub.score(X_sub_test, Y_sub_test)

0.9946547884187082

## Cross Validation
Getting the scores with cross validation

In [10]:
cv_log = LogisticRegression()
cv_knn = KNeighborsClassifier()
cv_svm = SVC()

In [15]:
log_scores = cross_validate(cv_log, X, Y, cv=5)
print(log_scores)

{'fit_time': array([1.45313096, 1.93981576, 1.59972715, 2.07445765, 1.53589416]), 'score_time': array([0.02092838, 0.0249331 , 0.019943  , 0.0179503 , 0.0149951 ]), 'test_score': array([0.98381589, 0.98530067, 0.98515219, 0.98485298, 0.98574399])}


In [17]:
knn_scores = cross_validate(cv_knn, X, Y, cv=5)
print(knn_scores)

{'fit_time': array([0.08078218, 0.0758009 , 0.09075761, 0.07579613, 0.08976841]), 'score_time': array([23.96076345, 23.08591866, 20.89121675, 21.25639606, 22.58120608]), 'test_score': array([0.55868597, 0.54988864, 0.56937639, 0.61231763, 0.60351932])}


In [18]:
svm_scores = cross_validate(cv_svm, X, Y, cv=5)
print(svm_scores)

KeyboardInterrupt: 

In [None]:
log_scores.head()

In [None]:
svm_scores.head()

In [None]:
knn_scores.head()