In [44]:
import os
import re #for searching words in a text

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords #get rid of words with less value (articles, etc.) 
from nltk.stem.porter import PorterStemmer # important aid for stemming 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer # convert text to feature vectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier

Data Pre-processing

1 : Fake News

0 : Real News

In this exercise we use the combination of author and title as the input text for training our models.

In [45]:
# Location of the training CSV. The default is the original local path; set the
# TRAIN_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
TRAIN_CSV_PATH = os.environ.get('TRAIN_CSV_PATH', 'C:/Users/glama/Machine_Learning_Assignment2/train.csv')
train_news = pd.read_csv(TRAIN_CSV_PATH)

In [46]:
# Preview the loaded frame; pandas abbreviates long frames to their leading and trailing rows.
train_news

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [47]:
# Count missing values per column to decide how they should be handled.
train_news.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [48]:
# Replace missing values with empty strings, but only in the free-text columns.
# A blanket fillna('') would also turn a missing id/label into '' and silently
# change that column's dtype; restricting the fill keeps such problems visible.
train_news = train_news.fillna({'title': '', 'author': '', 'text': ''})

In [49]:
# Count rows whose article text repeats an earlier row.
# Missing texts were replaced by '' above, so they also count as duplicates of one another.
train_news.duplicated(subset=["text"]).sum()

413

In [50]:
# Keep the first occurrence of every article text and drop later repeats.
# `subset` is passed as a list: pandas documents it as a column label or a
# sequence of labels, and an unordered set is not a sequence.
train_news = train_news.drop_duplicates(subset=["text"], keep='first')
train_news.shape

(20387, 5)

In [51]:
# Build the model's input text: author and title joined by a single space.
train_news['author_title_concat'] = train_news['author'] + ' ' + train_news['title'] 

Stemming: the process of reducing a word to its root form, so that variants of the same word map to a single feature. This should improve the performance of our models.

In [52]:
# Built once instead of once per token: loading the stop-word list and
# constructing a stemmer are loop-invariant, and a set gives O(1) membership tests.
# NLTK ships the list under the lowercase file id 'english'; the capitalised
# spelling presumably only resolves on case-insensitive file systems.
# Requires the NLTK 'stopwords' corpus to be installed (nltk.download('stopwords')).
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def stemm(author_title_concat):
    """Clean and stem a piece of text.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stop words are
    removed, and each remaining token is reduced to its Porter stem.

    Parameters
    ----------
    author_title_concat : str
        Raw text (here: author name followed by the article title).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; '' if no token survives.
    """
    # Keep only alphabetic characters; digits, punctuation and symbols become spaces.
    letters_only = re.sub('[^a-zA-Z]', ' ', author_title_concat)
    # Lower-case first so the stop-word comparison is case-insensitive.
    tokens = letters_only.lower().split()
    return ' '.join(_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS)

In [53]:
# Replace the raw author+title text with its cleaned, stemmed form.
# The column is overwritten, so re-running this cell stems already-stemmed text.
train_news['author_title_concat'] = train_news['author_title_concat'].apply(stemm)

In [54]:
# Separate the target column from the rest of the frame.
only_labels = train_news['label']
train_no_label = train_news.drop(columns='label')

# Plain arrays for scikit-learn: the target, and the stemmed author+title text
# (the only feature used).
values_labels = only_labels.values
values_text = train_no_label['author_title_concat'].values

In [55]:
# Convert the stemmed text into TF-IDF feature vectors.
# The input is read from `train_no_label` rather than from `values_text`, so the
# cell can be re-run: it no longer consumes the variable it overwrites.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# below, so its vocabulary and IDF weights are learned from the test rows too.
# Consider splitting first and fitting on the training rows only.
vectorizer = TfidfVectorizer()
values_text = vectorizer.fit_transform(train_no_label['author_title_concat'].values)

Train and Test data split

In [56]:
# Hold out 20% of the rows for testing.
#   stratify=values_labels -> stratified split using the class labels
#   random_state=6         -> same split on every run
(values_text_train,
 values_text_test,
 values_labels_train,
 values_labels_test) = train_test_split(
    values_text,
    values_labels,
    test_size=0.2,
    stratify=values_labels,
    random_state=6,
)

Logistic Regression Training:

In [57]:
# Fit a logistic-regression classifier, with scikit-learn's default settings, on the training split.
LogReg_model = LogisticRegression() 
LogReg_model.fit(values_text_train, values_labels_train)

LogisticRegression()

In [58]:
# Accuracy on the rows the model was fitted on.
news_train_pred = LogReg_model.predict(values_text_train)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
training_accuracy = accuracy_score(values_labels_train, news_train_pred)
print(f'Logistic Regression Training Accuracy: {training_accuracy}')

Logistic Regression Training Accuracy: 0.9866331473419584


Logistic Regression Test:

In [59]:
# Accuracy on the held-out test split.
news_test_pred = LogReg_model.predict(values_text_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
testing_accuracy = accuracy_score(values_labels_test, news_test_pred)
print(f'Logistic Regression Testing Accuracy: {testing_accuracy}')

Logistic Regression Testing Accuracy: 0.976213830308975


In [60]:
# Confusion matrix for logistic regression on the test split.
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 = real, 1 = fake). With "fake" (1) as the positive class, the
# flattened matrix therefore reads TN, FP, FN, TP.
c_matrix_lr = confusion_matrix(values_labels_test, news_test_pred)

Tr_Neg, F_Pos, F_Neg, Tr_Pos = c_matrix_lr.ravel()

print(c_matrix_lr)
print('Logistic Regression Testing Acc.: = ', (Tr_Pos + Tr_Neg)/(Tr_Pos + Tr_Neg + F_Pos + F_Neg))

[[1995   83]
 [  14 1986]]
Logistic Regression Testing Acc.: =  0.976213830308975


Decision Tree Classifier Training:

In [61]:
# Fit a decision tree on the training split, splitting on information gain
# (entropy). random_state is fixed so the fitted tree is reproducible.
tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
tree.fit(values_text_train, values_labels_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [62]:
# Accuracy on the rows the tree was fitted on.
news_train_pred_1 = tree.predict(values_text_train)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
training_accuracy_1 = accuracy_score(values_labels_train, news_train_pred_1)
print(f'Decision Tree Classifier Training Accuracy: {training_accuracy_1}')

Decision Tree Classifier Training Accuracy: 1.0


Decision Tree Classifier Test:

In [63]:
# Accuracy on the held-out test split.
news_test_pred_1 = tree.predict(values_text_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
testing_accuracy_1 = accuracy_score(values_labels_test, news_test_pred_1)
print(f'Decision Tree Classifier Testing Accuracy: {testing_accuracy_1}')

Decision Tree Classifier Testing Accuracy: 0.9931338891613536


In [64]:
# Confusion matrix for the decision tree on the test split.
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 = real, 1 = fake). With "fake" (1) as the positive class, the
# flattened matrix therefore reads TN, FP, FN, TP.
c_matrix_dt = confusion_matrix(values_labels_test, news_test_pred_1)

Tr_Neg, F_Pos, F_Neg, Tr_Pos = c_matrix_dt.ravel()

print(c_matrix_dt)
print('Decision Tree Classifier Testing Acc.: = ', (Tr_Pos + Tr_Neg)/(Tr_Pos + Tr_Neg + F_Pos + F_Neg))

[[2062   16]
 [  12 1988]]
Decision Tree Classifier Testing Acc.: =  0.9931338891613536


Random Forest Classifier Training:

In [65]:
# Fit a random forest of 10 entropy-based trees on the training split.
# random_state is fixed so the fitted forest is reproducible.
forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
forest.fit(values_text_train, values_labels_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [66]:
# Accuracy on the rows the forest was fitted on.
news_train_pred_2 = forest.predict(values_text_train)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
training_accuracy_2 = accuracy_score(values_labels_train, news_train_pred_2)
print(f'Random Forest Classifier Training Accuracy: {training_accuracy_2}')

Random Forest Classifier Training Accuracy: 0.9995707891348335


Random Forest Classifier Testing:

In [67]:
# Accuracy on the held-out test split.
news_test_pred_2 = forest.predict(values_text_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
testing_accuracy_2 = accuracy_score(values_labels_test, news_test_pred_2)
print(f'Random Forest Classifier Testing Accuracy: {testing_accuracy_2}')

Random Forest Classifier Testing Accuracy: 0.986758214811182


In [68]:
# Confusion matrix for the random forest on the test split.
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 = real, 1 = fake). With "fake" (1) as the positive class, the
# flattened matrix therefore reads TN, FP, FN, TP.
c_matrix_rf = confusion_matrix(values_labels_test, news_test_pred_2)

Tr_Neg, F_Pos, F_Neg, Tr_Pos = c_matrix_rf.ravel()

print(c_matrix_rf)
print('Random Forest Classifier Testing Acc.: = ', (Tr_Pos + Tr_Neg)/(Tr_Pos + Tr_Neg + F_Pos + F_Neg))

[[2048   30]
 [  24 1976]]
Random Forest Classifier Testing Acc.: =  0.986758214811182
