### FAKE NEWS DETECTION ###

## IMPORT LIBRARIES 

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLP libraries to clean the text data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Vectorization technique TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# For Splitting the dataset
from sklearn.model_selection import train_test_split

# Model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
#Accuracy measuring library
from sklearn.metrics import accuracy_score,precision_score,recall_score

## Loading the Data 

In [5]:
data = pd.read_csv("data.csv")

In [6]:
data.shape

(4009, 4)

In [7]:
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [8]:
data.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

In [9]:
data.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

### Data-Preprocessing 

In [10]:
df = data.copy()

### Removing the null values 

In [11]:
df['Body'] = df['Body'].fillna('') 

In [12]:
df.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

### Adding a new column

In [13]:
df['News'] = df['Headline']+df['Body'] #combine heaadline +body

In [14]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald TrumpImag...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tributeCou...


In [15]:
df.columns

Index(['URLs', 'Headline', 'Body', 'Label', 'News'], dtype='object')

### Drop features that are not needed

In [16]:
features_dropped = ['URLs','Headline','Body']
df = df.drop(features_dropped, axis =1)

In [17]:
df.columns

Index(['Label', 'News'], dtype='object')

### Text Processing 

In [18]:
ps = PorterStemmer()
def wordopt(text): 
    text = re.sub('[^a-zA-Z]', ' ',text)   #to remove symbols , stops words, steeming 
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    return text

In [19]:
df['News'] = df['News'].apply(wordopt)

In [20]:
df.head()

Unnamed: 0,Label,News
0,1,four way bob corker skewer donald trumpimag co...
1,1,linklat war veteran comedi speak modern americ...
2,1,trump fight corker jeopard legisl agendath feu...
3,1,egypt cheiron win tie pemex mexican onshor oil...
4,1,jason aldean open snl vega tributecountri sing...


### Splitting Data set 

In [21]:
X = df['News']
Y = df['Label']
#Split the data into training and test set
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

### vectorization 
### This is used to handle our text data, by converting it into vectors.


In [22]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)


### Model Fitting
 will be fitting my data onto 3 classifications models

Logistic Regression
,SVM
,RandomForestClassifier

In [41]:
#1. Logistic Regression - used because this model is best suited for binary classification
LR_model = LogisticRegression()

#Fitting training set to the model
LR_model.fit(xv_train,y_train)

#Predicting the test set results based on the model
lr_y_pred = LR_model.predict(xv_test)

#Calculate the accurracy of this model
score = accuracy_score(y_test,lr_y_pred)
score1= precision_score(y_test,lr_y_pred)
score2= recall_score(y_test,lr_y_pred)

print('Accuracy of LR model is ', score)
print('precision of LR model is ', score1)
print('recall of LR model is ', score2)

Accuracy of LR model is  0.9680957128614157
precision of LR model is  0.9482758620689655
recall of LR model is  0.9821428571428571


In [40]:
#2. Support Vector Machine(SVM) - SVM works relatively well when there is a clear margin of separation between classes.
svm_model = SVC(kernel='linear')

#Fitting training set to the model
svm_model.fit(xv_train,y_train)

#Predicting the test set results based on the model
svm_y_pred = svm_model.predict(xv_test)

#Calculate the accuracy score of this model
score = accuracy_score(y_test,svm_y_pred)
score1= precision_score(y_test,svm_y_pred)
score2= recall_score(y_test,svm_y_pred)

print('Accuracy of SVM model is ', score)
print('precision of SVM model is ', score1)
print('recall of SVM model is ', score2)

Accuracy of SVM model is  0.9840478564307079
precision of SVM model is  0.9736842105263158
recall of SVM model is  0.9910714285714286


In [39]:
#3. Random Forest Classifier 
RFC_model = RandomForestClassifier(random_state=0)

#Fitting training set to the model
RFC_model.fit(xv_train, y_train)

#Predicting the test set results based on the model
rfc_y_pred = RFC_model.predict(xv_test)

#Calculate the accuracy score of this model
score = accuracy_score(y_test,rfc_y_pred)
score1= precision_score(y_test,rfc_y_pred)
score2= recall_score(y_test,rfc_y_pred)

print('Accuracy of RFC model is ', score)
print('precision of RFC model is ', score1)
print('recall of RFC model is ', score2)

Accuracy of RFC model is  0.9651046859421735
precision of RFC model is  0.9347368421052632
recall of RFC model is  0.9910714285714286


In [37]:
#4. Voting classifier
vc_model=VotingClassifier([('clf1',LR_model),('clf2',RFC_model),('clf3',svm_model)])

#Fitting training set to the model
vc_model.fit(xv_train, y_train)

#Predicting the test set results based on the model
vc_y_pred = vc_model.predict(xv_test)

#Calculate the accuracy score of this model
score = accuracy_score(y_test,vc_y_pred)
score1= precision_score(y_test,vc_y_pred)
score2= recall_score(y_test,vc_y_pred)
print('Accuracy of vc model is ', score)
print('precision of vc model is ', score1)
print('recall of vc model is ', score2)

Accuracy of vc model is  0.9780658025922233
precision of vc model is  0.959051724137931
recall of vc model is  0.9933035714285714


In [38]:
#5. Bagging classifier (decision trees)
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
dt_model=DecisionTreeClassifier()

bgg_model=BaggingClassifier(DecisionTreeClassifier())

#Fitting training set to the model
bgg_model.fit(xv_train, y_train)
dt_model.fit(xv_train, y_train)
#Predicting the test set results based on the model
bgg_y_pred = bgg_model.predict(xv_test)
dt_y_pred=dt_model.predict(xv_test)
#Calculate the accuracy score of this model
score = accuracy_score(y_test,bgg_y_pred)
score1= precision_score(y_test,bgg_y_pred)
score2= recall_score(y_test,bgg_y_pred)

score11 = accuracy_score(y_test,dt_y_pred)
score12= precision_score(y_test,dt_y_pred)
score13= recall_score(y_test,dt_y_pred)

print('Accuracy of bgg model is ', score)
print('precision of bgg model is ', score1)
print('recall of bgg model is ', score2)

print('Accuracy of dt model is ', score11)
print('precision of dt model is ', score12)
print('recall of dt model is ', score13)

Accuracy of bgg model is  0.9521435692921236
precision of bgg model is  0.9504504504504504
recall of bgg model is  0.9419642857142857
Accuracy of dt model is  0.9431704885343968
precision of dt model is  0.9413092550790068
recall of dt model is  0.9308035714285714


In [36]:
#6. Gradient Boosting 
from sklearn.ensemble import GradientBoostingClassifier


gbc_model=GradientBoostingClassifier()

#Fitting training set to the model
gbc_model.fit(xv_train, y_train)

#Predicting the test set results based on the model
gbc_y_pred = gbc_model.predict(xv_test)

#Calculate the accuracy score of this model
score = accuracy_score(y_test,gbc_y_pred)
score1= precision_score(y_test,gbc_y_pred)
score2= recall_score(y_test,gbc_y_pred)
print('Accuracy of gbc model is ', score)
print('precision of gbc model is ', score1)
print('recall of gbc model is ', score2)

Accuracy of gbc model is  0.9710867397806581
precision of gbc model is  0.9584245076586433
recall of gbc model is  0.9776785714285714
