# FAKE NEWS DETECTION

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# NLP libraries to clean the text data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [3]:
# Vectorization technique TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# For Splitting the dataset
from sklearn.model_selection import train_test_split

In [5]:
# Model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Evaluation metric (accuracy)
from sklearn.metrics import accuracy_score

# Loading Data

In [7]:
# Read the raw article table into a DataFrame.
# NOTE(review): the file name contains a space before ".csv" — confirm it matches the file on disk.
data = pd.read_csv(filepath_or_buffer='data .csv')

In [8]:
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


# Exploratory Data Analysis (EDA)

In [9]:
data.shape

(4009, 4)

In [10]:
data.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

1. URLs: This column contains the URLs of the news articles. It provides the source information and allows researchers to analyze the credibility of different news sources.
2. Headline: The headline column contains the titles or summaries of the news articles. Headlines are often the first piece of information readers encounter, making them crucial for detecting potential misinformation.
3. Body: The body column contains the main text or content of the news articles. It includes the full articles or relevant excerpts, providing ample textual data for analysis.
4. Label: The label column indicates the classification or ground truth of each article. It assigns a binary value (e.g., 1 for real news and 0 for fake news) to indicate whether the article is genuine or deceptive. This column serves as the target variable for training and evaluating fake news detection models.

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URLs      4009 non-null   object
 1   Headline  4009 non-null   object
 2   Body      3988 non-null   object
 3   Label     4009 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 125.4+ KB


In [12]:
data.isna().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

In [13]:
# Replace missing article bodies with empty strings (keep every non-null value as is).
data['Body'] = data['Body'].where(data['Body'].notna(), '')

In [14]:
data.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

In [15]:
# Combine headline and body into one text field, separated by a single space.
data['News'] = data['Headline'].add(" ").add(data['Body'])

In [16]:
data.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald Trump Ima...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [17]:
data.columns

Index(['URLs', 'Headline', 'Body', 'Label', 'News'], dtype='object')

In [18]:
# Columns that are not used once the combined 'News' text exists.
features_dropped = ['URLs', 'Headline', 'Body']
data = data.drop(columns=features_dropped)

In [19]:
data.columns

Index(['Label', 'News'], dtype='object')

In [20]:
data.head()

Unnamed: 0,Label,News
0,1,Four ways Bob Corker skewered Donald Trump Ima...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...


### Preprocessing

In [21]:
ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') for
# every token rebuilt the whole word list per word and then searched it
# linearly; a frozenset gives constant-time membership tests instead.
STOP_WORDS = frozenset(stopwords.words('english'))


def wordopt(text):
    """Normalise one news string before TF-IDF vectorisation.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are removed,
    and the remaining tokens are Porter-stemmed and re-joined with
    single spaces.

    Parameters
    ----------
    text : str
        Raw headline/body text.

    Returns
    -------
    str
        Space-separated stemmed tokens (empty string if no token survives).
    """
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = text.lower().split()
    return ' '.join(ps.stem(word) for word in tokens if word not in STOP_WORDS)

In [22]:
# Run the text-cleaning function over every article.
data['News'] = data['News'].map(wordopt)

In [23]:
data.head()

Unnamed: 0,Label,News
0,1,four way bob corker skewer donald trump imag c...
1,1,linklat war veteran comedi speak modern americ...
2,1,trump fight corker jeopard legisl agenda feud ...
3,1,egypt cheiron win tie pemex mexican onshor oil...
4,1,jason aldean open snl vega tribut countri sing...


In [24]:
# Feature text and target label.
X, y = data['News'], data['Label']

In [31]:
# Hold out 25% of the articles for evaluation. The shuffle seed is fixed so
# the split — and every accuracy computed from it — is reproducible across
# kernel restarts; without it each run is scored on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [32]:
# TF-IDF features: the vocabulary and IDF weights are learned from the
# training texts only; the test texts are merely projected onto them.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(raw_documents=X_train)
xv_test = vectorization.transform(raw_documents=X_test)

In [33]:
X_train

1947    learn faster learn faster reader think stori f...
723     mr robot season return feverish form cnn mr ro...
3755    jet vs brown week preview jet vs brown week pr...
230     bali volcano brink erupt first time sinc year ...
1151    adirondack histor lodg rough like millionair c...
                              ...                        
1517    fantasi footbal idp rank week embattl pharmace...
3178    veteran ask wors kneel millionair vet neglect ...
3093    europ innov univers reuter europ top tech hub ...
1610    catalan leader pressur drop independ barcelona...
994     fantasi footbal idp rank week warn someth big ...
Name: News, Length: 3006, dtype: object

In [34]:
# Show the non-zero TF-IDF entries of the first training document.
print(xv_train[0])

  (0, 22629)	0.05000157809362559
  (0, 5721)	0.07632521285010137
  (0, 4999)	0.08241031543225946
  (0, 5052)	0.07358837108904148
  (0, 17357)	0.07594608503066917
  (0, 1280)	0.06700951687756117
  (0, 1038)	0.0795168610786423
  (0, 24002)	0.1276854455993943
  (0, 9942)	0.06971869808533003
  (0, 21622)	0.0694899140757261
  (0, 21932)	0.058116901096859415
  (0, 14659)	0.05551662794758385
  (0, 19444)	0.0677367037162603
  (0, 979)	0.08384572456041228
  (0, 1306)	0.19262302559057765
  (0, 22056)	0.12632245291997116
  (0, 14230)	0.12913367919440566
  (0, 19745)	0.06495564897676094
  (0, 23313)	0.08998673156971286
  (0, 22263)	0.11543519829578269
  (0, 719)	0.10576879075286473
  (0, 16856)	0.07840339532918505
  (0, 25087)	0.08342645170841596
  (0, 26989)	0.08261000798510068
  (0, 11380)	0.1444544410698256
  (0, 7047)	0.14305012064942826
  (0, 4772)	0.15945264193855124
  (0, 5002)	0.2888575696472209
  (0, 27025)	0.19500747797517073
  (0, 11175)	0.15936835741880173
  (0, 3930)	0.190796461986845

# Model

In [35]:
# Logistic Regression: fit on the TF-IDF training matrix, then score on the held-out split.
LR_model = LogisticRegression()
LR_model.fit(X=xv_train, y=y_train)
lr_y_pred = LR_model.predict(X=xv_test)
score = accuracy_score(y_true=y_test, y_pred=lr_y_pred)
print("Accuracy LR:", score)

Accuracy LR: 0.9730807577268196


In [36]:
# SVM with a linear kernel: fit on the training matrix, then score on the held-out split.
svm_model = SVC(kernel='linear')
svm_model.fit(X=xv_train, y=y_train)

svm_y_pred = svm_model.predict(X=xv_test)
score = accuracy_score(y_true=y_test, y_pred=svm_y_pred)
print("Accuracy SVM:", score)

Accuracy SVM: 0.9760717846460618


In [38]:
# Random Forest Classifier (seeded): fit on the training matrix, then score on the held-out split.
RFC_model = RandomForestClassifier(random_state=0)
RFC_model.fit(X=xv_train, y=y_train)

rfc_y_pred = RFC_model.predict(X=xv_test)
score = accuracy_score(y_true=y_test, y_pred=rfc_y_pred)
print("Accuracy RFC:", score)

Accuracy RFC: 0.9621136590229312
