# Detecting Fake News

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import LinearSVC

## Importing CSV File

In [10]:
# Load the news dataset from a CSV file located next to the notebook.
# Keeping the path in one named constant makes it easy to point at another copy.
DATA_PATH = "fake_or_real_news.csv"
data = pd.read_csv(DATA_PATH)

## Displaying the data

In [11]:
# Show the freshly loaded frame (pandas abbreviates the middle rows with "...").
data

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


## Converting the text label to a binary target (REAL = 0, FAKE = 1)

In [17]:
# Encode the string label as an integer target: REAL -> 0, FAKE -> 1.
# The mapping is explicit on purpose: with "anything that is not REAL is fake",
# a typo, a differently-cased label or a missing value would silently be
# counted as FAKE and skew both training and evaluation.
label_to_fake = {"REAL": 0, "FAKE": 1}
unexpected_labels = set(data['label'].unique()) - set(label_to_fake)
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'label' column: {unexpected_labels}")
# Series.map is vectorised, unlike a row-by-row apply(lambda ...).
data['fake'] = data['label'].map(label_to_fake).astype("int64")

In [18]:
# Display the frame again to check the new 'fake' column against 'label'.
data

Unnamed: 0,id,title,text,label,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,0


## Dropping 'label' column

In [19]:
# The string 'label' column is redundant once the numeric 'fake' target exists.
# Note: this rebinds `data`, so re-running this cell on its own raises a KeyError
# because 'label' is already gone.
data = data.drop(columns=["label"])

In [20]:
# Display the frame to confirm 'label' is gone and 'fake' remains.
data

Unnamed: 0,id,title,text,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0


## Assigning the 'text' column to x (features) and the 'fake' column to y (target)

In [21]:
# Features: the article text.
x = data['text']
# Target: the binary 'fake' column (0 = REAL, 1 = otherwise).
y = data['fake']

In [22]:
# Inspect the feature Series (the 'text' column).
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [23]:
# Inspect the target Series (the 'fake' column).
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: fake, Length: 6335, dtype: int64

## Performing train_test_split on x and y

In [24]:
# Hold out 20% of the articles for evaluation (test set); the remaining 80% are
# used for training.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each Restart & Run All.
# stratify=y keeps the FAKE/REAL proportion identical in both subsets.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [28]:
# Training portion of the article text returned by train_test_split.
x_train


3709    Statins my disrupt vascular function On the Gr...
1460    Orangutan. \nRigged. Worth trying but its not ...
4942    Dispatches from STEPHEN LENDMAN A s the moment...
1439    Wenjian Liu, 32, and his partner, Rafael Ramos...
2427    by Yves Smith \nThis unprecedented election se...
                              ...                        
5708    Killing Obama administration rules, dismantlin...
3937    With the Department of Homeland Security’s fun...
1692    This post was originally published on this sit...
247     (CNN) Hillary Clinton and Bernie Sanders both ...
4372    The evening leaves him with a strong moral cas...
Name: text, Length: 5068, dtype: object

In [29]:
# Held-out portion of the article text returned by train_test_split.
x_test

2196    By Claire Bernish at thefreethoughtproject.com...
4986    Washington (CNN) President Barack Obama announ...
2813    at 3:47 pm Leave a comment \nToday’s college-r...
1935    GET VISIBLE! Advertise Here. Find Out More Rea...
3381      Edmondo Burr in News , World // 0 Comments J...
                              ...                        
5131    When Henry Adams wrote in the early 20th centu...
1481    This is true at both the national and state le...
711     The FBI should get the lead out on its investi...
4401    Hillary Clinton announced perhaps her most amb...
4355    The Islamic State ­appears to be starting to f...
Name: text, Length: 1267, dtype: object

In [30]:
# Targets aligned (by index) with x_train.
y_train

3709    1
1460    1
4942    1
1439    0
2427    1
       ..
5708    0
3937    0
1692    1
247     0
4372    0
Name: fake, Length: 5068, dtype: int64

In [31]:
# Targets aligned (by index) with x_test.
y_test

2196    1
4986    0
2813    1
1935    1
3381    1
       ..
5131    0
1481    0
711     0
4401    0
4355    0
Name: fake, Length: 1267, dtype: int64

## Vectorize the data

In [1]:
# Turn the raw article text into TF-IDF feature matrices.
# This cell needs the imports cell to have run first; on a fresh kernel it
# fails with a NameError for TfidfVectorizer.
vectorizer = TfidfVectorizer(
    max_df=0.7,            # per scikit-learn docs: skip terms found in more than 70% of documents
    stop_words="english",  # use scikit-learn's built-in English stop-word list
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed, so nothing from it leaks into the fit.
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

NameError: name 'TfidfVectorizer' is not defined

## Creating the LinearSVC classifier

In [33]:
# Linear support-vector classifier trained on the TF-IDF features.
# random_state fixes the solver's internal shuffling so that re-running the
# notebook reproduces the same fitted model and the same score.
clf = LinearSVC(random_state=42)
clf.fit(x_train_vectorized, y_train)



In [34]:
# Accuracy of the LinearSVC on the held-out test split
# (fraction of test articles whose predicted class matches y_test).
accuracy_score(y_test, clf.predict(x_test_vectorized))

0.9400157853196527

## Creating the PassiveAggressiveClassifier

In [48]:
# Passive-Aggressive linear classifier, limited to 50 passes over the training data.
# The estimator shuffles the samples between passes, so random_state is set to
# make the fitted model (and the accuracies reported below) reproducible.
pac_classifier = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac_classifier.fit(x_train_vectorized, y_train)

In [54]:
# Predict on the same rows the model was fitted on. This measures how well the
# model memorised the training set, not how well it generalises.
y_train_pred = pac_classifier.predict(X=x_train_vectorized)

# Fraction of training articles classified correctly.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Training Accuracy:", train_accuracy)

Training Accuracy: 1.0


In [55]:
# Predict on the held-out split, which the classifier never saw during fitting.
y_test_pred = pac_classifier.predict(X=x_test_vectorized)

# Fraction of held-out articles classified correctly; this is the figure to
# compare against the LinearSVC test score.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9502762430939227
