In [6]:
import numpy as np
import pandas as pd

#from scikit-learn
from sklearn.model_selection import train_test_split#we have dataset split into 80% training data and 20% testing data(unseen data)
from sklearn.feature_extraction.text import TfidfVectorizer#we want to take the text and transform it into something(numerics) we want to feed into ml model, can't just blow into a random forest model, i.e. something we can represent into numbers
# tf(term frequency):no of times a term appears in document, idf(inverse document frequency):divide number of documents/number of documents contained the term

from sklearn.svm import LinearSVC

In [7]:
data = pd.read_csv("fake_or_real_news.csv")

In [8]:
data

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [10]:
#is the data fake or no, for each value in label, 0:Real, 1:false
data['fake'] = data['label'].apply(lambda x: 0 if x == "REAL" else 1)

In [11]:
data = data.drop("label", axis = 1)

In [12]:
X, y = data['text'], data['fake']

In [13]:
X

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [15]:
y#is data fake(1) or real(0)

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: fake, Length: 6335, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)#80/20 train test split, this is random split

In [17]:
X_train

4694    CHARLESTON, S.C. -- Andre McPherson has been c...
6238    by Yves Smith \nBy Bill Black, the author of T...
3294    The Economic Collapse – by Michael Snyder \nJu...
5832    Ted Cruz won more delegates than anyone else o...
2772    One of the first signs that the presidential c...
                              ...                        
1337    United States – reformation or fracture? by Th...
6261    Last week, Julian Assange, Editor-In-Chief of ...
1812    PITTSBURGH—Hillary Clinton on Saturday drew a ...
5418    The election in 232 photos, 43 numbers and 131...
4112    Hillary Clinton’s presidential campaign, her n...
Name: text, Length: 5068, dtype: object

In [18]:
len(X_train)

5068

In [19]:
len(X_test)

1267

In [21]:
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

#vectorize
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [22]:
clf = LinearSVC()
clf.fit(X_train_vectorized, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [23]:
clf.score(X_test_vectorized, y_test) #94% accuracy, i.e. out of 1267 articles, 94% were classified correctly

0.9408050513022889

In [24]:
len(y_test) * 0.9408

1191.9936

In [28]:
#open in writing mode
with open("mytextfakenews.txt", "w", encoding="utf-8") as f:
    f.write(X_test.iloc[10])# a part of dataset

In [30]:
with open("mytextfakenews.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [31]:
text

'The world is about to change drastically . Will you be ready for it? \nThe Future Doesn’t Need Us… Or So We’ve Been Told. \nWith the rise of technology and the real-time pressures of an online, global economy, humans will have to be very clever – and very careful – not to be left behind by the future. \nFrom the perspective of those in charge, human labor is losing its value, and people are becoming a liability. \nThis documentary reveals the real motivation behind the secretive effort to reduce the population and bring resource use into strict, centralized control. \nCould it be that the biggest threat we face isn’t just automation and robots destroying jobs, but the larger sense that humans could become obsolete altogether?'

In [33]:
vectorized_text = vectorizer.transform([text])

In [46]:
clf.predict(vectorized_text)# To get just 1 instead of array([1]) do indexing [0]

array([1])

In [47]:
y_test.iloc[10]

np.int64(1)

In [48]:
#output of 1 means the news is fake

In [51]:
# My custom news text or import
# my_news = input("Enter the news text: ")
my_news = """The government has heavily banned all social media platforms."""
vectorized_input = vectorizer.transform([my_news]) # vectorize the input
prediction = clf.predict(vectorized_input)[0] # use our trained classifier to predict if news is fake or not

In [52]:
if prediction == 1:
    print("This news is likely FAKE.")
else:
    print("This news is likely REAL.")

This news is likely FAKE.
