In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [3]:
# Load the news-article dataset from its relative CSV path.
csv_path = 'data/fake_or_real_news.csv'
data = pd.read_csv(csv_path)

# Bare trailing expression so the notebook renders the loaded frame.
data

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [6]:
# Encode the string label as a binary target: 0 for 'REAL', 1 for anything else.
# The guard makes the cell safe to re-run: once `label` has been replaced by
# `fake`, running it again is a no-op instead of a KeyError on the missing column.
if 'label' in data.columns:
    data = (
        data
        # Vectorised comparison; any value other than 'REAL' maps to 1,
        # matching the previous element-wise lambda.
        .assign(fake=data['label'].ne('REAL').astype('int64'))
        .drop(columns='label')
    )

In [7]:
# Model input is the article body; the target is the 0/1 `fake` flag.
x = data['text']
y = data['fake']

In [8]:
# Hold out 20% of the articles for evaluation. A fixed random_state makes the
# shuffle repeatable, so the score and any article picked from X_test by
# position stay the same across Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
X_train

3022    Assange claims ‘crazed’ Clinton campaign tried...
4177    Veterans help veterans cope with PTSD through ...
1865    (CNN) Donald Trump believes he would "absolute...
1893    To understand what ails Hillary Clinton, let’s...
2358    MI5 Chief Gives First Ever Interview to Press,...
                              ...                        
4525    Washington (CNN) He cut billions in taxes, int...
3536    By Tony Cartalucci \nMyanmar’s defacto leader,...
2115    Americans must be vigilant in light of a terro...
2082    Hezbollah’s Candidate Becomes Lebanese Preside...
1288    BREAKING : Trump BEATING “Federal Investigatio...
Name: text, Length: 5068, dtype: object

In [9]:
# TF-IDF features: drop English stop words and ignore terms that occur in more
# than 70% of the documents.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [11]:
# Linear support-vector classifier on the TF-IDF features. random_state seeds
# the pseudo-random shuffling the solver may perform, so repeated runs fit the
# same model.
clf = LinearSVC(random_state=42)
clf.fit(X_train_vectorized, Y_train)

In [13]:
# Mean accuracy of the fitted classifier on the held-out split.
test_accuracy = clf.score(X_test_vectorized, Y_test)
test_accuracy

0.9313338595106551

In [24]:
# Pick one held-out article (by position in the test split) to classify by hand.
sample_position = 10
text = X_test.iloc[sample_position]

# Trailing expression so the cell actually renders the article it selected.
text


'On September 5, 2006, Eli Chomsky was an editor and staff writer for the Jewish Press, and Hillary Clinton was running for a shoo-in re-election as a U.S. senator. Her trip making the rounds of editorial boards brought her to Brooklyn to meet the editorial board of the Jewish Press.\n\nThe tape was never released and has only been heard by the small handful of Jewish Press staffers in the room. According to Chomsky, his old-school audiocassette is the only existent copy and no one has heard it since 2006, until today when he played it for the Observer.\n\nThe tape is 45 minutes and contains much that is no longer relevant, such as analysis of the re-election battle that Sen. Joe Lieberman was then facing in Connecticut. But a seemingly throwaway remark about elections in areas controlled by the Palestinian Authority has taken on new relevance amid persistent accusations in the presidential campaign by Clinton’s Republican opponent Donald Trump that the current election is “rigged.”\n\

In [18]:
# transform() takes an iterable of documents, so wrap the single article in a list.
single_article_batch = [text]
vectorized_text = vectorizer.transform(single_article_batch)

In [21]:
# One document was vectorised, so the prediction array has a single entry.
arr = clf.predict(
    vectorized_text,
)
arr

array([1], dtype=int64)

In [23]:
# Unwrap the single predicted label (1 = fake, 0 = real, per the `fake` encoding).
first_label = arr[0]
first_label

1

In [25]:
# Re-display the frame: the string `label` column has been replaced by the
# numeric `fake` column.
data

Unnamed: 0,id,title,text,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0


In [27]:
# Inspect a single row of the full dataset, selected by position.
row_position = 3
data.iloc[row_position]

id                                                   10142
title    Bernie supporters on Twitter erupt in anger ag...
text     — Kaydee King (@KaydeeKing) November 9, 2016 T...
fake                                                     1
Name: 3, dtype: object

In [29]:
# Article body of the row at position 3 in the full dataset.
article_bodies = data['text']
t = article_bodies.iloc[3]

In [30]:
# Vectorise the selected article with the already-fitted TF-IDF vocabulary.
# transform() expects an iterable of documents, hence the one-element list.
row_article_batch = [t]
vectorized_text = vectorizer.transform(row_article_batch)

In [31]:
# Classify the vectorised article and show its single predicted label
# (1 = fake, 0 = real, per the `fake` encoding).
arr1 = clf.predict(vectorized_text)
predicted_flag = arr1[0]
predicted_flag

1