In [29]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# TF  - term frequency: the number of times a term appears in a document
# IDF - inverse document frequency: log(number of documents / number of documents that contain the term)



In [30]:
# Load the news articles from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="fake_or_real_news.csv")

In [31]:
# Display the loaded DataFrame (pandas elides the middle rows in the output).
df 

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [32]:
# Preview the first five rows.
df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [33]:
# Preview the last five rows.
df.tail()

Unnamed: 0,id,title,text,label
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
6334,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


In [34]:
# (rows, columns) of the loaded dataset.
df.shape

(6335, 4)

In [35]:
# Binary target: 0 when the label is exactly "REAL", 1 for anything else.
# Computed as a vectorized comparison; int64 is requested explicitly so the
# dtype does not depend on the platform's default integer width.
is_not_real = df['label'] != "REAL"
df['fake'] = is_not_real.astype("int64")

In [36]:
# Check that the new 'fake' column lines up with 'label' (FAKE -> 1, REAL -> 0).
df

Unnamed: 0,id,title,text,label,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,0


In [37]:
# Features are the raw article texts; the target is the binary 'fake' flag.
x = df['text']
y = df['fake']

In [38]:
# Inspect the target Series.
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: fake, Length: 6335, dtype: int64

In [39]:
# Hold out 20% of the articles for evaluation. A fixed random_state makes the
# shuffle deterministic, so the split and every downstream metric can be
# reproduced after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [40]:
# Inspect the training texts (the index shows the original row numbers after shuffling).
X_train

3622    John Kasich was killing it with these Iowa vot...
5762    2013: 7,157 [4] \nVirtually no one in the Zion...
1610    Abigail Marsh almost lost her life in a car ac...
4389    X Dear Reader! VDARE.com isn’t just a website....
3610    On this day in 1973, J. Fred Buzhardt, a lawye...
                              ...                        
3979    Senate Republicans are rejecting renewed calls...
6032    Many see a double standard in the FBI's recomm...
5925    Sen. Bernie Sanders won nearly three dozen del...
5757    Really the only unusual or exciting thing abou...
4493    Sean Hannity interviewing Mike Pence about Oba...
Name: text, Length: 5068, dtype: object

In [41]:
# Inspect the training labels.
y_train

3622    0
5762    1
1610    1
4389    1
3610    0
       ..
3979    0
6032    0
5925    0
5757    0
4493    1
Name: fake, Length: 5068, dtype: int64

In [42]:
# Number of training labels.
y_train.shape

(5068,)

In [43]:
# Number of held-out test labels.
y_test.shape 

(1267,)

In [44]:
# TF-IDF vectorizer that removes English stop words and ignores terms that
# occur in more than 70% of the documents.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words="english",
)


In [45]:
# Fit the vectorizer on the training texts only, then apply that same fitted
# vectorizer to the test texts, so the test set does not influence the
# vocabulary or weights used to build the features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


In [46]:
# Inspect the vectorized training data (a sparse matrix).
X_train_vectorized

<5068x60886 sparse matrix of type '<class 'numpy.float64'>'
	with 1321799 stored elements in Compressed Sparse Row format>

In [47]:
# Linear support-vector classifier. LinearSVC is imported in the notebook's
# top import cell; random_state is fixed so repeated fits give the same model.
clf = LinearSVC(random_state=42)

In [48]:
# Train the classifier on the TF-IDF features of the training articles.
clf.fit(X=X_train_vectorized, y=y_train)

In [49]:
# Mean accuracy of the trained classifier on the held-out test set.
clf.score(X=X_test_vectorized, y=y_test)

0.9376479873717443

In [50]:
# (test documents, vocabulary size) of the vectorized test set.
print(X_test_vectorized.shape)


(1267, 60886)


In [51]:
# (training documents, vocabulary size) of the vectorized training set.
print(X_train_vectorized.shape)


(5068, 60886)


In [63]:
# Show the raw text of the test article at position 100.
X_test.iloc[100]

'Fear of a possible Islamic State bloodbath sent tens of thousands of Iraqis fleeing Ramadi on Monday after government forces abandoned the city -- just 80 miles from Baghdad -- in what one U.S. military official conceded was a fight "pretty much over."\n\nSome 25,000 people have fled the embattled streets of Ramadi as thousands of ISIS fighters seized the key Iraqi city, killing some 500, and reportedly going door-to-door looking for Iraqi government troops and police to run out of town.\n\n“There have been executions in the streets of Ramadi," Muhannad Haimour, a spokesman for the Anbar provincial government, told NBC News Monday. ISIS extremists used vehicles, bulldozers rigged with explosives and suicide bombers to overrun the city after weeks of battles in the street.\n\n"The situation in the city is absolutely terrible," Haimour said. "The city is in very bad shape."\n\nGen. Martin Dempsey, chairman of the Joint Chiefs of Staff, called ISIS\' gains "a serious setback" for both th

In [57]:
# Save the test article at position 100 to disk. This is the same sample whose
# text is displayed and whose true label is checked in the neighbouring cells,
# so the prediction made from this file is compared against the matching
# label. (Position 9 was written here before, which mixed up two articles.)
with open("mytext.txt", "w", encoding="utf-8") as f:
    f.write(X_test.iloc[100])

In [58]:
# Read the saved article back from disk as a single UTF-8 string.
with open("mytext.txt", mode="r", encoding="utf-8") as infile:
    text = infile.read()

In [64]:
# Vectorize the single article with the already-fitted vectorizer; the text is
# wrapped in a list so it is passed as a one-document collection.
vectorized_text = vectorizer.transform([text])

In [65]:
# Predicted class for the article, using the 'fake' encoding: 1 = fake, 0 = real.
clf.predict(vectorized_text)

array([1], dtype=int64)

In [67]:
# True label of the test article at position 100 (1 = fake, 0 = real).
y_test.iloc[100]

0