In [1]:
# PROJECT ON FAKE NEWS DETECTION

In [2]:
pip install numpy pandas sklearn

Note: you may need to restart the kernel to use updated packages.


In [3]:
# import libraries
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

In [5]:
# Read the data
df = pd.read_csv("news.csv")


In [6]:
# removing unnamed columns
df = df.iloc[:,0:4]

In [7]:
# Get the shape of data
df.shape

(7795, 4)

In [8]:
# This step is to print the information up to 5 columns
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [9]:
# This step is to print the information of last 5 columns
df.tail()

Unnamed: 0.1,Unnamed: 0,title,text,label
7790,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
7791,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
7792,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
7793,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL
7794,4330,Jeb Bush Is Suddenly Attacking Trump. Here's W...,Jeb Bush Is Suddenly Attacking Trump. Here's W...,REAL


In [10]:
# This step is to count the null values
df.isnull().sum()

Unnamed: 0     219
title          610
text           866
label         1040
dtype: int64

In [11]:
# Changing into DATAFRAME
df = pd.DataFrame(df)

In [12]:
# removing the null values from dataframe 
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [13]:
x = df.text
y = df.label

In [14]:
# Split the dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 7)

In [15]:
# train the model

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english',max_df=0.7)
tfidt_train = tfidf.fit_transform(x_train)
tfidt_test = tfidf.transform(x_test)

# let us use the function get_feature_names()
tfidf.get_feature_names()

['00',
 '000',
 '0000',
 '000000031',
 '000035',
 '00006',
 '0001',
 '0001pt',
 '0002',
 '000billion',
 '000ft',
 '000km',
 '000x',
 '001',
 '0011',
 '003',
 '004',
 '004s',
 '005',
 '005s',
 '006',
 '00684',
 '006s',
 '007',
 '007s',
 '008',
 '009',
 '00am',
 '00pm',
 '01',
 '010',
 '011',
 '012',
 '013',
 '013c2812c9',
 '014',
 '015',
 '016',
 '01am',
 '02',
 '022',
 '023',
 '024',
 '027',
 '028',
 '02welcome',
 '03',
 '030',
 '031',
 '032',
 '033',
 '034',
 '037',
 '039',
 '03eb',
 '04',
 '0400',
 '042',
 '044',
 '047',
 '04pm',
 '05',
 '050',
 '052',
 '053',
 '056',
 '058',
 '06',
 '0600',
 '063',
 '0640',
 '066',
 '068',
 '06pm',
 '07',
 '0700',
 '071',
 '075',
 '0750',
 '076',
 '079',
 '08',
 '084',
 '0843',
 '085',
 '0851',
 '089',
 '09',
 '091',
 '098263',
 '09am',
 '09pm',
 '0_65b67362bd',
 '0_jgdktlmn',
 '0_kvyhphja',
 '0b6njlny5j',
 '0dpbdk6rjd',
 '0fjjvowyhg8qtskiz',
 '0h4at2yetra17uxetni02ls2jeg0mty45jrcu7mrzsrpcbq464i',
 '0hour',
 '0hq3vb2giv',
 '0in',
 '0jsn6pjkan',
 '0p

In [17]:
# Let us use the function get_stop_words()
tfidf.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [18]:
# Let us use the build_analyzer() function
tfidf.build_analyzer()

functools.partial(<function _analyze at 0x00000196EA729820>, ngrams=<bound method _VectorizerMixin._word_ngrams of TfidfVectorizer(max_df=0.7, stop_words='english')>, tokenizer=<built-in method findall of re.Pattern object at 0x00000196EA835AC0>, preprocessor=functools.partial(<function _preprocess at 0x00000196EA729790>, accent_function=None, lower=True), decoder=<bound method _VectorizerMixin.decode of TfidfVectorizer(max_df=0.7, stop_words='english')>, stop_words=frozenset({'again', 'few', 'i', 'and', 'sometime', 'mill', 'but', 'many', 'which', 'not', 'everything', 'our', 'more', 'nothing', 'found', 'afterwards', 'sincere', 'some', 'amoungst', 'anywhere', 'yet', 'whenever', 'or', 'seems', 'sometimes', 'latterly', 'per', 'anyhow', 'hereupon', 'fill', 'out', 'wherein', 'nine', 'ourselves', 'whose', 'own', 'last', 'under', 'where', 'five', 'herein', 'became', 'whence', 'elsewhere', 'him', 'now', 'someone', 'herself', 'how', 'in', 'becomes', 'this', 'we', 'see', 'yourselves', 'whoever',

In [19]:
# Let us use the function build_preprocessor()
tfidf.build_preprocessor()

functools.partial(<function _preprocess at 0x00000196EA729790>, accent_function=None, lower=True)

In [20]:
# use the class PassiveAggressiveClassifier and predict the model
pac = PassiveAggressiveClassifier()
pac.fit(tfidt_train,y_train)
y_pred = pac.predict(tfidt_test)

In [21]:
# calculate the accuracy
round(accuracy_score(y_test,y_pred),2)*100

87.0

In [22]:
print(type(x_train))

<class 'pandas.core.series.Series'>


In [23]:
print(type(x_test))

<class 'pandas.core.series.Series'>


In [24]:
print(type(y_train))

<class 'pandas.core.series.Series'>
