In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-detection/data.h5
/kaggle/input/fake-news-detection/data.csv


In [2]:
import matplotlib.pyplot as plt
import seaborn as sea

sea.set()

In [3]:
# Taking the input
data = pd.read_csv('../input/fake-news-detection/data.csv')

In [4]:
# printing the first 5 values
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [5]:
# checking for null values
data.isnull().sum()

URLs         0
Headline     0
Body        21
Label        0
dtype: int64

In [6]:
# making a copy of data to apply operation on it
df = data.copy()

In [7]:
# removing the rows with null values
df = df.dropna()

In [8]:
# Again checking for null values
df.isnull().sum()

URLs        0
Headline    0
Body        0
Label       0
dtype: int64

In [9]:
# checking the shape 
df.shape

(3988, 4)

In [10]:
# we can apply this algorithm on heading or body.
# I am choosing to merge those parameter and then apply this algorithm
df['News'] = df['Headline'] + df['Body']
df.head()

Unnamed: 0,URLs,Headline,Body,Label,News
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,Four ways Bob Corker skewered Donald TrumpImag...
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,Linklater's war veteran comedy speaks to moder...
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,Jason Aldean opens 'SNL' with Vegas tributeCou...


In [11]:
# urls giving no/less information about the Label of the news
df = df.drop(['URLs','Headline','Body'],axis=1)
df.head()

Unnamed: 0,Label,News
0,1,Four ways Bob Corker skewered Donald TrumpImag...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tributeCou...


In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

ps = PorterStemmer()
def wordcheck(text):
    
    # Exchange the pattern other than [a-zA-Z] with space
    text = re.sub('[^a-zA-Z]', ' ',text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Split it into array on the basics of space ' '
    text = text.split()
    
    # Take the stem words that are not present in stopwords
    text = [ps.stem(word) for word in text if word not in stopwords.words('english')]
    
    # Join them using space " "
    text = ' '.join(text)
    
    return text

In [13]:
# apply this fuction to every row
df['News'] = df['News'].apply(wordcheck)
df.head()

Unnamed: 0,Label,News
0,1,four way bob corker skewer donald trumpimag co...
1,1,linklat war veteran comedi speak modern americ...
2,1,trump fight corker jeopard legisl agendath feu...
3,1,egypt cheiron win tie pemex mexican onshor oil...
4,1,jason aldean open snl vega tributecountri sing...


In [33]:
# For Splitting the dataset
from sklearn.model_selection import train_test_split

X = df['News']
Y = df['Label']

# Split the data into training and test set
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=42)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# columns will be divided using one for each unique word in the Text
# TfidfVectorizer(stop_words=stopwords_list) Can also remove stopwords

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
print(xv_train[0])
xv_test = vectorization.transform(x_test)

  (0, 22197)	0.14727735947002057
  (0, 20926)	0.1454491850984949
  (0, 7419)	0.10728063545929646
  (0, 27349)	0.11965521495135883
  (0, 4014)	0.1454491850984949
  (0, 29547)	0.09410810547372458
  (0, 12171)	0.07701688630576346
  (0, 24685)	0.0723220060747356
  (0, 16914)	0.2574032750794516
  (0, 7176)	0.1549902000723302
  (0, 6487)	0.1234306919122799
  (0, 8861)	0.0946246200339839
  (0, 26779)	0.08229781312444595
  (0, 25089)	0.07424908760484568
  (0, 27799)	0.08516431306657678
  (0, 20567)	0.0838180342599189
  (0, 17925)	0.1276088759346977
  (0, 4349)	0.10775864953736415
  (0, 27388)	0.061042643819899114
  (0, 237)	0.09151978768581427
  (0, 8876)	0.08636294644701457
  (0, 25289)	0.07580789045858484
  (0, 26487)	0.07728727109307852
  (0, 21477)	0.0978313098027029
  (0, 22003)	0.20997559489080853
  (0, 22008)	0.280629402336117
  (0, 28427)	0.38047435223157106
  (0, 12265)	0.42700944335971086
  (0, 874)	0.2729707050549553
  (0, 16806)	0.2866443999942031
  (0, 26234)	0.20668896907551723
 

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_model = SVC(kernel='linear')

# Fitting training set to the model
svm_model.fit(xv_train,y_train)

# Predicting the test set results based on the model
svm_y_pred = svm_model.predict(xv_test)

# Calculate the accuracy score of this model
score = accuracy_score(y_test,svm_y_pred)
print('Accuracy of SVM model is ', score)

Accuracy of SVM model is  0.9874686716791979


In [17]:
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression()

#Fitting training set to the model
LR_model.fit(xv_train,y_train)

#Predicting the test set results based on the model
lr_y_pred = LR_model.predict(xv_test)

#Calculate the accurracy of this model
score = accuracy_score(y_test,lr_y_pred)
print('Accuracy of LR model is ', score)

Accuracy of LR model is  0.974937343358396


In [18]:
from sklearn.naive_bayes import MultinomialNB
NB_model =MultinomialNB()

#Fitting training set to the model
NB_model.fit(xv_train, y_train)

#Predicting the test set results based on the model
nb_y_pred = NB_model.predict(xv_test)

#Calculate the accurracy of this model
score = accuracy_score(y_test,nb_y_pred)
print('Accuracy of NB model is ', score)

Accuracy of NB model is  0.924812030075188


In [19]:
def check_news(news):
    new_def_test = pd.DataFrame([news], columns=['text'])
    
    new_def_test["text"] = new_def_test["text"].apply(wordcheck)
    new_x_test = new_def_test["text"]
    
    # print(new_x_test)
    vectorized_input_data = vectorization.transform(new_x_test)
    
    prediction = svm_model.predict(vectorized_input_data)
    
    print(prediction)

In [20]:
print(df['Label'][878])
check_news("save children f branco cartoona potato batteri light room month diy system use provid room led power light long day video")

0
[0]


In [21]:
print(df['Label'][123])
check_news("exclus prioriti usa run tax reform tv adstori highlight far republican releas framework tax plan stop progress group attack gop effort washington cnn democrat super pac throw attent money fight republican tax plan focus first televis ad year messag argu presid donald trump tax plan hurt middl class prioriti usa action support hillari clinton presidenti bid last year focus issu say take ad buy begin tuesday ad run cnn fox news along digit ad campaign second ad featur peopl identifi middl class career carpent teacher nurs worri new plan rais tax pay tax cut wealthi big corpor earli morn tweet make right one man say ad refer trump habit tweet start day ad run nation cabl specif target nevada arizona missouri ohio wisconsin senat elect next year ad also run washington dc accord group read")

1
[1]
