About the Dataset:
1. id : unique id for a news article
2. title: the title of a news article
3. author: author of the news article
4. text: the text of the article; could be incomplete
5. label: a label that marks whether the news article is real or fake

#1 : Fake news
#2 : Real news

In [5]:
import numpy as np
import pandas as pd
import re # re- regular expression: it is useful for searching the text in a document
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # remove the prefix and suffixes 
from sklearn.feature_extraction.text import TfidfVectorizer # convert the text into feature vector
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\A C E
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:
# printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Data Pre-processing

In [20]:
# loading the dataset to a pandas DataFrame
news_dataset=pd.read_csv('WELFake_Dataset.csv')

In [21]:
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [24]:
news_dataset.shape

(72134, 4)

In [26]:
# Counting the number of missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [28]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna("")

In [30]:
 # separating the data & label
x = news_dataset.drop(columns='label', axis = 1)
y = news_dataset['label']

In [32]:
print(x)
print(y)

       Unnamed: 0                                              title  \
0               0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1               1                                                      
2               2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3               3  Bobby Jindal, raised Hindu, uses story of Chri...   
4               4  SATAN 2: Russia unvelis an image of its terrif...   
...           ...                                                ...   
72129       72129  Russians steal research on Trump in hack of U....   
72130       72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131       72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132       72132  Trump tussle gives unpopular Mexican leader mu...   
72133       72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  
0      No comment is expected from Barack Obama Membe...  
1         Did the

# Stemming:
Stemming is the process of reducing a word to its Root word
example:
actor, actress, acting --> act (root word)

In [35]:
port_stem = PorterStemmer()

In [41]:
def stemming(content): # define a function name called stemming
    stemmed_content = re.sub('[^a-zA-Z]',' ',content) # re:searching a paragraph for text , sub:substitute certain values , a-zA-Z : remove everything that is not in alphabet and words
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split() # all those text and words will be converted to a list
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ''.join(stemmed_content)
    return stemmed_content

In [43]:
news_dataset['content'] = news_dataset['title'].apply(stemming)

In [45]:
print(news_dataset['content'])

0        lawenforchighalertfollowthreatcopwhiteblackliv...
1                                                         
2        unbelievobamaattorneygenersaycharlottrioterpea...
3        bobbijindalraishinduusestorichristianconverswo...
4        satanrussiaunvimagterrifinewsupernukwesternwor...
                               ...                        
72129          russianstealresearchtrumphackudemocratparti
72130    watchgiulianidemanddemocratapologtrumpracistbi...
72131               migrantrefusleavtrainrefugecamphungari
72132    trumptusslgiveunpopularmexicanleadermuchneedsh...
72133                goldmansachendorshillariclintonpresid
Name: content, Length: 72134, dtype: object


In [47]:
# separating the data and label
x = news_dataset['content'].values
y = news_dataset['label'].values

In [49]:
print(x)
print(y)

['lawenforchighalertfollowthreatcopwhiteblacklivesmattfyfterroristvideo'
 ''
 'unbelievobamaattorneygenersaycharlottrioterpeacprotesthomestatenorthcarolinavideo'
 ... 'migrantrefusleavtrainrefugecamphungari'
 'trumptusslgiveunpopularmexicanleadermuchneedshotarm'
 'goldmansachendorshillariclintonpresid']
[1 1 1 ... 0 0 1]


In [53]:
x.shape

(72134,)

In [55]:
y.shape

(72134,)

In [62]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer() # Tf: term frequency , idf: inverse document frequency - counts number of times a particular word 
vectorizer.fit(x)
x = vectorizer.transform(x)

In [64]:
print(x)

  (0, 26906)	1.0
  (2, 55387)	1.0
  (3, 3871)	1.0
  (4, 41950)	1.0
  (5, 47408)	1.0
  (6, 13379)	1.0
  (7, 22286)	1.0
  (8, 44882)	1.0
  (9, 26826)	1.0
  (10, 19791)	1.0
  (11, 29313)	1.0
  (12, 42283)	1.0
  (13, 58691)	1.0
  (14, 7062)	1.0
  (15, 3488)	1.0
  (16, 5745)	1.0
  (17, 55823)	1.0
  (18, 28631)	1.0
  (19, 42559)	1.0
  (20, 780)	1.0
  (21, 21542)	1.0
  (22, 42458)	1.0
  (23, 55566)	1.0
  (24, 14889)	1.0
  (25, 4171)	1.0
  :	:
  (72109, 36778)	1.0
  (72110, 21966)	1.0
  (72111, 3930)	1.0
  (72112, 55615)	1.0
  (72113, 18465)	1.0
  (72114, 3623)	1.0
  (72115, 38492)	1.0
  (72116, 42174)	1.0
  (72117, 46765)	1.0
  (72118, 27270)	1.0
  (72119, 3561)	1.0
  (72120, 33081)	1.0
  (72121, 17320)	1.0
  (72122, 45934)	1.0
  (72123, 10938)	1.0
  (72124, 55657)	1.0
  (72125, 61135)	1.0
  (72126, 12133)	1.0
  (72127, 60538)	1.0
  (72128, 25531)	1.0
  (72129, 41273)	1.0
  (72130, 58634)	1.0
  (72131, 30383)	1.0
  (72132, 52644)	1.0
  (72133, 19475)	1.0


In [68]:
# Splitting dataset into training and test data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, stratify=y, random_state=2)

In [72]:
 # training the model: logistic Regression
model = LogisticRegression()

In [74]:
model.fit(x_train, y_train)

# Evaluation

In [85]:
# accuracy score
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)
print('Training data accuracy: ',training_data_accuracy)

Training data accuracy:  0.9898452527423016


In [87]:
model.fit(x_test,y_test)
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_prediction, y_test)
print('Testing data accuracy: ',testing_data_accuracy)

Testing data accuracy:  1.0


 # Making a predictive system

In [90]:
x_new = x_test[0]
prediction = model.predict(x_new)
print(prediction)

if (prediction[0]==0):
    print('The news is Real')
else:
    print('The news is fake')

[1]
The news is fake


In [92]:
print(y_test[0])

1


In [96]:
x_new = x_test[1]
prediction = model.predict(x_new)
print(prediction)

if (prediction[0]==0):
    print('The news is Real')
else:
    print('The news is fake')

[0]
The news is Real


In [98]:
print(y_test[1])

0
