Import the dependencies


In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# download the NLTK stopwords corpus (read later via stopwords.words('english'))
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Preprocessing

In [4]:
# loading the dataset to pandas Data Frame
# NOTE(review): the preview of this frame shows mojibake (e.g. "Didnât") and
# hundreds of empty "Unnamed: N" columns, so the file may really be UTF-8 and/or
# malformed -- confirm the encoding and delimiter before trusting the columns.
news_dataset = pd.read_csv('/content/train.csv', encoding='latin-1', low_memory=False)

In [5]:
# (number of rows, number of columns) of the loaded frame
news_dataset.shape

(2426, 686)

In [6]:
# show the first five rows of the dataset
news_dataset.head()


Unnamed: 0,id,title,author,text,label,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 676,Unnamed: 677,Unnamed: 678,Unnamed: 679,Unnamed: 680,Unnamed: 681,Unnamed: 682,Unnamed: 683,Unnamed: 684,Unnamed: 685
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...,1,,,,,,...,,,,,,,,,,
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,,,,,,...,,,,,,,,,,
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,,,,,,...,,,,,,,,,,
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,,,,,,...,,,,,,,,,,
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,,,,,,...,,,,,,,,,,


In [7]:
# counting the number of missing values in each column of the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,123
author,349
text,190
label,244
...,...
Unnamed: 681,2426
Unnamed: 682,2426
Unnamed: 683,2426
Unnamed: 684,2426


In [8]:
# replacing the null values with empty strings
# (this applies to every column, so a missing 'label' also becomes '')
news_dataset = news_dataset.fillna('')

In [9]:
# merge the author name and news title, separated by a single space, into a new 'content' column
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']


  news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']


In [10]:
# split the frame into the target ('label') and the features (every other column)
Y = news_dataset['label']
X = news_dataset.drop(columns='label')

In [None]:
# inspect the feature columns
print(X)

In [None]:
# inspect the label column
print(Y)

Stemming: the process of reducing a word to its root word.
Example: actor, actress, acting → act


In [13]:
# single Porter stemmer instance, reused by stemming() for every word
from nltk.stem.porter import PorterStemmer
port_stem = PorterStemmer()

In [14]:
# Build the stopword lookup once; rebuilding the set for every single word
# (as the comprehension previously did) dominates the runtime of stemming().
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-separated Porter stems.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stopwords are
    dropped, and each remaining word is stemmed with ``port_stem``.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in words if word not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [15]:
# stem every row's text; this overwrites 'content' in place, so re-running the
# cell would stem text that has already been stemmed
news_dataset['content']= news_dataset['content'].apply(stemming)


In [16]:
# inspect the stemmed text
print(news_dataset['content'])

0       darrel lucu hous dem aid even see comey letter...
1       daniel j flynn flynn hillari clinton big woman...
2                  consortiumnew com truth might get fire
3       jessica purkiss civilian kill singl us airstri...
4       howard portnoy iranian woman jail fiction unpu...
                              ...                        
2421    prashant rao sewel chan alan rusbridg guardian...
2422                                                     
2423    liam stack giant panda longer endang vulner ne...
2424    adam liptak michael shear suprem court tie blo...
2425    activist post fbi want believ examin email second
Name: content, Length: 2426, dtype: object


In [25]:
# Keep only the rows whose 'label' is the string '0' or '1'
filtered_dataset = news_dataset.loc[news_dataset['label'].isin(['0', '1'])]

# Pull the stemmed text (features) and the labels (target) out as arrays
Y = filtered_dataset['label'].values
X = filtered_dataset['content'].values

In [18]:
# inspect the text features
print(X)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'liam stack giant panda longer endang vulner new york time'
 'adam liptak michael shear suprem court tie block obama immigr plan new york time'
 'activist post fbi want believ examin email second']


In [19]:
# inspect the labels
print(Y)

['1' '0' '1' ... '0' '0' '1']


In [20]:
# number of labels
# NOTE(review): this cell's execution count (20) is lower than that of the
# label-filter cell (25), so the saved output likely predates the filter -- re-run to confirm.
Y.shape


(2426,)

In [33]:
#converting the textual data to numerical data
# fit_transform learns the vocabulary/IDF weights and vectorises the texts in
# one pass (equivalent to fit() followed by transform()).
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so the IDF statistics also see the test documents;
# consider splitting first and fitting on the training texts only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# inspect the vectorised features
print(X)

In [35]:
# hold out 20% of the samples for testing; the split is stratified on the
# labels and uses a fixed seed so it is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# inspect the test-split features
print(X_test)

**Training the Model : Logistic Regression Model**

In [37]:
# logistic-regression classifier created with no arguments (library defaults)
model = LogisticRegression()

In [38]:
# fit the classifier on the training split
model.fit(X_train, Y_train)

In [39]:
#evaluation
#accuracy on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric so the
# value is unchanged, but the documented order avoids mistakes if this is ever
# swapped for an asymmetric metric (precision, recall, ...)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [40]:
# report the accuracy measured on the training split
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.974390243902439


In [41]:
#accuracy for test data
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); the value is the same either
# way, but the documented order is kept for clarity and consistency
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [42]:
# report the accuracy measured on the held-out test split
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9391727493917275


Making a predictive system


In [57]:
# classify one sample from the test split and report the result in words
# (the stray, unused `from os import pread` was removed: os.pread only exists
# on Unix, so that import raised ImportError on Windows)
x_news = X_test[186]
prediction = model.predict(x_news)
print(prediction)

# labels are strings here: '0' is reported as real, anything else as fake
if prediction[0] == '0':
  print('The news is Real')
else:
  print('The news is Fake')

['1']
The news is Fake


In [58]:
# true label at index 186 of the test split, the same index used for the prediction
print(Y_test[186])

1
