In [4]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
news_data=pd.read_csv('/content/train.csv')

In [8]:
news_data.shape


(20800, 5)

In [9]:
news_data.head


<bound method NDFrame.head of           id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2        

In [10]:
news_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [11]:
#we have a large dataset so we can replace null values with empty string
news_data=news_data.fillna('')

In [13]:
news_data['content']=news_data['author']+''+news_data['title']

In [14]:
print(news_data)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [19]:
#seperating data and labels
X=news_data.drop(columns='label',axis=1)
Y=news_data['label']

In [20]:
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [21]:
#stemming->process of reducing a word to its root word
port_stem=PorterStemmer()

In [27]:
def stemming(content):
  stemmed_content=re.sub('[^a-zA-Z]','',content)
  stemmed_content=stemmed_content.lower()
  stemmed_content=stemmed_content.split()
  stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english') ]
  stemmed_content=''.join(stemmed_content)
  return stemmed_content

In [28]:
news_data['content']=news_data['content'].apply(stemming)

In [29]:
print(news_data['content'])

0        darrelllucushousedemaidewedidntevenseecomeysle...
1        danieljflynnflynnhillaryclintonbigwomanoncampu...
2               consortiumnewscomwhythetruthmightgetyoufir
3        jessicapurkisscivilianskilledinsingleusairstri...
4        howardportnoyiranianwomanjailedforfictionalunp...
                               ...                        
20795    jeromehudsonrappertitrumpaposterchildforwhites...
20796    benjaminhoffmannflplayoffsschedulematchupsando...
20797    michaeljdelamercedandrachelabramsmacysissaidto...
20798    alexansarynatorussiatoholdparallelexercisesinb...
20799                          davidswansonwhatkeepsthefal
Name: content, Length: 20800, dtype: object


In [31]:
#seperating data and labels
X=news_data['content'].values
Y=news_data['label'].values

In [32]:
print(X)

['darrelllucushousedemaidewedidntevenseecomeysletteruntiljasonchaffetztweetedit'
 'danieljflynnflynnhillaryclintonbigwomanoncampusbreitbart'
 'consortiumnewscomwhythetruthmightgetyoufir' ...
 'michaeljdelamercedandrachelabramsmacysissaidtoreceivetakeoverapproachbyhudsonsbaythenewyorktim'
 'alexansarynatorussiatoholdparallelexercisesinbalkan'
 'davidswansonwhatkeepsthefal']


In [33]:
print(Y)

[1 0 1 ... 0 1 1]


In [36]:
#converting the textual data to numerical data
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)

In [37]:
print(X)

  (0, 5208)	1.0
  (1, 4896)	1.0
  (2, 4549)	1.0
  (3, 9741)	1.0
  (4, 7968)	1.0
  (5, 4994)	1.0
  (6, 11988)	1.0
  (7, 1010)	1.0
  (8, 6803)	1.0
  (9, 12883)	1.0
  (10, 85)	1.0
  (11, 4201)	1.0
  (12, 1165)	1.0
  (13, 9095)	1.0
  (14, 1698)	1.0
  (15, 8725)	1.0
  (16, 13016)	1.0
  (17, 17437)	1.0
  (18, 18102)	1.0
  (19, 9269)	1.0
  (20, 14162)	1.0
  (21, 9651)	1.0
  (22, 15084)	1.0
  (23, 12692)	1.0
  (24, 5799)	1.0
  :	:
  (20775, 1824)	1.0
  (20776, 8919)	1.0
  (20777, 408)	1.0
  (20778, 13973)	1.0
  (20779, 18501)	1.0
  (20780, 4165)	1.0
  (20781, 7747)	1.0
  (20782, 18134)	1.0
  (20783, 3726)	1.0
  (20784, 5551)	1.0
  (20785, 1573)	1.0
  (20786, 7464)	1.0
  (20787, 8238)	1.0
  (20788, 9851)	1.0
  (20789, 19648)	1.0
  (20790, 288)	1.0
  (20791, 4863)	1.0
  (20792, 10228)	1.0
  (20793, 16406)	1.0
  (20794, 11851)	1.0
  (20795, 9666)	1.0
  (20796, 2366)	1.0
  (20797, 13127)	1.0
  (20798, 890)	1.0
  (20799, 5474)	1.0


In [38]:
#spitting the data to the train and the test data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [55]:
#training data to the model
model=LogisticRegression()

In [56]:
model.fit(X_train,Y_train)

LogisticRegression()

In [57]:
#evaluation of the model
#train  data prediction
X_train_pred=model.predict(X_train)
train_data_accuracy=accuracy_score(X_train_pred,Y_train)

In [58]:
print('the training data accuracy is :',train_data_accuracy)

the training data accuracy is : 0.9927283653846154


In [59]:
#test  data prediction
X_test_pred=model.predict(X_test)
test_data_accuracy=accuracy_score(X_test_pred,Y_test)

In [61]:
print('the testing data accuracy is :',test_data_accuracy)

the testing data accuracy is : 0.5329326923076924


In [65]:
#Predictive modelling
X_news=X_test[0]
prediction=model.predict(X_news)
print(prediction)
if(prediction==0):
 print('it is a real news')
else:
 print('it is a fraud news')

[0]
it is a real news


In [66]:
print(Y_test[0])

1
