# About the DataSet:
# 1.id:unique id for news article
# 2.title:the title of news article
# 3.author:the authoe of news article
# 4.text:the text of the article,could be incomplete
# 5.lebel:label that make the news is fake or real



Importing the dependencies

In [44]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [31]:
import nltk
nltk.download('stopwords')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [33]:
#printing the stopwords in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data pre-processing

In [34]:
#laoding the dataset to a pandas Dataframe


news_dataset = pd.read_csv('/content/train.csv')

In [16]:
news_dataset.shape

(20800, 5)

In [17]:
#print first 5 rows of the dataframe
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [19]:
#counting the number of missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,558
author,1957
text,39
label,0


In [20]:
#replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [41]:
#merging title and author column
news_dataset['content']=news_dataset['author']+' '+news_dataset['title']

In [24]:
print(news_dataset['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [26]:
#separate the label and data
X= news_dataset.drop(columns='label',axis=1)
Y=news_dataset['label']

In [27]:
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

Stemming:

Stemming is a process of reducing a word to its root word.

Exmaple:
actor,actress,acting --> act

In [32]:
port_stem = PorterStemmer()

In [38]:
def stemming(content): #defining function
  stemmed_content = re.sub('[^a-zA-Z]',' ', content)
  stemmed_content=stemmed_content.lower()
  stemmed_content=stemmed_content.split()
  stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content=' '.join(stemmed_content)
  return stemmed_content



In [50]:
nltk.download('punkt')
news_dataset['content'] = news_dataset['content'].apply(stemming)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [51]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [52]:
print(news_dataset['content'])

0        darrel lucu hou dem aid : we didn ’ t even see...
1        daniel j. flynn flynn : hillari clinton , big ...
2        consortiumnews.com whi the truth might get you...
3        jessica purkiss 15 civilian kill in singl us a...
4        howard portnoy iranian woman jail for fiction ...
                               ...                        
20795    jerom hudson rapper t.i . : trump a ’ poster c...
20796    benjamin hoffman n.f.l . playoff : schedul , m...
20797    michael j. de la merc and rachel abram maci ’ ...
20798    alex ansari nato , russia to hold parallel exe...
20799                david swanson what keep the f-35 aliv
Name: content, Length: 20800, dtype: object


In [53]:
#separating the data and labels
X=news_dataset['content'].values
Y=news_dataset['label'].values

In [54]:
print(X)

['darrel lucu hou dem aid : we didn ’ t even see comey ’ s letter until jason chaffetz tweet it'
 'daniel j. flynn flynn : hillari clinton , big woman on campu - breitbart'
 'consortiumnews.com whi the truth might get you fire' ...
 'michael j. de la merc and rachel abram maci ’ s is said to receiv takeov approach by hudson ’ s bay - the new york time'
 'alex ansari nato , russia to hold parallel exerci in balkan'
 'david swanson what keep the f-35 aliv']


In [55]:
print(Y)

[1 0 1 ... 0 1 1]


In [56]:
Y.shape

(20800,)

In [57]:
#converting textual data into numerical values
vectorizer=TfidfVectorizer()
vectorizer.fit(X)

X=vectorizer.transform(X)


In [58]:
print(X)

  (0, 16137)	0.1929394389101528
  (0, 15637)	0.2878266336363749
  (0, 15363)	0.25398082215784673
  (0, 13254)	0.22785171618552097
  (0, 8912)	0.31842007278599516
  (0, 8653)	0.25982548022629764
  (0, 7790)	0.21818967685665908
  (0, 7703)	0.1570374108152007
  (0, 7132)	0.1941263071103146
  (0, 5167)	0.20480646160683144
  (0, 4247)	0.2651040175355607
  (0, 4054)	0.2380784711533736
  (0, 3866)	0.31842007278599516
  (0, 3250)	0.21912133816794632
  (0, 2803)	0.3302870954826738
  (0, 693)	0.23927422651709104
  (1, 16435)	0.30314402494338183
  (1, 10617)	0.1622671322050862
  (1, 6953)	0.19405908713752693
  (1, 5681)	0.7047914946678246
  (1, 3835)	0.2614184089062767
  (1, 3109)	0.19468531860693214
  (1, 2554)	0.36206203788117136
  (1, 2236)	0.15314709450655994
  (1, 1861)	0.2942757007699871
  :	:
  (20797, 7674)	0.1284116342916905
  (20797, 7167)	0.20941454391931577
  (20797, 3908)	0.21130967158113106
  (20797, 2467)	0.1484704722525159
  (20797, 1658)	0.32187162265402935
  (20797, 1102)	0.2990

Splitting the dataset to training and test data




In [60]:
#80% data is train data and 20% data is test data
X_train , X_test, Y_train , Y_test = train_test_split(X , Y , test_size=0.2 , stratify=Y , random_state =2)


Training the model: Logistic Regression

In [61]:
model = LogisticRegression()

In [63]:
model.fit(X_train,Y_train)

Evaluation

accuracy score

In [64]:
#accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction , Y_train)


In [65]:
print('Accuracy score of training dataset: ' , training_data_accuracy)

Accuracy score of training dataset:  0.9903846153846154


In [66]:
#accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction , Y_test)


In [67]:
print('Accuracy score of test dataset: ' , test_data_accuracy)

Accuracy score of test dataset:  0.9841346153846153


Making a predictive system

In [68]:
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)
if (prediction[0]==0):
  print("The news is real")
else:
  print("The news is fake")


[1]
The news is fake
