In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
fake_news=pd.read_csv("Fake.csv")
true_news=pd.read_csv("True.csv")

In [5]:
fake_news['label'] = 0
true_news['label'] = 1

In [6]:
print(fake_news.shape,true_news.shape)

(23481, 5) (21417, 5)


In [7]:
news_dataset = pd.concat([fake_news, true_news], axis =0 )
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [8]:
news_dataset = news_dataset.sample(len(news_dataset))
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
15580,Somalia's Islamist insurgency executes four me...,(Reuters) - Somalia s Islamist al Shabaab insu...,worldnews,"November 6, 2017",1
16218,FATHER OF SON KILLED BY ILLEGAL Speaks Up On D...,,Government News,"Mar 1, 2017",0
21164,Rome's 5-Star mayor launches bid to save ailin...,ROME (Reuters) - Rome s mayor on Friday launch...,worldnews,"September 1, 2017",1
13235,MEDIA IMMEDIATELY REPORTS Alleged Killer Of Im...,There is no question that the killing of any i...,politics,"Aug 16, 2016",0
15255,Syria's Eastern Ghouta faces 'complete catastr...,"GENEVA (Reuters) - The 400,000 civilians besie...",worldnews,"November 9, 2017",1


In [9]:
news_dataset.shape

(44898, 5)

In [10]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

#### To make the model work more accurately and to reduce the computational time and computational power of the model, we have not used the data that uses the Text, as it has more number of un-necessary words and would be making no sense.

#### Inplace of that we have created a new-column content that stores the combination of subject column and title, whose length compared to text column is less, and would result in less computation time and power and better performance.

In [11]:
# merging the subject name and news title
news_dataset['content'] = news_dataset['subject']+' '+news_dataset['title']

In [12]:
print(news_dataset['content'])

15580    worldnews Somalia's Islamist insurgency execut...
16218    Government News FATHER OF SON KILLED BY ILLEGA...
21164    worldnews Rome's 5-Star mayor launches bid to ...
13235    politics MEDIA IMMEDIATELY REPORTS Alleged Kil...
15255    worldnews Syria's Eastern Ghouta faces 'comple...
                               ...                        
5309     News  Loser Donald Trump Ordered To Pay $300,0...
23268    Middle-east Episode #152 – SUNDAY WIRE: ‘From ...
3791     News  WATCH: NSA Chief Drops HUGE Bombshell Ab...
15855    politics BREAKING: IRAN THROWS DOWN ULTIMATUM:...
73       politicsNews White House aide sees temporary f...
Name: content, Length: 44898, dtype: object


### Stemming:

#### Stemming is the process of reducing a word to its Root word

#### example:
#### actor, actress, acting --> act

In [13]:
port_stem = PorterStemmer()

In [14]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [15]:
news_dataset['content'] = news_dataset['content'].apply(stemming)
print(news_dataset['content'])

15580    worldnew somalia islamist insurg execut four m...
16218    govern news father son kill illeg speak democr...
21164    worldnew rome star mayor launch bid save ail c...
13235    polit media immedi report alleg killer imam as...
15255    worldnew syria eastern ghouta face complet cat...
                               ...                        
5309     news loser donald trump order pay judg florida...
23268    middl east episod sunday wire ground zero syri...
3791     news watch nsa chief drop huge bombshel involv...
15855          polit break iran throw ultimatum move barri
73       politicsnew white hous aid see temporari fund ...
Name: content, Length: 44898, dtype: object


In [25]:
#separating the data and label
x = news_dataset['content'].values
y = news_dataset['label'].values

In [26]:
print(x)

['worldnew somalia islamist insurg execut four men accus spi'
 'govern news father son kill illeg speak democrat ignor like ignor video'
 'worldnew rome star mayor launch bid save ail citi transport firm' ...
 'news watch nsa chief drop huge bombshel involv wikileak russia elect'
 'polit break iran throw ultimatum move barri'
 'politicsnew white hous aid see temporari fund fix children health program']


#### At the end we have only one feature to train our model which is "content".
#### Now we will be converting the values of content feature from textual data to numerical data.

In [27]:
print(y)

[1 0 1 ... 0 0 1]


In [28]:
y.shape

(44898,)

In [29]:
 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [30]:
x_train

array(['worldnew chines envoy exchang view korean peninsula issu north korea',
       'politicsnew cuba want sign accord u obama exit offici',
       'left news pro trump chicagoan hit back use real fake sculptur front trump tower monument cnn',
       ..., 'politicsnew trump shift away complet muslim ban penc',
       'news trump biggest kkk fan back love anti muslim tweet morn',
       'politicsnew elect bid ohio senat keep safe distanc trump'],
      dtype=object)

In [31]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

#### So by using TfidfVectorizer, we have weighted the word counts by a measure of how often they appear in our feature and will be training our model on this numerical data.

In [32]:
print(xv_train)

  (0, 5828)	0.24505712729629356
  (0, 7212)	0.2398497460749799
  (0, 5491)	0.30683979266513595
  (0, 7768)	0.3932720280457551
  (0, 5829)	0.3190500558134875
  (0, 11430)	0.3366305226942707
  (0, 3621)	0.40674744342640057
  (0, 3491)	0.34061137087016913
  (0, 1836)	0.3484410390171002
  (0, 11801)	0.12498705520076242
  (1, 7349)	0.3127546956616098
  (1, 3640)	0.4523654694124668
  (1, 7295)	0.22273744980777857
  (1, 63)	0.4839674844513082
  (1, 9576)	0.37192632903516437
  (1, 11549)	0.3052274758878403
  (1, 2497)	0.40127452726520113
  (1, 8008)	0.1455995617215469
  (2, 2007)	0.20755913302825002
  (2, 6865)	0.3033583083739351
  (2, 10778)	0.30023405176955836
  (2, 4173)	0.27630313096162046
  (2, 9300)	0.4161763690840035
  (2, 3736)	0.219193824199725
  (2, 8507)	0.2541514424169703
  :	:
  (31425, 762)	0.3301791460407234
  (31425, 7763)	0.3956545669412182
  (31425, 6982)	0.310111405095453
  (31425, 10899)	0.13443130041146772
  (31425, 8008)	0.1499867351415507
  (31426, 6884)	0.39515019904419

##### Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [35]:
pred_lr=LR.predict(xv_test)

In [36]:
LR.score(xv_test, y_test)

1.0

In [38]:
from sklearn.metrics import classification_report

In [39]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



###  Decision Tree Classification

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [42]:
pred_dt = DT.predict(xv_test)

In [43]:
DT.score(xv_test, y_test)

1.0

In [44]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [45]:
y_pred = DT.predict(xv_test)  

In [46]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))                                  #confusion Matrix

[[7029    0]
 [   0 6441]]


In [47]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))        #error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0


###  Naive Bayes using Bernoulli naive bayes

In [48]:
from sklearn.naive_bayes import BernoulliNB

In [49]:
BNB = BernoulliNB()

In [50]:
BNB.fit(xv_train, y_train)

BernoulliNB()

In [51]:
pred_bnb = BNB.predict(xv_test)

In [52]:
BNB.score(xv_test,y_test)

0.9962880475129918

In [54]:
print(classification_report(y_test, pred_bnb))    # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      7029
           1       0.99      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [55]:
print(confusion_matrix(y_test, pred_bnb))                  #confusion Matrix

[[6983   46]
 [   4 6437]]


In [56]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_bnb))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_bnb))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_bnb)))

Mean Absolute Error: 0.003711952487008166
Mean Squared Error: 0.003711952487008166
Root Mean Squared Error: 0.060925794923071507


###  K nearest neighbor

In [57]:
from sklearn.neighbors import KNeighborsClassifier

In [58]:
ng = KNeighborsClassifier()
ng.fit(xv_test, y_test)

KNeighborsClassifier()

In [60]:
pred_knn = ng.predict(xv_test)

In [61]:
ng.score(xv_test,y_test)

0.9468448403860431

In [62]:
print(classification_report(y_test, pred_knn))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      7029
           1       0.93      0.97      0.95      6441

    accuracy                           0.95     13470
   macro avg       0.95      0.95      0.95     13470
weighted avg       0.95      0.95      0.95     13470



In [63]:
print(confusion_matrix(y_test, pred_knn))                  #confusion Matrix

[[6533  496]
 [ 220 6221]]


In [64]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_knn))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_knn))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_knn)))

Mean Absolute Error: 0.05315515961395694
Mean Squared Error: 0.05315515961395694
Root Mean Squared Error: 0.2305540275379221


### Support Vector Machine

In [66]:
from sklearn.svm import LinearSVC

In [67]:
ls = LinearSVC()

In [68]:
ls.fit(xv_test, y_test)

LinearSVC()

In [69]:
pred_ls = ls.predict(xv_test)

In [70]:
ls.score(xv_test,y_test)

1.0

In [71]:
print(classification_report(y_test, pred_ls))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [72]:
print(confusion_matrix(y_test, pred_ls))                  #confusion Matrix

[[7029    0]
 [   0 6441]]


In [73]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ls))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ls))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ls)))

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0


### ANN (Artificial Neural Network)

In [74]:
from sklearn.neural_network import MLPClassifier

In [75]:
ann = MLPClassifier()

In [76]:
ann.fit(xv_test, y_test)

MLPClassifier()

In [77]:
pred_ann = ann.predict(xv_test)

In [78]:
ann.score(xv_test,y_test)

1.0

In [79]:
print(classification_report(y_test, pred_ann))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7029
           1       1.00      1.00      1.00      6441

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [80]:
print(confusion_matrix(y_test, pred_ann))                  #confusion Matrix

[[7029    0]
 [   0 6441]]


In [81]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ann))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ann))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ann)))

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0
