In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
fake_news=pd.read_csv("fake_or_real_news.csv")
true_news=pd.read_csv("True_1.csv")

In [5]:
fake_news['label'] = 0
true_news['label'] = 1

In [6]:
news_dataset = pd.concat([fake_news, true_news], axis =0 )
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,subject,date
0,8476.0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,,
1,10294.0,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,,
2,3608.0,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0,,
3,10142.0,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,,
4,875.0,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0,,


In [7]:
news_dataset = news_dataset.sample(len(news_dataset))
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,subject,date
489,,Senate Republicans shove tax bill ahead as Dem...,WASHINGTON (Reuters) - U.S. Senate Republicans...,1,politicsNews,"November 28, 2017"
12591,,Philippine senator criticizes 'reckless disreg...,MANILA (Reuters) - Philippine public officials...,1,worldnews,"December 11, 2017"
14994,,Southeast Asia summit draft statement skips ov...,MANILA (Reuters) - A draft of the statement to...,1,worldnews,"November 13, 2017"
5817,1253.0,Inside Bernie Sanders’s quest to win over Neva...,"EAST LAS VEGAS — ""Nevada is especially importa...",0,,
11289,,"U.S. sanctions North Korean missile experts, R...",WASHINGTON/MOSCOW (Reuters) - The United State...,1,worldnews,"December 26, 2017"


In [8]:
news_dataset.shape

(27752, 6)

In [9]:
news_dataset.isnull().sum()

Unnamed: 0    21417
title             0
text              0
label             0
subject        6335
date           6335
dtype: int64

In [10]:
print(news_dataset['title'])

489      Senate Republicans shove tax bill ahead as Dem...
12591    Philippine senator criticizes 'reckless disreg...
14994    Southeast Asia summit draft statement skips ov...
5817     Inside Bernie Sanders’s quest to win over Neva...
11289    U.S. sanctions North Korean missile experts, R...
                               ...                        
8430     Hackers targeted Trump campaign, Republican Pa...
2259                Hillary Clinton has a deep commitment.
3965      Trump questions why U.S. Civil War had to happen
21198    Jails, justice system at breaking point as Phi...
16282    U.S. backs Spanish efforts to block break-away...
Name: title, Length: 27752, dtype: object


In [11]:
port_stem = PorterStemmer()

In [12]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [13]:
news_dataset['title'] = news_dataset['title'].apply(stemming)
print(news_dataset['title'])

489      senat republican shove tax bill ahead democrat...
12591    philippin senat critic reckless disregard deng...
14994    southeast asia summit draft statement skip roh...
5817            insid berni sander quest win nevada latino
11289    u sanction north korean missil expert russia o...
                               ...                        
8430     hacker target trump campaign republican parti ...
2259                           hillari clinton deep commit
3965                     trump question u civil war happen
21198    jail justic system break point philippin drug ...
16282     u back spanish effort block break away catalonia
Name: title, Length: 27752, dtype: object


In [14]:
x = news_dataset['title'].values
y = news_dataset['label'].values

In [15]:
print(x)

['senat republican shove tax bill ahead democrat fume'
 'philippin senat critic reckless disregard dengu vaccin program'
 'southeast asia summit draft statement skip rohingya crisi' ...
 'trump question u civil war happen'
 'jail justic system break point philippin drug war intensifi'
 'u back spanish effort block break away catalonia']


In [16]:
print(y)

[1 1 1 ... 1 1 1]


In [17]:
y.shape

(27752,)

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [19]:
x_train

array(['burundi lose bid stop u n atroc investig',
       'man knife shot amsterdam airport suspect known offend',
       'trump say send feder help fight chicago crime', ...,
       'australian high court sit resolv lawmak citizenship crisi',
       'u consum watchdog agenc offici sue block trump pick',
       'senat committe subpoena former trump advis flynn russia'],
      dtype=object)

In [20]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [21]:
print(xv_train)

  (0, 4399)	0.3296635537972154
  (0, 518)	0.5146326337524102
  (0, 8296)	0.3468123253788847
  (0, 816)	0.3341687872737205
  (0, 5055)	0.3722828696589251
  (0, 1177)	0.5058983011541606
  (1, 5994)	0.39392038334653084
  (1, 4738)	0.3808604212154715
  (1, 8462)	0.2713199993972599
  (1, 186)	0.31699047335195346
  (1, 290)	0.4437943110627838
  (1, 7837)	0.3117465110897118
  (1, 4733)	0.3848104945682223
  (1, 5180)	0.28888782326143964
  (2, 1953)	0.4118877671441987
  (2, 1464)	0.4471769013684652
  (2, 3160)	0.33289907234298244
  (2, 3939)	0.3523319167151712
  (2, 3097)	0.40552153974416844
  (2, 7705)	0.4158461447303466
  (2, 7567)	0.18903435354623366
  (2, 8924)	0.1493719925295352
  (3, 8650)	0.38368505869339287
  (3, 7053)	0.4474105601896885
  (3, 5295)	0.26280084241286905
  :	:
  (19422, 7567)	0.1400173217190502
  (19423, 7920)	0.4368814638122448
  (19423, 7227)	0.3947108663597587
  (19423, 1530)	0.42658159285908837
  (19423, 543)	0.36429517395789285
  (19423, 3963)	0.3393617370891709
  (1

###  Decision Tree Classification

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [24]:
pred_dt = DT.predict(xv_test)

In [25]:
DT.score(xv_test, y_test)

0.8565938025462407

In [26]:
from sklearn.metrics import classification_report

In [27]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.70      0.64      0.67      1878
           1       0.90      0.92      0.91      6448

    accuracy                           0.86      8326
   macro avg       0.80      0.78      0.79      8326
weighted avg       0.85      0.86      0.85      8326



In [28]:
y_pred = DT.predict(xv_test)

In [29]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred)) 

[[1204  674]
 [ 520 5928]]


In [30]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))        #error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.14340619745375932
Mean Squared Error: 0.14340619745375932
Root Mean Squared Error: 0.37869010741470305


###  Naive Bayes using Bernoulli naive bayes

In [31]:
from sklearn.naive_bayes import BernoulliNB

In [32]:
BNB = BernoulliNB()

In [33]:
BNB.fit(xv_train, y_train)

BernoulliNB()

In [34]:
pred_bnb = BNB.predict(xv_test)
BNB.score(xv_test,y_test)
print(classification_report(y_test, pred_bnb)) 

              precision    recall  f1-score   support

           0       0.80      0.67      0.73      1878
           1       0.91      0.95      0.93      6448

    accuracy                           0.89      8326
   macro avg       0.86      0.81      0.83      8326
weighted avg       0.88      0.89      0.88      8326



In [35]:
print(confusion_matrix(y_test, pred_bnb)) 

[[1251  627]
 [ 306 6142]]


In [36]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_bnb))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_bnb))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_bnb)))

Mean Absolute Error: 0.1120586115781888
Mean Squared Error: 0.1120586115781888
Root Mean Squared Error: 0.3347515669540455


###  K nearest neighbor

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
ng = KNeighborsClassifier()
ng.fit(xv_test, y_test)

KNeighborsClassifier()

In [39]:
pred_knn = ng.predict(xv_test)

In [40]:
ng.score(xv_test,y_test)

0.8251261109776603

In [41]:
print(classification_report(y_test, pred_knn))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       0.56      1.00      0.72      1878
           1       1.00      0.78      0.87      6448

    accuracy                           0.83      8326
   macro avg       0.78      0.89      0.80      8326
weighted avg       0.90      0.83      0.84      8326



In [42]:
print(confusion_matrix(y_test, pred_knn))                  #confusion Matrix

[[1870    8]
 [1448 5000]]


In [43]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_knn))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_knn))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_knn)))

Mean Absolute Error: 0.17487388902233966
Mean Squared Error: 0.17487388902233966
Root Mean Squared Error: 0.41817925465324035


### Support Vector Machine

In [44]:
from sklearn.svm import LinearSVC

In [45]:
ls = LinearSVC()

In [46]:
ls.fit(xv_test, y_test)

LinearSVC()

In [47]:
pred_ls = ls.predict(xv_test)

In [48]:
ls.score(xv_test,y_test)

0.9789815037232765

In [49]:
print(classification_report(y_test, pred_ls))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1878
           1       0.98      1.00      0.99      6448

    accuracy                           0.98      8326
   macro avg       0.98      0.96      0.97      8326
weighted avg       0.98      0.98      0.98      8326



In [50]:
print(confusion_matrix(y_test, pred_ls))                  #confusion Matrix

[[1728  150]
 [  25 6423]]


In [51]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ls))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ls))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ls)))

Mean Absolute Error: 0.021018496276723517
Mean Squared Error: 0.021018496276723517
Root Mean Squared Error: 0.14497757163342032


### ANN (Artificial Neural Network)

In [52]:
from sklearn.neural_network import MLPClassifier

In [53]:
ann = MLPClassifier()

In [54]:
ann.fit(xv_test, y_test)

MLPClassifier()

In [55]:
pred_ann = ann.predict(xv_test)

In [56]:
ann.score(xv_test,y_test)

0.9998798943069902

In [57]:
print(classification_report(y_test, pred_ann))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1878
           1       1.00      1.00      1.00      6448

    accuracy                           1.00      8326
   macro avg       1.00      1.00      1.00      8326
weighted avg       1.00      1.00      1.00      8326



In [58]:
print(confusion_matrix(y_test, pred_ann))                  #confusion Matrix

[[1877    1]
 [   0 6448]]


In [59]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ann))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ann))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ann)))

Mean Absolute Error: 0.00012010569300984866
Mean Squared Error: 0.00012010569300984866
Root Mean Squared Error: 0.01095927429211664
