In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
fake_news=pd.read_csv("fake-harsh.csv")
true_news=pd.read_csv("True-harsh.csv")

In [5]:
fake_news['label'] = 0
true_news['label'] = 1

In [6]:
print(fake_news.shape,true_news.shape)

(23481, 5) (21417, 5)


In [7]:
news_dataset = pd.concat([fake_news, true_news], axis =0 )
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [8]:
news_dataset = news_dataset.sample(len(news_dataset))
news_dataset.head()

Unnamed: 0,title,text,subject,date,label
8005,Cosmetic Company Posts Pic Of Black Woman’s L...,Racism is still very prevalent in America. And...,News,"February 18, 2016",0
12450,BOOM! MATH SHOWS Trump Would Have Beaten Obama...,It s easy to glance at Tuesday s popular vote ...,politics,"Nov 10, 2016",0
15284,“DEAL WITH IT”: YOU’LL LOVE WHAT TRUMP CHOSE T...,Trump s not backing down from his in your fac...,politics,"Aug 23, 2015",0
18162,Tokyo Governor Koike: No need for big change i...,"TOKYO (Reuters) - Tokyo Governor Yuriko Koike,...",worldnews,"October 6, 2017",1
9600,GUESS WHO’S TRYING To Tie Amnesty For Nearly 8...,"Hey, Paul do you hear that huge sucking sound?...",politics,"Oct 25, 2017",0


In [9]:
news_dataset.shape

(44898, 5)

In [10]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [11]:
news_dataset.subject.value_counts()

politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: subject, dtype: int64

In [12]:
news_dataset.label.value_counts()

0    23481
1    21417
Name: label, dtype: int64

In [13]:
# merging the subject name and news title
news_dataset['content'] = news_dataset['subject']+' '+news_dataset['title']

In [14]:
print(news_dataset['content'])

8005     News  Cosmetic Company Posts Pic Of Black Woma...
12450    politics BOOM! MATH SHOWS Trump Would Have Bea...
15284    politics “DEAL WITH IT”: YOU’LL LOVE WHAT TRUM...
18162    worldnews Tokyo Governor Koike: No need for bi...
9600     politics GUESS WHO’S TRYING To Tie Amnesty For...
                               ...                        
7768     News  Megyn Kelly TORCHES Mike Huckabee For De...
2803     News  Trump Regrets On Twitter Is Everything Y...
19167    worldnews Trump aide Greenblatt returning to I...
4864     politicsNews Exclusive: Delrahim to head Justi...
6730     politicsNews Microsoft co-founder Gates, Trump...
Name: content, Length: 44898, dtype: object


### Stemming:

#### Stemming is the process of reducing a word to its Root word

#### example:
#### actor, actress, acting --> act

In [15]:
port_stem = PorterStemmer()

In [16]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [17]:
news_dataset['content'] = news_dataset['content'].apply(stemming)
print(news_dataset['content'])

8005     news cosmet compani post pic black woman lip r...
12450    polit boom math show trump would beaten obama ...
15284    polit deal love trump chose symbol love americ...
18162    worldnew tokyo governor koik need big chang mo...
9600     polit guess tri tie amnesti nearli illeg end y...
                               ...                        
7768     news megyn kelli torch mike huckabe defend tru...
2803          news trump regret twitter everyth hope would
19167    worldnew trump aid greenblatt return israel pe...
4864     politicsnew exclus delrahim head justic depart...
6730     politicsnew microsoft co founder gate trump di...
Name: content, Length: 44898, dtype: object


In [18]:
#separating the data and label
x = news_dataset['content'].values
y = news_dataset['label'].values

In [19]:
print(x)

['news cosmet compani post pic black woman lip racist troll explod imag'
 'polit boom math show trump would beaten obama romney obama elect'
 'polit deal love trump chose symbol love america time magazin interview video'
 ... 'worldnew trump aid greenblatt return israel peac talk offici'
 'politicsnew exclus delrahim head justic depart antitrust unit sourc'
 'politicsnew microsoft co founder gate trump discuss innov']


#### At the end we have only one feature to train our model which is "content".
#### Now we will be converting the values of content feature from textual data to numerical data.

In [20]:
print(y)

[0 0 0 ... 1 1 1]


In [21]:
y.shape

(44898,)

In [22]:
 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [23]:
x_train

array(['worldnew kenya opposit leader support boycott poll hold prayer',
       'politicsnew obama cuba histor visit',
       'worldnew uk reaffirm commit iran nuclear deal call trump may offic',
       ...,
       'middl east fals flag florida fbi agent pose terrorist miami sting oper',
       'polit obama bounc ukip leader claim obama insult threat uk voter backfir actual drove voter support leav eu movement',
       'us news video us elect voter fraud emerg'], dtype=object)

In [24]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

#### So by using TfidfVectorizer, we have weighted the word counts by a measure of how often they appear in our feature and will be training our model on this numerical data.

In [25]:
print(xv_train)

  (0, 8122)	0.4323284876288718
  (0, 4971)	0.34740248278925384
  (0, 8032)	0.30813547976156436
  (0, 1256)	0.3988771083279019
  (0, 10322)	0.25651338170727317
  (0, 5980)	0.27895340898433185
  (0, 7419)	0.33489789979813217
  (0, 5728)	0.40433322287762075
  (0, 11820)	0.1381922375164093
  (1, 11488)	0.46959048438629636
  (1, 4948)	0.6043194138851937
  (1, 2495)	0.5360966058352427
  (1, 7292)	0.2982879027593892
  (1, 8029)	0.19469868837420823
  (2, 7350)	0.30896319437983466
  (2, 6517)	0.2731905396447903
  (2, 10915)	0.11517806573845224
  (2, 1532)	0.23561708612365767
  (2, 2631)	0.26966337383652367
  (2, 7250)	0.3064618131894948
  (2, 5441)	0.2802116318387114
  (2, 2110)	0.35882629572080577
  (2, 8528)	0.5240450119261368
  (2, 11037)	0.3076018558719207
  (2, 11820)	0.13406398455703608
  :	:
  (31425, 6684)	0.2071615665278077
  (31426, 3183)	0.31202317111248984
  (31426, 1242)	0.32058548644324963
  (31426, 704)	0.25644058742621373
  (31426, 11038)	0.316036054490506
  (31426, 6940)	0.2603

##### Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [28]:
pred_lr=LR.predict(xv_test)

In [29]:
LR.score(xv_test, y_test)

1.0

In [30]:
from sklearn.metrics import classification_report

In [31]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7023
           1       1.00      1.00      1.00      6447

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



###  Decision Tree Classification

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [34]:
pred_dt = DT.predict(xv_test)

In [35]:
DT.score(xv_test, y_test)

1.0

In [36]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7023
           1       1.00      1.00      1.00      6447

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [37]:
y_pred = DT.predict(xv_test)  

In [38]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))                                  #confusion Matrix

[[7023    0]
 [   0 6447]]


In [39]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))        #error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0


###  Naive Bayes using Bernoulli naive bayes

In [40]:
from sklearn.naive_bayes import BernoulliNB

In [41]:
BNB = BernoulliNB()

In [42]:
BNB.fit(xv_train, y_train)

BernoulliNB()

In [43]:
pred_bnb = BNB.predict(xv_test)

In [44]:
BNB.score(xv_test,y_test)

0.9971789161098737

In [45]:
print(classification_report(y_test, pred_bnb))    # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      7023
           1       0.99      1.00      1.00      6447

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [46]:
print(confusion_matrix(y_test, pred_bnb))                  #confusion Matrix

[[6986   37]
 [   1 6446]]


In [47]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_bnb))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_bnb))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_bnb)))

Mean Absolute Error: 0.0028210838901262065
Mean Squared Error: 0.0028210838901262065
Root Mean Squared Error: 0.053113876624910426


###  K nearest neighbor

In [48]:
from sklearn.neighbors import KNeighborsClassifier

In [49]:
ng = KNeighborsClassifier()
ng.fit(xv_test, y_test)

KNeighborsClassifier()

In [50]:
pred_knn = ng.predict(xv_test)

In [51]:
ng.score(xv_test,y_test)

0.9461766889383816

In [52]:
print(classification_report(y_test, pred_knn))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       0.96      0.93      0.95      7023
           1       0.93      0.96      0.94      6447

    accuracy                           0.95     13470
   macro avg       0.95      0.95      0.95     13470
weighted avg       0.95      0.95      0.95     13470



In [53]:
print(confusion_matrix(y_test, pred_knn))                  #confusion Matrix

[[6550  473]
 [ 252 6195]]


In [54]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_knn))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_knn))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_knn)))

Mean Absolute Error: 0.05382331106161841
Mean Squared Error: 0.05382331106161841
Root Mean Squared Error: 0.23199851521425394


### Support Vector Machine

In [55]:
from sklearn.svm import LinearSVC

In [56]:
ls = LinearSVC()

In [57]:
ls.fit(xv_test, y_test)

LinearSVC()

In [58]:
pred_ls = ls.predict(xv_test)

In [59]:
ls.score(xv_test,y_test)

1.0

In [60]:
print(classification_report(y_test, pred_ls))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7023
           1       1.00      1.00      1.00      6447

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [61]:
print(confusion_matrix(y_test, pred_ls))                  #confusion Matrix

[[7023    0]
 [   0 6447]]


In [62]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ls))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ls))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ls)))

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0


### ANN (Artificial Neural Network)

In [63]:
from sklearn.neural_network import MLPClassifier

In [64]:
ann = MLPClassifier()

In [65]:
ann.fit(xv_test, y_test)

MLPClassifier()

In [66]:
pred_ann = ann.predict(xv_test)

In [67]:
ann.score(xv_test,y_test)

1.0

In [68]:
print(classification_report(y_test, pred_ann))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7023
           1       1.00      1.00      1.00      6447

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



In [69]:
print(confusion_matrix(y_test, pred_ann))                  #confusion Matrix

[[7023    0]
 [   0 6447]]


In [70]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ann))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ann))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ann)))

Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0
