In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, confusion_matrix
import re
import string

In [2]:
df_fake = pd.read_csv("fake_or_real_news.csv")
df_true = pd.read_csv("True_1.csv")

In [3]:
df_fake.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
df_true.head(5)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
df_fake["class"]=0
df_true["class"]=1

In [6]:
df_fake.shape,df_true.shape

((6335, 5), (21417, 5))

In [21]:
df_marge = pd.concat([df_fake, df_true],axis=0)
df_marge.head(10)

Unnamed: 0.1,Unnamed: 0,title,text,label,class,subject,date
0,8476.0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0,,
1,10294.0,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0,,
2,3608.0,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0,,
3,10142.0,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0,,
4,875.0,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0,,
5,6903.0,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE,0,,
6,7341.0,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE,0,,
7,95.0,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL,0,,
8,4869.0,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL,0,,
9,2909.0,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL,0,,


In [8]:
df_marge.columns

Index(['Unnamed: 0', 'title', 'text', 'label', 'class', 'subject', 'date'], dtype='object')

In [22]:
df = df_marge.drop(["title", "subject","date","label","Unnamed: 0"], axis = 1)

In [23]:
df.isnull().sum()

text     0
class    0
dtype: int64

In [24]:
df = df.sample(frac = 1)

In [25]:
df.head()

Unnamed: 0,text,class
690,"For Jeb Bush’s campaign, August was a cruel mo...",0
3020,WASHINGTON (Reuters) - The State Department wi...,1
694,"DANANG, Vietnam (Reuters) - U.S. President Don...",1
6088,BERLIN (Reuters) - Germany would take advantag...,1
16713,"BAR ELIAS, Lebanon (Reuters) - A new girls sc...",1


In [26]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [27]:
df.columns

Index(['text', 'class'], dtype='object')

In [28]:
df.head()

Unnamed: 0,text,class
0,"For Jeb Bush’s campaign, August was a cruel mo...",0
1,WASHINGTON (Reuters) - The State Department wi...,1
2,"DANANG, Vietnam (Reuters) - U.S. President Don...",1
3,BERLIN (Reuters) - Germany would take advantag...,1
4,"BAR ELIAS, Lebanon (Reuters) - A new girls sc...",1


In [29]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [30]:
df["text"] = df["text"].apply(wordopt)

In [31]:
x = df["text"]
y = df["class"]

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [37]:
pred_lr=LR.predict(xv_test)

In [38]:
LR.score(xv_test, y_test)

0.9613259668508287

In [39]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91      1900
           1       0.96      0.99      0.98      6426

    accuracy                           0.96      8326
   macro avg       0.96      0.93      0.94      8326
weighted avg       0.96      0.96      0.96      8326



### 1) DECISION TREE

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [42]:
pred_dt = DT.predict(xv_test)

In [43]:
DT.score(xv_test, y_test)

0.9901513331731924

In [44]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1900
           1       0.99      0.99      0.99      6426

    accuracy                           0.99      8326
   macro avg       0.99      0.99      0.99      8326
weighted avg       0.99      0.99      0.99      8326



In [45]:
y_pred = DT.predict(xv_test)

In [46]:
print(confusion_matrix(y_test, y_pred))       #confusion matrix

[[1858   42]
 [  40 6386]]


In [47]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))        #error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.00984866682680759
Mean Squared Error: 0.00984866682680759
Root Mean Squared Error: 0.09924044954960447


### 2) NAIVE BAYES USING BERNOULLI

In [48]:
from sklearn.naive_bayes import BernoulliNB

In [49]:
BNB = BernoulliNB()

In [50]:
BNB.fit(xv_train, y_train)

BernoulliNB()

In [51]:
pred_bnb = BNB.predict(xv_test)

In [52]:
BNB.score(xv_test,y_test)

0.8477059812635119

In [53]:
print(classification_report(y_test, pred_bnb))     #Precision, Recall, F-Measure, Accuracy

              precision    recall  f1-score   support

           0       0.68      0.64      0.66      1900
           1       0.89      0.91      0.90      6426

    accuracy                           0.85      8326
   macro avg       0.79      0.77      0.78      8326
weighted avg       0.84      0.85      0.85      8326



In [54]:
print(confusion_matrix(y_test, pred_bnb))  #Confusion Matrix

[[1208  692]
 [ 576 5850]]


In [55]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_bnb))        #error
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_bnb))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_bnb)))

Mean Absolute Error: 0.1522940187364881
Mean Squared Error: 0.1522940187364881
Root Mean Squared Error: 0.39024866269660435


### 3) K NEAREST NEIGHBOR

In [56]:
from sklearn.neighbors import KNeighborsClassifier

In [57]:
ng = KNeighborsClassifier()
ng.fit(xv_test, y_test)

KNeighborsClassifier()

In [58]:
pred_knn = ng.predict(xv_test)

In [59]:
ng.score(xv_test,y_test)

0.5026423252462167

In [60]:
print(classification_report(y_test, pred_knn))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       0.31      0.98      0.47      1900
           1       0.98      0.36      0.53      6426

    accuracy                           0.50      8326
   macro avg       0.65      0.67      0.50      8326
weighted avg       0.83      0.50      0.52      8326



In [61]:
print(confusion_matrix(y_test, pred_knn))                  #confusion Matrix

[[1862   38]
 [4103 2323]]


In [62]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_knn))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_knn))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_knn)))

Mean Absolute Error: 0.49735767475378334
Mean Squared Error: 0.49735767475378334
Root Mean Squared Error: 0.7052359000744243


### 4) SUPPORT VECTOR MACHINE

In [63]:
from sklearn.svm import LinearSVC

In [64]:
ls = LinearSVC()

In [65]:
ls.fit(xv_test, y_test)

LinearSVC()

In [66]:
pred_ls = ls.predict(xv_test)

In [67]:
ls.score(xv_test,y_test)

0.9992793658419409

In [68]:
print(classification_report(y_test, pred_ls))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1900
           1       1.00      1.00      1.00      6426

    accuracy                           1.00      8326
   macro avg       1.00      1.00      1.00      8326
weighted avg       1.00      1.00      1.00      8326



In [69]:
print(confusion_matrix(y_test, pred_ls))                  #confusion Matrix

[[1897    3]
 [   3 6423]]


In [70]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ls))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ls))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ls)))

Mean Absolute Error: 0.000720634158059092
Mean Squared Error: 0.000720634158059092
Root Mean Squared Error: 0.026844629966887083


### 5) ARTIFICIAL NEURAL NETWORK

In [71]:
from sklearn.neural_network import MLPClassifier

In [72]:
ann = MLPClassifier()

In [73]:
ann.fit(xv_test, y_test)

MLPClassifier()

In [74]:
pred_ann = ann.predict(xv_test)

In [75]:
ann.score(xv_test,y_test)

0.9998798943069902

In [76]:
print(classification_report(y_test, pred_ann))      # Precision , Recall , F-measure , Accuracy

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1900
           1       1.00      1.00      1.00      6426

    accuracy                           1.00      8326
   macro avg       1.00      1.00      1.00      8326
weighted avg       1.00      1.00      1.00      8326



In [77]:
print(confusion_matrix(y_test, pred_ann))                  #confusion Matrix

[[1900    0]
 [   1 6425]]


In [78]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred_ann))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, pred_ann))                 #error
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_ann)))

Mean Absolute Error: 0.00012010569300984866
Mean Squared Error: 0.00012010569300984866
Root Mean Squared Error: 0.01095927429211664
