In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib
import pickle
import matplotlib.pyplot as plt

import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Benson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the news CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='datasets/true_fake_data.csv')

In [3]:
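# Column pruning is left disabled: the null-count output further down lists only `text` and `target`.
# df = df.drop(["title", "date","label","combined"], axis = 1)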

In [4]:
# Shuffle all rows. The seed fixes the row order, so the later train/test
# split and the reported metrics can be reproduced on a fresh kernel.
df = df.sample(frac=1, random_state=42)

In [5]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# materialising it as an "index" column that then has to be removed.
df = df.reset_index(drop=True)

In [6]:
# Missing values per column (summed down the rows).
df.isnull().sum(axis=0)

text      2
target    0
dtype: int64

In [7]:
# Discard rows without article text. Rebinding (rather than inplace=True)
# keeps the cell a plain "input -> output" step that is safe to re-run.
df = df.dropna(subset=['text'])

In [8]:
def preprocess(text):
    """Normalise one article into lowercase word tokens separated by spaces.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop
    ``<...>`` tags; turn every non-word character into a space; strip
    remaining punctuation; drop tokens that contain a digit.

    URL and tag removal must run before the non-word replacement: once
    ``:``, ``/``, ``.``, ``<`` and ``>`` have been turned into spaces those
    patterns can no longer match and the fragments stay in the text.

    Args:
        text: Raw article text (``str``).

    Returns:
        The cleaned text. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are non-word characters too, so they become spaces here.
    text = re.sub(r'\W', ' ', text)
    # Only "_" can survive the step above, because it counts as a word character.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [9]:
# Clean every article with preprocess(), one document at a time.
df["text"] = [preprocess(document) for document in df["text"]]

In [10]:
# Drop English stop words. A set gives constant-time membership tests;
# the original scanned the `stop` list once for every token.
stop_words = set(stop)
df['text'] = df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

In [11]:
# Model input is the cleaned text; the label to predict is `target`.
x, y = df["text"], df["target"]

In [12]:
# Hold out 25% of the rows for evaluation; the seed makes the split
# deterministic for a given row order.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Token counts are re-weighted to TF-IDF before reaching the classifier.
pipe = Pipeline(steps=[
    ('count_vectorization', CountVectorizer()),
    ('tfidf_vectorization', TfidfTransformer()),
    ('LR', LogisticRegression()),
])

# Train all three stages on the training split.
LR = pipe.fit(x_train, y_train)

In [14]:
# Predictions of the logistic-regression pipeline on the held-out split.
# The name `pred_dt` is kept because the following report cell reads it.
pred_dt = LR.predict(x_test)

# A bare expression in the middle of a cell is not displayed, so print it.
print("LR accuracy:", LR.score(x_test, y_test))

# Save the fitted pipeline to disk; the context manager closes the file
# handle even if pickling fails. Assumes the models/ directory exists.
model_file = 'models/logistic_regression.sav'
with open(model_file, 'wb') as model_fh:
    pickle.dump(LR, model_fh)

In [15]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=pred_dt))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6199
           1       0.97      0.96      0.97      5583

    accuracy                           0.97     11782
   macro avg       0.97      0.97      0.97     11782
weighted avg       0.97      0.97      0.97     11782



# Decision Tree Classification

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Counts -> TF-IDF -> decision tree. The tree is seeded so that refitting
# on the same training data yields the same model.
pipe = Pipeline([('count_vectorization', CountVectorizer()),
                 ('tfidf_vectorization', TfidfTransformer()),
                 ('DT', DecisionTreeClassifier(random_state=42))])

DT = pipe.fit(x_train, y_train)

In [17]:
# Predictions of the decision-tree pipeline on the held-out split.
pred_dt = DT.predict(x_test)

# A bare expression in the middle of a cell is not displayed, so print it.
print("DT accuracy:", DT.score(x_test, y_test))

# Save the fitted pipeline to disk; the context manager closes the file
# handle even if pickling fails. Assumes the models/ directory exists.
model_file = 'models/decision_tree.sav'
with open(model_file, 'wb') as model_fh:
    pickle.dump(DT, model_fh)

In [18]:
# Per-class precision/recall/F1 for the decision-tree predictions.
print(classification_report(y_true=y_test, y_pred=pred_dt))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      6199
           1       0.94      0.93      0.94      5583

    accuracy                           0.94     11782
   macro avg       0.94      0.94      0.94     11782
weighted avg       0.94      0.94      0.94     11782



# Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Counts -> TF-IDF -> random forest. The forest is seeded so that
# refitting on the same training data yields the same model.
pipe = Pipeline([('count_vectorization', CountVectorizer()),
                 ('tfidf_vectorization', TfidfTransformer()),
                 ('RFC', RandomForestClassifier(random_state=42))])

RFC = pipe.fit(x_train, y_train)

In [20]:
# Predictions of the random-forest pipeline on the held-out split.
pred_rfc = RFC.predict(x_test)

# A bare expression in the middle of a cell is not displayed, so print it.
print("RFC accuracy:", RFC.score(x_test, y_test))

# Save the fitted pipeline to disk; the context manager closes the file
# handle even if pickling fails. Assumes the models/ directory exists.
model_file = 'models/random_forest.sav'
with open(model_file, 'wb') as model_fh:
    pickle.dump(RFC, model_fh)

In [21]:
# Per-class precision/recall/F1 for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=pred_rfc))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      6199
           1       0.97      0.97      0.97      5583

    accuracy                           0.97     11782
   macro avg       0.97      0.97      0.97     11782
weighted avg       0.97      0.97      0.97     11782



# Model Testing

In [22]:
def prediction_label(n):
    """Translate a predicted class value into its display label.

    Args:
        n: Predicted class; 0 and 1 are the recognised values.

    Returns:
        "Fake News" when ``n == 0``, "True News" when ``n == 1``,
        otherwise ``None``.
    """
    # Compared in the same order as an if/elif chain: 0 first, then 1.
    for class_value, label in ((0, "Fake News"), (1, "True News")):
        if n == class_value:
            return label
    return None
    
def detecting_fake_news(news_text):
    """Print each trained model's verdict for one piece of news text.

    The text is cleaned with ``preprocess`` and passed to the fitted ``LR``,
    ``DT`` and ``RFC`` pipelines; each predicted class is converted to a
    label by ``prediction_label``.

    Args:
        news_text: Raw article text as a single string.

    Returns:
        None. The three verdicts are written to stdout.
    """
    # The pipelines take a collection of documents, so wrap the one article.
    # NOTE(review): stop words are not stripped here, unlike in the training
    # cells; presumably harmless because the fitted vocabulary never saw
    # them - confirm.
    cleaned_batch = [preprocess(news_text)]

    verdicts = [
        prediction_label(model.predict(cleaned_batch)[0])
        for model in (LR, DT, RFC)
    ]

    report = "\n\nLR Prediction: {} \nDT Prediction: {} \nRFC Prediction: {} ".format(*verdicts)
    print(report)
    return None

In [24]:
# Read one article from the prompt and classify it. input() already returns
# a str, so no cast is needed. This cell waits for keyboard input and
# therefore pauses a "Run All".
news_text = input()
detecting_fake_news(news_text)

Dwayne Johnson defending a bunch of beleaguered millennials sounds like our kind of movie, but we’ll have to make do with the Instagram video The Rock posted Friday decrying a recent interview published by The Daily Star in which the actor dragged the “snowflake generation.” Griped Johnson in the article, “So many good people fought for freedom and equality — but this generation are looking for a reason to be offended.” The thing is, according to the actor, he actually loves millennials (in addition to everyone else) and that interview is entirely made up.“The interview never took place. Never happened. Never said any of those words,” Johnson said on Instagram, saying he was “baffled” to learn he was reportedly in a heated battle with people born between 1982 and 2004. “Completely untrue. One hundred percent fabricated.” Adds the actor, “I always encourage empathy, I encourage growth but most importantly, I encourage everybody to be exactly who they want to be.” And really, in the end,