In [1]:
# Import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# nltk for text cleaning
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# wordcloud creation libraries
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
import joblib

# Postgres database connection
from sqlalchemy import create_engine
from config import username, password, db_name
# Build the Postgres URL from the credentials exposed by the config module,
# then open the single connection the notebook's queries run on
db_url = f'postgresql://{username}:{password}@localhost:5432/{db_name}'
engine = create_engine(db_url)
connection = engine.connect()

In [2]:
# Pull the whole `news` table into a DataFrame and preview the first rows
news_query = "SELECT * FROM news"
dataset = pd.read_sql(news_query, connection)
dataset.head()

Unnamed: 0,text,label
0,founding father wanted separation church state...,1
1,wow bravo tomi really nailed time enjoy notion...,1
2,karma bitch way around la seem like hillary cl...,1
3,washington reuters democratic presidential can...,0
4,may president trump biggest fan primary season...,1


In [3]:
# Restrict the experiment to the first 35,000 rows of the table
sample = dataset.iloc[:35000]
X = sample.iloc[:, 0]  # first column: the article text
y = sample.iloc[:, 1]  # second column: the label

# TF-IDF over unigrams and bigrams, with the vocabulary capped at 50,000 terms.
# lowercase=False skips re-lowercasing (the stored text looks already
# lower-cased -- TODO confirm against the cleaning step that filled the table).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    lowercase=False,
)

In [4]:
# Preview the first few feature values (the article text that will be vectorized)
X.head()

0    founding father wanted separation church state...
1    wow bravo tomi really nailed time enjoy notion...
2    karma bitch way around la seem like hillary cl...
3    washington reuters democratic presidential can...
4    may president trump biggest fan primary season...
Name: text, dtype: object

In [5]:
# Preview the first few labels: 0 = real (legit) news, 1 = fake news
y.head()

0    1
1    1
2    1
3    0
4    1
Name: label, dtype: int64

In [6]:
# Hold out 20% of the sample for testing; the fixed seed keeps the split reproducible
split = train_test_split(X, y, test_size=0.2, random_state=0)
train_data, test_data, train_label, test_label = split

# Show how many articles landed in each split
train_data.shape, test_data.shape

((28000,), (7000,))

In [7]:
# Learn the TF-IDF vocabulary on the training split only, then densify the result
train_tfidf = vectorizer.fit_transform(train_data)
vector_train = train_tfidf.toarray()

# Re-use the fitted vocabulary for the test split (no re-fitting, so no leakage)
test_tfidf = vectorizer.transform(test_data)
vector_test = test_tfidf.toarray()

# NOTE(review): the dense arrays can reach rows x 50,000 float64 values, which is
# many gigabytes for this sample size. MultinomialNB also accepts the sparse
# matrices directly -- consider dropping .toarray() together with the DataFrame
# wrapping that follows.

In [8]:
# Create the dataframes for the test & train data, labelling each column with
# the n-gram it represents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and only fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Look the names up once and share them so both frames have identical columns
training_data = pd.DataFrame(vector_train, columns=feature_names)
testing_data = pd.DataFrame(vector_test, columns=feature_names)

In [9]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features
# (fit() returns the fitted estimator, so construction and training can be chained)
clf = MultinomialNB().fit(training_data, train_label)

# Predict labels for the held-out test features
y_pred = clf.predict(testing_data)

In [10]:
# Count how many test articles the model assigned to each class (0 / 1)
pd.Series(y_pred).value_counts()

1    3590
0    3410
dtype: int64

In [11]:
# Count the true labels in the test split, for comparison with the predicted counts
test_label.value_counts()

1    3547
0    3453
Name: label, dtype: int64

In [12]:
# Classification report for the test data: per-class precision, recall and
# f1-score of the predictions against the true test labels
print(classification_report(test_label , y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      3453
           1       0.95      0.96      0.96      3547

    accuracy                           0.95      7000
   macro avg       0.95      0.95      0.95      7000
weighted avg       0.95      0.95      0.95      7000



In [13]:
# Score the model on the data it was trained on, so the result can be
# compared with the test report to spot over-fitting
y_pred_train = clf.predict(training_data)
train_report = classification_report(train_label, y_pred_train)
print(train_report)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     13510
           1       0.96      0.96      0.96     14490

    accuracy                           0.96     28000
   macro avg       0.96      0.96      0.96     28000
weighted avg       0.96      0.96      0.96     28000



In [14]:
# Accuracy on the training data: fraction of training articles whose
# predicted label matches the true label
accuracy_score(train_label , y_pred_train)

0.9594285714285714

In [15]:
# Accuracy on the held-out test data: fraction of test articles whose
# predicted label matches the true label
accuracy_score(test_label , y_pred)

0.9547142857142857

In [16]:
# Save the trained classifier
joblib.dump(clf, 'multinomial-NB-model.pkl')

# Save the fitted vectorizer (needed to transform new text at prediction time).
# The context manager guarantees the file handle is flushed and closed, even if
# serialization raises part-way through.
with open("multinomial-NB-pickle.pkl", "wb") as vectorizer_file:
    joblib.dump(vectorizer, vectorizer_file)

### Manually testing the model

In [17]:
# Make sure the NLTK corpora used below are available before they are first read
nltk.download("stopwords")
nltk.download("wordnet")

# Create the values for the text cleaning
ps = WordNetLemmatizer()
# Bound to its own name so the imported `stopwords` corpus reader is not
# shadowed (which would make this cell fail when re-run); a frozenset gives
# constant-time membership checks for every token.
STOP_WORDS = frozenset(stopwords.words("english"))

# Define a function to clean the text
def cleaning_data(row):
    """Normalize one article so it can be fed to the TF-IDF vectorizer.

    Steps: lower-case the text, replace every non-letter character with a
    space, split on whitespace, drop English stop words, lemmatize the
    remaining words and join them back with single spaces.

    Args:
        row (str): raw article text.

    Returns:
        str: the cleaned, space-separated text.
    """
    # Convert the text to lower case
    row = row.lower()

    # Keep letters only: digits and special characters become spaces
    row = re.sub('[^a-zA-Z]', ' ', row)

    # Split the text into tokens
    token = row.split()

    # Lemmatize each word and drop stop words such as a, an, the, is, are ...
    news = [ps.lemmatize(word) for word in token if word not in STOP_WORDS]

    # Finally join all the tokens with a space and return the cleaned text
    return ' '.join(news)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jfrgr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Clean one real-news article taken from the dataset, ready for a single prediction
true_article = "reuters highlight day u president donald trump administration tuesday trump knew week national security adviser michael flynn misled white house contact russia immediately force administration spokesman say defense secretary jim mattis playing impact flynn resignation might mattis debut trip week europe meet nato ally russia ukraine trump said expects russia return crimea ukraine reduce violence ukraine white house spokesman sean spicer say russia prospect better relation washington appear suffered another setback resignation trump national security adviser biggest blow yet consequential legal challenge trump travel ban proceed two track next day congressional democrat fail pressure republican seeking trump tax return saying flynn scandal made imperative find whether president business tie russia office government ethic say white house investigate public endorsement ivanka trump product trump senior adviser kellyanne conway consider disciplinary action four republican senator yet say support trump nominee head labor department andrew puzder creating suspense whether survive initial confirmation hearing week prime minister benjamin netanyahu preparing white house meeting trump wednesday work adviser align israeli u thinking middle east ensure gap remain federal reserve chair janet yellen response warning congressman halt international negotiation early stage trump presidency say u central bank authority consult foreign counterpart benefit united state canadian official trade expert say although trump say want tweak trade tie canada pledge renegotiate nafta focus mexico almost impossible trump meet wednesday chief executive officer eight large retailer including target corp tgt n best buy co inc bby n j c penney company inc jcp n discus tax reform infrastructure improvement source say"
truenews = cleaning_data(true_article)
truenews



In [29]:
# Vectorize the cleaned article with the fitted TF-IDF vocabulary and classify it
article_features = vectorizer.transform([truenews]).toarray()
single_prediction = clf.predict(article_features)

# Report the verdict: label 0 means real news, anything else means fake
print("The model says...")
verdict = "Your article is legit!" if single_prediction == 0 else "Your article is a load of rubbish!"
print(verdict)

The model says...
Your article is legit!


In [30]:
# Clean one fake-news article taken from the dataset, ready for a single prediction
fake_article = "left went nut immediately donald trump speech detroit economic club today term word salad used political hack cnn usual hit job trump spin spin spin plan great speech lot detail included trump website stephen moore larry kudlow two many bright mind worked economic plan republican candidate key part plan"
fakenews = cleaning_data(fake_article)
fakenews

'left went nut immediately donald trump speech detroit economic club today term word salad used political hack cnn usual hit job trump spin spin spin plan great speech lot detail included trump website stephen moore larry kudlow two many bright mind worked economic plan republican candidate key part plan'

In [31]:
# Vectorize the cleaned article with the fitted TF-IDF vocabulary and classify it
article_features = vectorizer.transform([fakenews]).toarray()
single_prediction = clf.predict(article_features)

# Report the verdict: label 0 means real news, anything else means fake
print("The model says...")
verdict = "Your article is legit!" if single_prediction == 0 else "Your article is a load of rubbish!"
print(verdict)

The model says...
Your article is a load of rubbish!
