In [1]:
# Import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# nltk for text cleaning
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# wordcloud creation libraries
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# ML libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
import joblib

In [2]:
# Read the news articles and their labels into a DataFrame, then preview the first rows
dataset = pd.read_csv(filepath_or_buffer="dataset.csv")
dataset.head(n=5)

Unnamed: 0,text,label
0,reuters highlight u president donald trump adm...,0
1,washington reuters republican political consul...,0
2,moscow reuters russian troop took part war gam...,0
3,st century wire say democratic party establish...,1
4,mitt romney top adviser election accepted reas...,1


In [3]:
# Work on the first 35,000 rows only: column 0 is the article text, column 1 the label
X, y = dataset.iloc[:35000, 0], dataset.iloc[:35000, 1]

# TF-IDF vectorizer that turns the text into numerical features. It keeps at most
# 50,000 unigram/bigram terms and leaves the casing of the input untouched.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    lowercase=False,
)

In [4]:
# Preview the first few feature values (the article texts)
X.head(n=5)

0    reuters highlight u president donald trump adm...
1    washington reuters republican political consul...
2    moscow reuters russian troop took part war gam...
3    st century wire say democratic party establish...
4    mitt romney top adviser election accepted reas...
Name: text, dtype: object

In [5]:
# Preview the first few target values: 0 = real news, 1 = fake news
y.head(n=5)

0    0
1    0
2    0
3    1
4    1
Name: label, dtype: int64

In [6]:
# Hold out 20% of the sample for testing; the fixed seed keeps the split reproducible
train_data, test_data, train_label, test_label = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Show how many articles ended up in each partition
train_data.shape, test_data.shape

((28000,), (7000,))

In [7]:
# Learn the TF-IDF vocabulary on the training articles only and convert them to a
# dense array.
# NOTE(review): .toarray() materialises one column per vocabulary term (up to
# max_features) for every row, which can need a lot of memory - confirm the dense
# form is really required downstream.
vector_train = vectorizer.fit_transform(train_data).toarray()

# Re-use the already fitted vocabulary for the test articles (no re-fitting here)
vector_test = vectorizer.transform(test_data).toarray()

In [8]:
# Look up the vocabulary terms once so both frames share identical column labels.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and only fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Create the dataframes for the train & test data, one column per TF-IDF term
training_data = pd.DataFrame(vector_train, columns=feature_names)
testing_data = pd.DataFrame(vector_test, columns=feature_names)

In [9]:
# Build the Multinomial Naive Bayes classifier and train it on the TF-IDF features
# (fit() returns the fitted estimator itself, so the calls can be chained)
clf = MultinomialNB().fit(training_data, train_label)

# Predict a label for every held-out test article
y_pred = clf.predict(testing_data)

In [10]:
# Count how many test articles the model assigned to each class
pd.Series(data=y_pred).value_counts()

1    3661
0    3339
dtype: int64

In [11]:
# Count how many test articles actually belong to each class, for comparison
# with the predicted class counts
test_label.value_counts()

1    3654
0    3346
Name: label, dtype: int64

In [12]:
# Per-class precision / recall / F1 of the predictions on the held-out test data
test_report = classification_report(y_true=test_label, y_pred=y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      3346
           1       0.96      0.96      0.96      3654

    accuracy                           0.95      7000
   macro avg       0.95      0.95      0.95      7000
weighted avg       0.95      0.95      0.95      7000



In [13]:
# Predict on the data the model was trained on and report the same per-class
# metrics, so they can be compared with the test-data report
y_pred_train = clf.predict(training_data)
train_report = classification_report(y_true=train_label, y_pred=y_pred_train)
print(train_report)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96     13601
           1       0.96      0.96      0.96     14399

    accuracy                           0.96     28000
   macro avg       0.96      0.96      0.96     28000
weighted avg       0.96      0.96      0.96     28000



In [14]:
# Fraction of training articles whose predicted label matches the true label
accuracy_score(y_true=train_label, y_pred=y_pred_train)

0.9585

In [15]:
# Fraction of held-out test articles whose predicted label matches the true label
accuracy_score(y_true=test_label, y_pred=y_pred)

0.9547142857142857

In [21]:
# Save the model
joblib.dump(clf , 'multinomial-NB-model.pkl')

# Save the fitted vectorizer. The file is opened in a `with` block so the handle
# is flushed and closed as soon as the dump finishes instead of being leaked.
with open("multinomial-NB-pickle.pkl", "wb") as vectorizer_file:
    joblib.dump(vectorizer, vectorizer_file)

### Manually testing the model

In [17]:
# Make sure the NLTK corpora used below are available before they are read
# (the download is skipped when the package is already up to date)
nltk.download("stopwords")
nltk.download("wordnet")

# Lemmatizer used to reduce each token to its dictionary form
ps = WordNetLemmatizer()

# Keep the stop words under their own name: rebinding `stopwords` would shadow the
# imported NLTK corpus reader and make this cell fail when it is run a second time.
# A set gives constant-time membership tests inside the cleaning loop.
stop_words = set(stopwords.words("english"))


# Define a function to clean the text
def cleaning_data(row):
    """Normalise one article so it can be passed to the vectorizer.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, English stop words
    are dropped and the remaining tokens are lemmatised.

    Parameters
    ----------
    row : str
        Raw article text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # convert the text to lower case
    lowered = row.lower()

    # keep letters only: digits, punctuation and other symbols become spaces
    letters_only = re.sub('[^a-zA-Z]', ' ', lowered)

    # split on whitespace to get the individual tokens
    tokens = letters_only.split()

    # drop stop words (a, an, the, is, are, ...) and lemmatise what is left
    kept_tokens = [ps.lemmatize(word) for word in tokens if word not in stop_words]

    # finally join all the tokens back together with single spaces
    return ' '.join(kept_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jfrgr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Clean a hand-written sample sentence the same way an article would be cleaned
raw_text = "Have a go at writing an article for yourself, or collecting some text from a news article online!"
news = cleaning_data(raw_text)
news

'go writing article collecting text news article online'

In [19]:
# Vectorise the cleaned article with the fitted TF-IDF vocabulary and classify it
single_prediction = clf.predict(vectorizer.transform([news]).toarray())

print("The model says...")

# predict() returns an array with one label per input row. Read the single label
# explicitly instead of relying on the truth value of an array comparison.
if single_prediction[0] == 0:
    print("Your article is legit!")
else:
    print("Your article is a load of rubbish!")

The model says...
Your article is a load of rubbish!
