In [2]:
# Importing the libraries
import pandas as pd
import numpy as np
import itertools
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import re

In [3]:
# Load the news dataset from a CSV file into a DataFrame.
# Later cells read its 'text' and 'label' columns.

news_path = 'news.csv'
df = pd.read_csv(news_path)

In [6]:
# Encode the string labels as integers (FAKE -> 0, REAL -> 1).
#
# Series.map turns every value that is missing from the mapping into NaN, so
# the cell is guarded in two ways:
#   * it is skipped when the column already holds only the encoded values,
#     which keeps a re-run of this cell from wiping the labels out;
#   * it fails loudly on unexpected labels instead of silently passing NaN
#     on to the model.

label_codes = {"FAKE": 0, "REAL": 1}

if not df["label"].isin(list(label_codes.values())).all():
    encoded_labels = df["label"].map(label_codes)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, "label"].unique().tolist()
        raise ValueError(
            "Unexpected values in 'label' column: {}".format(unexpected)
        )
    df["label"] = encoded_labels

In [8]:
# Separate the model input (article text) from the target (encoded label).

X, y = df['text'], df['label']

In [9]:
# Matches any single character that is not an ASCII letter.
_non_letter = re.compile(r"[^a-zA-Z]")


def lower_case(x):
    """Lower-case ``x`` and replace every non-letter character with a space."""
    return _non_letter.sub(" ", x.lower())


# Clean every article; this runs before the train/test split below.
X = X.map(lower_case)

In [10]:
# Hold out 20% of the articles for evaluation.
# random_state pins the shuffle so the split -- and every metric computed
# from it -- is the same after Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
# Build a pipeline that converts the text into TF-IDF features and then
# trains a Passive-Aggressive classifier on them.
#   * stop_words='english' drops scikit-learn's built-in English stop words;
#   * max_df=0.7 ignores terms found in more than 70% of the documents;
#   * random_state fixes the classifier's shuffling of the training data so
#     that training is repeatable.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('model', PassiveAggressiveClassifier(random_state=42)),
])

In [12]:
# Fit the vectoriser and the classifier on the training split.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.7, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 PassiveAggressiveClassifier(C=1.0, average=False,
                                             class_weight=None,
         

In [13]:
# Predict labels for the held-out test articles.

predictions = pipeline.predict(X=X_test)

In [14]:
# Evaluate on the test split: the classification report first, then the
# confusion matrix.
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(report, matrix, sep="\n")

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       677
           1       0.94      0.95      0.94       590

    accuracy                           0.95      1267
   macro avg       0.95      0.95      0.95      1267
weighted avg       0.95      0.95      0.95      1267

[[643  34]
 [ 32 558]]


In [71]:
# Sum of the training labels divided by their count; with 0/1 labels this is
# the share of training articles labelled 1 (REAL).
y_train.sum() / y_train.shape[0]

0.5458786936236392

In [15]:
# Persist the fitted pipeline to disk with the highest pickle protocol.
# NOTE(review): the regex clean-up applied to X before the split is not a
# pipeline step, so it is not stored in this file -- callers presumably need
# to clean their text the same way before calling predict; confirm.

with open('model.pickle', 'wb') as model_file:
    pickle.dump(pipeline, model_file, pickle.HIGHEST_PROTOCOL)