# Passive-Aggressive Classifier

We are going to implement a Passive-Aggressive classifier on our dataset.

In [1]:
# This is the code used to preprocess our dataset. 
# Each step is explained in detail in the 'Data Pre-processing' notebook.
import numpy as np
import pandas as pd

df = pd.read_csv('news/news.csv')
df['news'] = df['title'] + ' ' + df['text']
convert_to_binary = {'REAL':1,'FAKE':0}
df['label'] = df['label'].map(convert_to_binary)
df = df.drop([df.columns[0],df.columns[1],df.columns[2]],axis=1)
df = df.reindex(columns=['news','label'])

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

stop_words = stopwords.words('english')
stop_words.extend(['the','it','in'])
WNL = WordNetLemmatizer()

for index, row in df.iterrows():
    filtered_article = ''
    article = row['news']
    article = re.sub(r'[^\w\s]', '', article)
    words = [word.lower() for word in nltk.word_tokenize(article)]
    words = [word for word in words if not word in stop_words]
    words_lemmatized = []
    for word in words:
        if word == 'us':
            words_lemmatized.append(word)
        else:
            words_lemmatized.append(WNL.lemmatize(word))
    filtered_article = " ".join([word for word in words_lemmatized])
    df.loc[index, 'news'] = filtered_article
    
df.head()

Unnamed: 0,news,label
0,smell hillary fear daniel greenfield shillman ...,0
1,watch exact moment paul ryan committed politic...,0
2,kerry go paris gesture sympathy us secretary s...,1
3,bernie supporter twitter erupt anger dnc tried...,0
4,battle new york primary matter primary day new...,1


In [2]:
# Vectorization
df_input = df['news']
df_output = df['label']

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(df_input)
tf_idf_matrix

<6335x80967 sparse matrix of type '<class 'numpy.float64'>'
	with 1762247 stored elements in Compressed Sparse Row format>

In [3]:
label_column = df.loc[:,'label']
labels = label_column.values
print(labels)

[0 0 1 ... 0 1 1]


In [4]:
#Downloading the Passive-Aggressive classifier from sikit learn
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

To implement the Passive-aggressive classifier we need to split our dataset into training and test data.

In [5]:
x = tf_idf_matrix
y = df_output.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

#Defining our model with a regularisation parameter
model = PassiveAggressiveClassifier()

We can fit our model to our training data.

In [6]:
model.fit(x_train, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [7]:
#Making Predictions
y_predict = model.predict(x_test)
y_predict

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

# Measuring model performance

Now we will measure our Passive-Aggressive model's performance with it fit to our training data.

In [8]:
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9352972119936875

In [9]:
print("accuracy: %0.3f" % (accuracy_score(y_test, y_predict)))

accuracy: 0.933


So this passive-aggressive model has an accuracy of 93.4% which is good.

We will now see how this model performs with a different train-test split ratio of 75:25.

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
model.fit(x_train, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [11]:
y_predict = model.predict(x_test)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9299242424242424

In [12]:
print("accuracy: %0.3f" % (accuracy_score(y_test, y_predict)))

accuracy: 0.934


Hence the accuracy of this model is 93.2% which is surprisingly slightly less accurate than a train-test split of 70:30.

In [13]:
test_sizes = np.arange(0.01,0.5,0.01)
accuracys = []
for i in test_sizes:
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=i, random_state=42)
    model.fit(x_train, y_train)
    accuracy = model.score(x_test, y_test)
    accuracys.append(accuracy)
    
accuracys

[0.90625,
 0.9291338582677166,
 0.9528795811518325,
 0.9488188976377953,
 0.9369085173501577,
 0.9343832020997376,
 0.9301801801801802,
 0.9349112426035503,
 0.9316987740805605,
 0.9290220820189274,
 0.9325681492109039,
 0.9316688567674113,
 0.9344660194174758,
 0.9402480270574972,
 0.935856992639327,
 0.9388560157790927,
 0.9350046425255338,
 0.9360210341805434,
 0.9352159468438538,
 0.9376479873717443,
 0.9346356123215628,
 0.9325681492109039,
 0.9362139917695473,
 0.9355687047994741,
 0.9343434343434344,
 0.9350728155339806,
 0.9339567504383401,
 0.9334836527621195,
 0.9336235038084875,
 0.932140978432404,
 0.9368635437881874,
 0.9378698224852071,
 0.937350549976088,
 0.9350046425255338,
 0.933273219116321,
 0.9333625602805787,
 0.9304607508532423,
 0.9302325581395349,
 0.9307972480777014,
 0.9297553275453828,
 0.9257120862201693,
 0.9259676813228109,
 0.9262385321100918,
 0.922883787661406,
 0.9224833391792353,
 0.9238421955403088,
 0.9244459368703828,
 0.9273265373232489,
 0.92528

In [14]:
import matplotlib.pyplot as plt
plt.scatter(test_sizes, accuracys)
plt.xlabel('Test Size')
plt.ylabel('Passive-Aggressive Classifier Accuracy')

Text(0, 0.5, 'Passive-Aggressive Classifier Accuracy')

In [15]:
test_sizes[np.argmax(accuracys)]

0.03

Hence our Passive-Aggressive classifier model has greatest accuracy with a very small test size.

# Confusion Matrix

To visualise the performance of our model we can use a confusion matrix. This compares the predicted class with the actual class.

In [16]:
from sklearn import metrics
confusion = metrics.confusion_matrix(y_test, model.predict(x_test))
confusion

array([[1459,  102],
       [ 130, 1414]], dtype=int64)

In [17]:
print('Success rate of the model on fake news articles: ' + str(confusion[0][0]/(confusion[0][0] + confusion[0][1])))
print('Success rate of the model on real news articles: ' + str(confusion[1][1]/(confusion[1][0] + confusion[1][1])))

Success rate of the model on fake news articles: 0.934657270980141
Success rate of the model on real news articles: 0.9158031088082902


Hence the passive-agresive model is more accurate at classifying fake news articles compared to real news articles.

# Testing on unseen data

Now that we have built a Passive-Aggressive classifier model, we want to see how this model performs on unseen data. We will now preprocess two articles, one real and one fake, into vector form so we can test our model on them.

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

stop_words = stopwords.words('english')
stop_words.extend(['the','it','in'])
WNL = WordNetLemmatizer()
    

def article_preprocessor (article):
    filtered_article = ''
    article = re.sub(r'[^\w\s]', '', article)
    words = [word.lower() for word in nltk.word_tokenize(article)]
    words = [word for word in words if not word in stop_words]
    words_lemmatized = []
    for word in words:
        if word == 'us':
            words_lemmatized.append(word)
        else:
            words_lemmatized.append(WNL.lemmatize(word))
    filtered_article = " ".join([word for word in words_lemmatized])
    return filtered_article

In [19]:
def passive_aggressive_classifier (list_of_articles):
    
    #To preprocess these articles
    articles_pp = [article_preprocessor(article) for article in list_of_articles]
    new_input = df_input.append(pd.Series(articles_pp))
    tf_idf_matrix = vectorizer.fit_transform(new_input)
    orig_data_matrix = tf_idf_matrix[:len(df_input)]
    new_data_matrix = tf_idf_matrix[len(df_input):]
    
    #Performing the Passive-Aggressive classifier on the dataset
    x_train, x_test, y_train, y_test = train_test_split(orig_data_matrix, df_output, random_state=42)
    model = PassiveAggressiveClassifier()
    model.fit(x_train, y_train)
    accuracy = model.score(x_test,y_test)
    print('The Passive-Aggressive classifier model accuracy: ' +str(accuracy))
    
    prediction = model.predict(new_data_matrix)
    
    return prediction
    

In [20]:
# The top news story on the BBC
bbc_news_article = '''The furlough scheme will be extended until the end of September by the chancellor in the Budget later.
Rishi Sunak said the scheme - which pays 80% of employees' wages for the hours they cannot work in the pandemic - would help millions through "the challenging months ahead".
Some 600,000 more self-employed people will also be eligible for government help as access to grants is widened.
But Labour said the support schemes should have been extended "months ago".
Mr Sunak will outline a three-point plan to support people through the coming months, rebuild the economy and "fix" the public finances in the wake of the pandemic when he delivers his statement to the Commons at about 12:30 GMT.
But he has warned of tough economic times ahead and there are reports that he plans to raise some taxes.'''

# Here's a fake news article from the New York Mag
fake_article = '''Twelve days out from judgment day in an election in which he continues to trail badly, President Trump continues to hammer home an issue that will surely resonate with that small slice of still-undecided voters: his supposedly unfair treatment at the hands of CBS’s Lesley Stahl. After two days of promising to release unedited footage of an as-yet-unaired 60 Minutes interview, during which he walked out prematurely because he was upset with Stahl’s line of questioning, the president finally followed through on Thursday. Throughout the interview, Stahl presses Trump on issues from health care (the president says he hopes the Supreme Court strikes down Obamacare, a politically toxic position) to his derogatory comments about Anthony Fauci (Trump claims he was misinterpreted) to his false claims that the Obama campaign spied on him. The tone is of an adversarial back-and-forth, well within normal journalistic bounds. Nevertheless, Trump continuously claims that Joe Biden hasn’t been given similar treatment by CBS and cuts the proceedings short.'''

In [21]:
articles = [bbc_news_article,fake_article]
passive_aggressive_classifier(articles)

The Passive-Aggressive classifier model accuracy: 0.9311868686868687


array([1, 0], dtype=int64)

Hence our Passive-Aggressive classifier has correctly classified both the real and the fake news article!

# Hyperparameter Optimisation

We will now try and improve our model's performance using grid search. The Passive-Aggressive classifier does not have a learning rate, so we can see how changing the regularisation parameter would affect the performance of our model.

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [23]:
x = tf_idf_matrix
y = df_output.values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [24]:
model = PassiveAggressiveClassifier()
#C is the regularisation parameter of our Passive-Aggressive model
param_grid = {"C": [0.001, 0.002, 0.01, 0.02, 0.1, 0.2, 1.0, 2.0],}

In [25]:
model.fit(x_train, y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=False, fit_intercept=True,
                            loss='hinge', max_iter=1000, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [26]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')

In [27]:
results = grid.fit(x_train, y_train)
print('Mean Accuracy: %.3f' % results.best_score_)

Mean Accuracy: 0.937


In [28]:
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']

In [29]:
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))

>0.906 with: {'C': 0.001}
>0.916 with: {'C': 0.002}
>0.935 with: {'C': 0.01}
>0.937 with: {'C': 0.02}
>0.937 with: {'C': 0.1}
>0.936 with: {'C': 0.2}
>0.935 with: {'C': 1.0}
>0.936 with: {'C': 2.0}


Hence we can see that the accuracy of our Passive-Aggressive classifier model varies when we change the regularisation parameter, and our model performs best with a regularisation parameter of 0.02 with an accuracy of 93.8%.

Now to test whether varying the proportion of training data set aside as validation set will affect the performance of our model.

In [30]:
param_grid = {"validation_fraction": [0.001, 0.002, 0.01, 0.02, 0.1, 0.2],}
grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
results = grid.fit(x_train, y_train)
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))

>0.936 with: {'validation_fraction': 0.001}
>0.936 with: {'validation_fraction': 0.002}
>0.936 with: {'validation_fraction': 0.01}
>0.937 with: {'validation_fraction': 0.02}
>0.937 with: {'validation_fraction': 0.1}
>0.936 with: {'validation_fraction': 0.2}


Hence the accuracy doesnt change too much with different validation fractions, but has greatest accuracy of 93.7% with a validation fraction of 0.02.

We can also test whether the random state of our model will affect the accuracy.

In [31]:
param_grid = {"random_state": [0, 1, 5, 25, 42, 50],}
grid = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
results = grid.fit(x_train, y_train)
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
for mean, param in zip(means, params):
    print(">%.3f with: %r" % (mean, param))

>0.936 with: {'random_state': 0}
>0.935 with: {'random_state': 1}
>0.937 with: {'random_state': 5}
>0.937 with: {'random_state': 25}
>0.937 with: {'random_state': 42}
>0.936 with: {'random_state': 50}


Hence our model has a greater accuracy with a random state of 42 compared to the default random state of 0.