In [11]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import time
import h5py
import pickle

In [2]:
# Load the news dataset; the 'label' column is the classification target.
df = pd.read_csv('news.csv')
labels = df['label']

In [3]:
# TF-IDF features: drop English stop words and any term whose document
# frequency exceeds 0.7 (max_df).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Hold out 20% of the articles for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=7,
)

# Learn the vocabulary/IDF weights on the training text only, then reuse the
# fitted vectorizer on the test text so no test information leaks into training.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [12]:
# Model 1: Passive-Aggressive classifier with max_iter=50.
# The timer spans construction, training, prediction and scoring together.
t0 = time.perf_counter()
pac1 = PassiveAggressiveClassifier(max_iter=50).fit(tfidf_train, y_train)
y_pred1 = pac1.predict(tfidf_test)
score1 = accuracy_score(y_test, y_pred1)
tf = time.perf_counter() - t0

# Single print call: each argument lands on its own line, and the empty
# strings produce the blank spacer lines.
print(
    f'Took {round(tf, 4)} seconds',
    '',
    'PAC model 1',
    f'Accuracy: {round(score1*100,2)}%',
    '',
    sep='\n',
)

# Displayed as the cell result; label order is pinned to FAKE, REAL.
confusion_matrix(y_test, y_pred1, labels=['FAKE', 'REAL'])

Took 0.0839 seconds

PAC model 1
Accuracy: 92.9%



array([[589,  49],
       [ 41, 588]])

In [13]:
# Model 2: same pipeline as model 1, but with max_iter raised to 75.
# The timer spans construction, training, prediction and scoring together.
t0 = time.perf_counter()
pac2 = PassiveAggressiveClassifier(max_iter=75).fit(tfidf_train, y_train)
y_pred2 = pac2.predict(tfidf_test)
score2 = accuracy_score(y_test, y_pred2)
tf = time.perf_counter() - t0

# Single print call: each argument lands on its own line, and the empty
# strings produce the blank spacer lines.
print(
    f'Took {round(tf, 4)} seconds',
    '',
    'PAC model 2',
    f'Accuracy: {round(score2*100,2)}%',
    '',
    sep='\n',
)

# Displayed as the cell result; label order is pinned to FAKE, REAL.
confusion_matrix(y_test, y_pred2, labels=['FAKE', 'REAL'])

Took 0.0759 seconds

PAC model 2
Accuracy: 92.74%



array([[588,  50],
       [ 42, 587]])

In [14]:
# Model 3: same pipeline as the earlier models, but with max_iter raised to 100.
# The timer spans construction, training, prediction and scoring together.
# NOTE(review): no random_state is passed to the classifier, so accuracy can
# differ slightly between runs — consider fixing a seed when comparing models.
t0 = time.perf_counter()
pac3 = PassiveAggressiveClassifier(max_iter = 100)
pac3.fit(tfidf_train, y_train)

y_pred3 = pac3.predict(tfidf_test)
score3 = accuracy_score(y_test, y_pred3)
tf = time.perf_counter() - t0
print(f'Took {round(tf, 4)} seconds')
print()
# Fixed copy-paste slip: this cell reports the third model, not the second.
print('PAC model 3')
print(f'Accuracy: {round(score3*100,2)}%')
print()

# Displayed as the cell result; label order is pinned to FAKE, REAL.
confusion_matrix(y_test, y_pred3, labels = ['FAKE', 'REAL'])

Took 0.0767 seconds

PAC model 2
Accuracy: 92.66%



array([[588,  50],
       [ 43, 586]])

It seems like there is no discernible difference in accuracy between the different `max_iter` settings, and the time difference between them is also very small.

In [15]:
# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises.
# NOTE(review): only the classifier is saved here; the fitted tfidf_vectorizer
# is also needed to transform new text before calling predict — confirm it is
# persisted elsewhere.
with open('pac_model.p', 'wb') as model_file:
    pickle.dump(pac3, model_file)