In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
df = pd.read_csv('data/news.csv', index_col=0)
labels = df.label

In [16]:
#Splitting the data for training and testing
x_train, x_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=11) 

In [17]:
#Initialize TfidVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [18]:
#Initialize PassiveAggressiveClassifier and train model
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [19]:
#Predict on test set and calculate accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {round(score*100,2)}%")

Accuracy: 93.61%


In [20]:
#Build confusion matrix
confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

array([[581,  38],
       [ 43, 605]])