In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the news dataset and discard the "Unnamed: 0" column
# (presumably a row index saved by an earlier export -- confirm against the CSV).
df = pd.read_csv("news.csv").drop("Unnamed: 0", axis="columns")

In [3]:
# Count missing values per column; the result is displayed as the cell output.
df.isna().sum()

title    0
text     0
label    0
dtype: int64

In [4]:
# Model input is the "text" column; the prediction target is the "label" column.
X, y = df["text"], df["label"]

In [5]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every metric computed from it)
# is identical on each Restart & Run All; stratify=y keeps the label proportions
# the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Turn raw text into TF-IDF features. max_df=0.7 ignores terms that occur in
# more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7)

# Fit the vocabulary/IDF weights on the training text only, then reuse them
# for the held-out text so nothing from the test set influences the features.
t_train = tfidf_vectorizer.fit_transform(X_train)
t_test = tfidf_vectorizer.transform(X_test)

# PassiveAggressiveClassifier shuffles the training data between epochs by
# default, so a fixed random_state is needed for a reproducible model.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(t_train, y_train)

# Predictions are kept for the per-row comparison table built in a later cell.
y_pred = model.predict(t_test)

# Mean accuracy on the held-out set, printed as a percentage.
acc = model.score(t_test, y_test)
print(f"{round(acc*100, 3)}%")

93.923%


In [7]:
# Line up each held-out statement with the model's prediction and the true
# label. Converting to plain arrays discards the shuffled pandas index, so the
# resulting frame gets a fresh 0..n-1 index.
answer = dict(
    Statement=np.array(X_test),
    Predict=np.array(y_pred),
    Actual=np.array(y_test),
)
data = pd.DataFrame(data=answer)

In [8]:
# Show the statement / prediction / actual-label comparison table as the cell output.
data

Unnamed: 0,Statement,Predict,Actual
0,The official list of debate moderators is out ...,REAL,REAL
1,"patrick martin & barry grey, wsws.org Politica...",FAKE,FAKE
2,"On any given day, in any police department in ...",REAL,REAL
3,It was a showdown millions of Americans have b...,REAL,REAL
4,John Oliver: America’s Increasingly Segregated...,FAKE,FAKE
...,...,...,...
1262,The move would make it easier for the Trump ad...,REAL,REAL
1263,The unexpected death of Justice Antonin Scalia...,REAL,REAL
1264,Tweet Home » Headlines » Finance News » 2017 S...,FAKE,FAKE
1265,Email \nEven as surveillance hawks such as FBI...,REAL,FAKE


In [10]:
# Classify a single unseen headline: vectorise it with the already-fitted
# TF-IDF vocabulary (a one-element collection of documents), then predict.
tfidf_test = tfidf_vectorizer.transform(
    ["Trump Says he'll ban Tiktok by this saturday"]
)
model.predict(tfidf_test)

array(['REAL'], dtype='<U4')