In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [8]:
import kagglehub

# Fetch the latest published version of the dataset; `path` is the local
# location of the downloaded files, which is echoed below.
dataset_handle = "clmentbisaillon/fake-and-real-news-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/fake-and-real-news-dataset


In [16]:
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer data
# (presumably what word_tokenize needs — confirm against the installed NLTK
# version) and the English stop-word corpus read via stopwords.words.
nltk.download('punkt')
nltk.download('punkt_tab')
# Left as the final expression, so its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load both halves of the corpus from the directory reported by kagglehub
# (`path`) instead of a hard-coded Kaggle mount point, so the notebook also
# runs where the dataset is cached in a different location.
true_news = pd.read_csv(f"{path}/True.csv")
fake_news = pd.read_csv(f"{path}/Fake.csv")

In [10]:
# Tag every article with its class before the frames are merged:
# 1 for rows from True.csv, 0 for rows from Fake.csv.
for frame, flag in ((true_news, 1), (fake_news, 0)):
  frame['label'] = flag

In [11]:
# Stack the real and fake articles into a single frame with a fresh index.
frames = [true_news, fake_news]
data = pd.concat(frames, ignore_index=True)

In [12]:
# Shuffle the rows so the two classes are interleaved. The shuffle is seeded:
# without a seed the row order changes on every run, which in turn makes the
# later seeded train/test split (and every metric derived from it) vary.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
print(data.head(5))

                                               title  \
0   Trump Gets Brutally Reminded Of The Sh*tty Wa...   
1  Iraq gives Kurdistan till Friday to hand over ...   
2  BOOM! MATH SHOWS Trump Would Have Beaten Obama...   
3  Vatican defends pope's avoidance of term 'Rohi...   
4  Meeting between Egyptian foreign minister and ...   

                                                text       subject  \
0  Donald Trump surely saw this response coming.A...          News   
1  BAGHDAD (Reuters) - The Iraqi government gave ...     worldnews   
2  It s easy to glance at Tuesday s popular vote ...      politics   
3  YANGON (Reuters) - The Vatican on Wednesday de...     worldnews   
4  CAIRO (Reuters) - Egyptian Foreign Minister Sa...  politicsNews   

                  date  label  
0        July 20, 2017      0  
1  September 26, 2017       1  
2         Nov 10, 2016      0  
3   November 29, 2017       1  
4     August 23, 2017       1  


In [13]:
# English stop words, held in a set for fast membership checks while cleaning.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [14]:
def preprocess(text: str) -> str:
  """Normalise one article body before vectorisation.

  The text is lower-cased and tokenised with ``word_tokenize``; only purely
  alphabetic tokens that are not in ``stop_words`` are kept, and the
  surviving tokens are joined with single spaces.

  Args:
    text: Raw article text. Non-string values (``None``, or the float NaN
      that pandas yields for empty CSV fields) are treated as an empty
      document instead of raising.

  Returns:
    The cleaned text, or ``""`` when ``text`` is not a string.
  """
  # A missing value has no ``.lower()``; without this guard a single empty
  # cell would abort the whole ``Series.apply`` call.
  if not isinstance(text, str):
    return ""
  tokens = word_tokenize(text.lower())
  kept = [word for word in tokens if word.isalpha() and word not in stop_words]
  return " ".join(kept)

In [17]:
# Clean every article body and write the result back over the text column.
cleaned_text = data['text'].apply(preprocess)
data['text'] = cleaned_text

In [18]:
# Preview the first five rows (the default for head) after cleaning.
data.head()

Unnamed: 0,title,text,subject,date,label
0,Trump Gets Brutally Reminded Of The Sh*tty Wa...,donald trump surely saw response exactly respe...,News,"July 20, 2017",0
1,Iraq gives Kurdistan till Friday to hand over ...,baghdad reuters iraqi government gave kurdista...,worldnews,"September 26, 2017",1
2,BOOM! MATH SHOWS Trump Would Have Beaten Obama...,easy glance tuesday popular vote percent preci...,politics,"Nov 10, 2016",0
3,Vatican defends pope's avoidance of term 'Rohi...,yangon reuters vatican wednesday defended pope...,worldnews,"November 29, 2017",1
4,Meeting between Egyptian foreign minister and ...,cairo reuters egyptian foreign minister sameh ...,politicsNews,"August 23, 2017",1


In [19]:
# Hold out 20% of the articles for evaluation; the fixed seed makes the split
# deterministic for a given row order.
features, labels = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit the TF-IDF vocabulary and weights on the training texts only.
# max_df drops terms that appear in more than this fraction of documents.
max_document_frequency = 0.7
tf = TfidfVectorizer(max_df=max_document_frequency)
x_train_tf = tf.fit_transform(X_train)

In [21]:

# Seed the classifier: it shuffles the training data between epochs by
# default, so without a fixed random_state the fitted weights and the
# reported accuracy change from run to run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(x_train_tf,y_train)

In [22]:
# Vectorise the held-out texts with the already-fitted TF-IDF model, then
# classify them.
x_test_tf = tf.transform(X_test)
pred = pac.predict(x_test_tf)

In [24]:
# Report overall accuracy followed by the confusion matrix on the test split.
test_accuracy = accuracy_score(y_test, pred)
test_confusion = confusion_matrix(y_test, pred)
print(test_accuracy)
print(test_confusion)

0.9944320712694877
[[4704   20]
 [  30 4226]]



