In [1]:
import pandas as pd
import numpy as np 
import itertools
import warnings 
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [3]:
# Load the WELFake CSV from the working directory into a DataFrame,
# then display its (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer="WELFake_Dataset.csv")
df.shape

(72134, 4)

In [4]:
# Preview the first five rows to see the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [6]:
# Count the missing values in each column.
print(df.isna().sum())


Unnamed: 0      0
title         558
text           39
label           0
dtype: int64


In [10]:
# Drop every row that has a missing value in any column.
# NOTE(review): the train/test split below only reads df['text'], so rows that
# are dropped solely for a missing title are usable samples being discarded —
# consider dropna(subset=['text']) if that is not intended.
df = df.dropna(axis=0, how="any")

In [11]:
# Re-check the (rows, columns) shape after dropping rows with missing values.
df.shape

(71537, 4)

In [14]:
# Pull the target column out as its own Series and peek at the first values.
labels1 = df["label"]
labels1.head()

0    1
2    1
3    0
4    1
5    1
Name: label, dtype: int64

In [15]:
# Label encoding as stated by the notebook author: 0 = fake, 1 = real.
# NOTE(review): in the df.head() preview the sensational headlines carry label 1
# and the conventional news headline carries label 0 — confirm the encoding
# against the dataset documentation before interpreting per-class metrics.
#
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    labels1,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Build the TF-IDF vectorizer with a small hand-picked stop-word list;
# max_df=0.7 discards terms whose document frequency exceeds 70%.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=["the", "a", "an", "in", "on", "at"],
    max_df=0.7,
)

# Learn the vocabulary and IDF weights from the training texts only, then
# reuse that fitted vocabulary to encode the test texts (no refit on test data).
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [19]:
# Train a Passive-Aggressive linear classifier on the TF-IDF training matrix.
# random_state is pinned (same seed as the train/test split) because the
# estimator shuffles the training data between epochs by default; without a
# seed the metrics printed below differ on every Restart & Run All.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)

# Predict on the held-out split and report overall accuracy.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100, 2)}%')

# Precision and recall are computed with label 0 treated as the positive class.
precision = precision_score(y_test, y_pred, pos_label=0)
print(f'Precision: {round(precision*100, 2)}%')

recall = recall_score(y_test, y_pred, pos_label=0)
print(f'Recall: {round(recall*100, 2)}%')

Accuracy: 95.99%
Precision: 96.39%
Recall: 95.47%


In [21]:
# Confusion matrix over the sorted distinct labels found in the test targets
# (rows correspond to true labels, columns to predicted labels).
labels = np.unique(y_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=labels)
print(cm)

[[6760  321]
 [ 253 6974]]
