In [1]:
import pandas as pd
import numpy as np
import itertools as it

In [2]:
# Load the raw news dataset from a CSV file in the working directory.
news = pd.read_csv('news.csv')

In [3]:
# (rows, columns) of the loaded frame.
news.shape

(6335, 4)

In [4]:
# Preview the first rows to see the available columns and label values.
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Target column for classification; the preview above shows the values FAKE and REAL.
labels = news['label']

In [6]:
# Column dtypes and non-null counts, used here to check for missing values.
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the articles for validation. The fixed random_state keeps
# the split identical across runs.
train_x, val_x, train_y, val_y = train_test_split(
    news["text"],
    labels,
    test_size=0.33,
    random_state=1,
)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

## Using the TF-IDF vectorizer to transform the train and test sets

In [9]:
# Build TF-IDF features: drop English stop words and ignore terms that
# occur in more than 70% of the documents (max_df=0.7).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Learn the vocabulary and IDF weights from the training text only, then
# apply that fitted transform unchanged to the held-out text.
tfidf_train = tfidf_vectorizer.fit_transform(train_x)
tfidf_test = tfidf_vectorizer.transform(val_x)

## Create Passive Aggressive Classifier Model

In [10]:
# Initialize a PassiveAggressiveClassifier.
# The estimator shuffles the training data between epochs, so random_state is
# pinned to make the fit (and the reported accuracy) repeatable on
# Restart & Run All. The seed matches the one used for the train/validation split.
# NOTE: outputs recorded from earlier unseeded runs may differ slightly.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=1)
pac.fit(tfidf_train, train_y)

# Predict on the held-out split and report accuracy as a percentage.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(val_y, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.26%


In [11]:
# Confusion matrix for the Passive Aggressive Classifier.
# Rows are true labels and columns are predicted labels, both ordered FAKE, REAL.
cnf = confusion_matrix(val_y,y_pred, labels=['FAKE','REAL'])

In [12]:
# Display the confusion matrix computed in the previous cell.
cnf

array([[1020,   74],
       [  67,  930]], dtype=int64)

##  Support Vector Classifier

In [15]:
from sklearn.svm import SVC

# Fit a support vector classifier with scikit-learn's default settings on the
# TF-IDF features (fit returns the estimator itself, so it can be chained).
sv_class = SVC().fit(tfidf_train, train_y)

# Score the model on the held-out split.
y_pred_svc = sv_class.predict(tfidf_test)
sv_score = accuracy_score(val_y, y_pred_svc)

In [17]:
print(f'SV_Accuracy: {round(sv_score*100,2)}%')
# In the recorded run this is an increase of 0.04 percentage points over the
# Passive Aggressive Classifier (93.26% -> 93.30%).

SV_Accuracy: 93.3%


In [19]:
# Confusion matrix for the SVC: rows are true labels, columns are predictions (FAKE, REAL).
confusion_matrix(val_y,y_pred_svc, labels=['FAKE','REAL'])

array([[1046,   48],
       [  92,  905]], dtype=int64)

## Random Forest Classifier 

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forests bootstrap rows and sub-sample features at random, so an
# unseeded fit gives a slightly different score on every run. random_state is
# pinned so the result is repeatable on Restart & Run All.
# NOTE: outputs recorded from earlier unseeded runs may differ slightly.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(tfidf_train, train_y)

# Predict on the held-out split and report accuracy as a percentage.
y_pred_rfc = rfc.predict(tfidf_test)
rfc_score = accuracy_score(val_y, y_pred_rfc)
print(f'RFC_Accuracy: {round(rfc_score*100,2)}%')

RFC_Accuracy: 90.29%


In [21]:
# Confusion matrix for the random forest: rows are true labels, columns are predictions (FAKE, REAL).
confusion_matrix(val_y,y_pred_rfc, labels=['FAKE','REAL'])

array([[970, 124],
       [ 79, 918]], dtype=int64)

The accuracy drops when we move to a tree-based model (90.29% for the random forest versus about 93% for the linear models). Trying a gradient boosting model next.

## Gradient Boosting Classifier

In [22]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting seeds each tree and permutes the candidate features at
# every split, so ties can be broken differently between unseeded runs.
# random_state is pinned so the result is repeatable on Restart & Run All.
# NOTE: outputs recorded from earlier unseeded runs may differ slightly.
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(tfidf_train, train_y)

# Predict on the held-out split and report accuracy as a percentage.
y_pred_gbc = gbc.predict(tfidf_test)
gbc_score = accuracy_score(val_y, y_pred_gbc)
print(f'GBC_Accuracy: {round(gbc_score*100,2)}%')

GBC_Accuracy: 89.96%


In [27]:
# Confusion matrix for gradient boosting: rows are true labels, columns are predictions (FAKE, REAL).
confusion_matrix(val_y,y_pred_gbc, labels=['FAKE','REAL'])

array([[985, 109],
       [101, 896]], dtype=int64)

Gradient boosting doesn't seem to work well either (89.96%), so next we try good old logistic regression.

## Logistic Regression 

In [24]:
from sklearn.linear_model import LogisticRegression

In [26]:
# Linear baseline: logistic regression with scikit-learn's default settings,
# trained on the TF-IDF features (fit returns the estimator, so it is chained).
lr = LogisticRegression().fit(tfidf_train, train_y)

# Score on the held-out split and report accuracy as a percentage.
y_pred_lr = lr.predict(tfidf_test)
lr_score = accuracy_score(val_y, y_pred_lr)
print(f'LR_Accuracy: {round(lr_score*100,2)}%')

LR_Accuracy: 92.16%


In [28]:
# Confusion matrix for logistic regression: rows are true labels, columns are predictions (FAKE, REAL).
confusion_matrix(val_y,y_pred_lr, labels=['FAKE','REAL'])

array([[1028,   66],
       [  98,  899]], dtype=int64)

The accuracy is fine (92.16%), but it does not improve on the scores already obtained with the Passive Aggressive Classifier (93.26%) and the Support Vector Classifier (93.30%).

# Result 

We tried to make a fake news classifier using the news data which contained 6335 entries.
1. Preprocessing
    - reading the dataset using pandas read_csv
    - transforming the text data using TfidfVectorizer
2. Model Creation
    - All models in this part were trained with default hyperparameters (no tuning), and no feature engineering was done because the dataset was already clean (no missing values)


 |Classifier                   |Accuracy Score|
 |-----------------------------|--------------|
 |Passive Aggressive Classifier|93.26         |
 |Support Vector Classifier    |93.30         |
 |Random Forest Classifier     |90.29         |
 |Gradient Boosting Classifier |89.96         |
 |Logistic Regression          |92.16         |