# **Fake News Detection Model Using Decision Tree & Logistic Regression**

## **Importing Libraries**

In [40]:
# To support numerical operations, functions, arrays and matrices
import numpy as np

# To manipulate and analyze the data
import pandas as pd

# For loops and iterations
import itertools

# To divide the data into training and testing sets
from sklearn.model_selection import train_test_split

# To transform the raw data into a matrix by feature extractions
from sklearn.feature_extraction.text import TfidfVectorizer

# To get access to model evaluation parameters and matrices
from sklearn.metrics import accuracy_score, confusion_matrix

## **Data Loading and Attributes**

In [41]:
# Load the dataset from the CSV file into the dataframe `df`
df = pd.read_csv(filepath_or_buffer="data.csv")

# Preview the first five rows (five is also the default of head())
df.head(n=5)

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [42]:
# Dimensions of the raw dataset as a (rows, columns) tuple
df.shape

(4009, 4)

In [43]:
# Keep the target column of the raw dataframe in `labels`
labels = df["Label"]

# Preview the first five labels
labels.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: Label, dtype: int64

## **Preprocessing the data**

In [44]:
# Drop every row that contains at least one missing value; the raw `df` is
# left untouched, so this cell can be re-run safely
new_df = df.dropna()

# Verify the cleaning step. A bare `new_df.isnull().sum()` in the middle of a
# cell is evaluated but never displayed, so the check is expressed as an
# assertion instead: it fails loudly if any missing value survived.
remaining_nulls = int(new_df.isnull().sum().sum())
assert remaining_nulls == 0, f"{remaining_nulls} missing values remain after dropna()"

# Preview the first rows of the cleaned dataframe
new_df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [32]:
# Dimensions of the cleaned dataset as a (rows, columns) tuple; comparing the
# row count with df.shape shows how many rows dropna() removed
new_df.shape

(3988, 4)

## Test Train Splitting

In [45]:
# Feature: the article body text
x = new_df.Body

# Target: the label column (1 = real, 0 = fake)
y = new_df.Label

# Hold out part of the data for evaluation
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,   # 80% of the rows for training, 20% for testing
    random_state=7,  # fixed seed so the split is the same on every run
)

## **Text Vectorization**
### Converting the raw input text into a numeric TF-IDF feature matrix

In [46]:
# Build the TF-IDF vectorizer:
#   stop_words='english' -> drop common English stop words
#   max_df=0.7           -> ignore terms that occur in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights from the training text only, and
# turn the training text into a TF-IDF matrix
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

# Re-use the fitted vocabulary/weights to vectorize the test text
tfidf_test = tfidf_vectorizer.transform(x_test)

# Algorithms

### Logistic Regression
Probabilistic Classifier

In [47]:
# Logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the model with default hyper-parameters and train it on the TF-IDF
# features of the training split (fit() returns the fitted estimator itself)
LR = LogisticRegression().fit(tfidf_train, y_train)

# Predicted labels for the held-out test split
pred_LR = LR.predict(tfidf_test)

# Model evaluation: accuracy over the test data
# (the same value LR.score(tfidf_test, y_test) reports)
accuracy_score(y_test, pred_LR)


0.9786967418546366

In [49]:
# scikit-learn's metrics module provides classification_report
import sklearn.metrics

# Per-class precision, recall and F1-score of the logistic regression predictions
report_LR = sklearn.metrics.classification_report(y_test, pred_LR)
print(report_LR)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       409
           1       0.97      0.99      0.98       389

    accuracy                           0.98       798
   macro avg       0.98      0.98      0.98       798
weighted avg       0.98      0.98      0.98       798



In [50]:
# Confusion matrix of the logistic regression predictions.
# labels=[1, 0] orders both axes as real (1) first, then fake (0);
# rows are the true labels, columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=pred_LR, labels=[1, 0])

array([[385,   4],
       [ 13, 396]])

### **Decision Tree**
Hierarchically structured model

In [51]:
# Importing decision tree from sklearn library
from sklearn.tree import DecisionTreeClassifier

# Initializing the model.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split (even with the default splitter="best"), so an unseeded tree can
# differ from run to run and the reported metrics would not be reproducible.
# The value 7 matches the seed used for train_test_split.
DT = DecisionTreeClassifier(random_state=7)

# Fitting model over training data
DT.fit(tfidf_train, y_train)

In [52]:
# Predicted labels of the decision tree for the held-out test split
pred_dt = DT.predict(tfidf_test)

# Model evaluation: accuracy over the test data
# (the same value DT.score(tfidf_test, y_test) reports)
accuracy_score(y_test, pred_dt)

0.9586466165413534

In [54]:
# scikit-learn's metrics module provides classification_report
import sklearn.metrics

# Per-class precision, recall and F1-score of the decision tree predictions
report_dt = sklearn.metrics.classification_report(y_test, pred_dt)
print(report_dt)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       409
           1       0.97      0.95      0.96       389

    accuracy                           0.96       798
   macro avg       0.96      0.96      0.96       798
weighted avg       0.96      0.96      0.96       798



In [55]:
# Confusion matrix of the decision tree predictions.
# labels=[1, 0] orders both axes as real (1) first, then fake (0);
# rows are the true labels, columns are the predicted labels.
confusion_matrix(y_true=y_test, y_pred=pred_dt, labels=[1, 0])

array([[368,  21],
       [ 12, 397]])

# **Results**
Both the classification reports and the confusion matrices show that logistic regression performs better overall than the decision tree (about 98% vs. 96% test accuracy in the recorded run), which is why logistic regression is the preferred choice for prediction.

# **Prediction**


In [56]:
# Pick a single observation (row 1 of the vectorized test data) to classify
X_new = tfidf_test[1]

# Use the already-trained logistic regression model to predict its label
prediction = LR.predict(X_new)

# Show the raw predicted label (1 - real, 0 - fake)
print(prediction)

# Translate the predicted label into a human-readable message
verdict = 'The news is Fake' if prediction[0] == 0 else 'The news is Real'
print(verdict)

[1]
The news is Real
