In [1]:
#Importing required library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [3]:
#Inserting fake and real dataset
df_fake = pd.read_csv("False_news.csv")
df_true = pd.read_csv("True_news.csv")

In [4]:
df_fake.head(5)

Unnamed: 0,label,title,text
0,0,Should children wear a mask?,
1,0,Are there situations where children aged 5 yea...,
2,0,Should children with developmental disabilitie...,
3,0,Should children who have health issues or a me...,
4,0,What type of mask should children wear?,


In [5]:
df_true.head(5)

Unnamed: 0,label,title,text
0,1,"""Spraying chlorine or alcohol on the skin kill...",
1,1,"""Only older adults and young people are at risk""",
2,1,"""Children cannot get COVID-19""",
3,1,"""COVID-19 is just like the flu""",
4,1,"""Everyone with COVID-19 dies""",


In [6]:
#Inserting a column called "class" for fake and real news dataset to categories fake and true news.
df_fake["class"] = 0
df_true["class"] = 1

In [7]:
df_fake.shape, df_true.shape

((7533, 4), (1581, 4))

In [16]:
#Merging the fake and true dataframe
df_marge = pd.concat([df_fake, df_true], axis =0 )
df_marge.head(10)

Unnamed: 0,label,title,text,class
0,0,Should children wear a mask?,,0
1,0,Are there situations where children aged 5 yea...,,0
2,0,Should children with developmental disabilitie...,,0
3,0,Should children who have health issues or a me...,,0
4,0,What type of mask should children wear?,,0
5,0,How should children wear a mask?,,0
6,0,Should a child wear a mask at home?,,0
7,0,Should teachers or other adults working with c...,,0
8,0,Should children wear a mask when playing sport...,,0
9,0,Are there alternatives to fabric masks such as...,,0


In [17]:
df_marge.columns

Index(['label', 'title', 'text', 'class'], dtype='object')

In [18]:
#"title" and "label" columns is not required for detecting the fake news, so I am going to drop the columns.
df_marge = df_marge.fillna('')
df_marge['text'] = df_marge['title']+' '+df_marge['text']
df = df_marge.drop(["title", "label"], axis = 1)

In [19]:
df.isnull().sum()

text     0
class    0
dtype: int64

In [21]:
#Randomly shuffling the dataframe
df = df.sample(frac = 1)
df.head()

Unnamed: 0,text,class
1464,Nearly 40% of Those Hospitalized for COVID-19 ...,0
6351,"""Hong Kong’s largest quarantine site with 300 ...",0
2333,Do All Seniors Need Same Coronavirus Precautio...,0
4921,Spreading so fast that it will infect 70% of h...,0
4798,The novel coronavirus (2019-nCoV) “will not la...,0


In [24]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)
df.columns

Index(['text', 'class'], dtype='object')

In [25]:
df.head()

Unnamed: 0,text,class
0,Nearly 40% of Those Hospitalized for COVID-19 ...,0
1,"""Hong Kong’s largest quarantine site with 300 ...",0
2,Do All Seniors Need Same Coronavirus Precautio...,0
3,Spreading so fast that it will infect 70% of h...,0
4,The novel coronavirus (2019-nCoV) “will not la...,0


In [26]:
#Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [27]:
df["text"] = df["text"].apply(wordopt)

In [28]:
#Defining dependent and independent variable as x and y
x = df["text"]
y = df["class"]

In [29]:
#Splitting the dataset into training set and testing set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [30]:
#Converting text to vectors
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [40]:
#1. Logistic Regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(xv_train,y_train)
pred_lr=LR.predict(xv_test)
LR.score(xv_test, y_test)

0.8372093023255814

In [36]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      1890
           1       0.64      0.11      0.18       389

    accuracy                           0.84      2279
   macro avg       0.74      0.55      0.55      2279
weighted avg       0.81      0.84      0.79      2279



In [38]:
#2. Decision Tree Classification
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)
pred_dt = DT.predict(xv_test)
DT.score(xv_test, y_test)


0.7968407196138657

In [39]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      1890
           1       0.39      0.33      0.36       389

    accuracy                           0.80      2279
   macro avg       0.63      0.61      0.62      2279
weighted avg       0.79      0.80      0.79      2279



In [41]:
#3. Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)
pred_gbc = GBC.predict(xv_test)
GBC.score(xv_test, y_test)

0.840280824923212

In [42]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.85      0.99      0.91      1890
           1       0.68      0.12      0.21       389

    accuracy                           0.84      2279
   macro avg       0.76      0.55      0.56      2279
weighted avg       0.82      0.84      0.79      2279



In [43]:
#4. Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)
pred_rfc = RFC.predict(xv_test)
RFC.score(xv_test, y_test)

0.8319438350153576

In [44]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.84      0.98      0.91      1890
           1       0.54      0.11      0.18       389

    accuracy                           0.83      2279
   macro avg       0.69      0.54      0.54      2279
weighted avg       0.79      0.83      0.78      2279



In [45]:
#5. SVM
from sklearn import svm
from sklearn import metrics
clf = svm.SVC(kernel='linear')
clf.fit(xv_train, y_train)
pred_svm = clf.predict(xv_test)
clf.score(xv_test, y_test)

0.8516893374286968

In [46]:
print(classification_report(y_test, pred_svm))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1890
           1       0.65      0.28      0.39       389

    accuracy                           0.85      2279
   macro avg       0.76      0.62      0.65      2279
weighted avg       0.83      0.85      0.83      2279



In [49]:
#6. Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(np.asarray(xv_train.todense()), y_train)
pred_gnb = gnb.predict(np.asarray(xv_test.todense()))
gnb.score(np.asarray(xv_test.todense()), y_test)

0.6849495392716104

In [50]:
print(classification_report(y_test, pred_gnb))

              precision    recall  f1-score   support

           0       0.87      0.73      0.79      1890
           1       0.26      0.45      0.33       389

    accuracy                           0.68      2279
   macro avg       0.56      0.59      0.56      2279
weighted avg       0.76      0.68      0.71      2279

