# Fake News Detection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

## Importing Dataset

In [2]:
# train = pd.read_csv("kaggle/input/fake-news-dataset/train.csv")
train = pd.read_csv("train.csv")
train

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [3]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## Inserting a column "class" as target feature

In [4]:
train.shape

(20800, 5)

In [5]:
train.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

## Removing columns which are not required

In [6]:
train = train.drop(["title", "author"], axis = 1)
train

Unnamed: 0,id,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,Ever get the feeling your life circles the rou...,0
2,2,"Why the Truth Might Get You Fired October 29, ...",1
3,3,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...
20795,20795,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,When the Green Bay Packers lost to the Washing...,0
20797,20797,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [7]:
train.isnull().sum()

id        0
text     39
label     0
dtype: int64

In [8]:
# drop null values (row wise)
df = train.dropna()
df

Unnamed: 0,id,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,Ever get the feeling your life circles the rou...,0
2,2,"Why the Truth Might Get You Fired October 29, ...",1
3,3,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...
20795,20795,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,When the Green Bay Packers lost to the Washing...,0
20797,20797,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",1


## Random Shuffling the dataframe

In [9]:
df = df.sample(frac = 1)

In [10]:
df.head()

Unnamed: 0,id,text,label
17130,17130,Tweet \nIn this guest post for The UnReal Time...,1
18915,18915,WASHINGTON — After eight years of chafing u...,0
15486,15486,Post was not sent - check your email addresses...,1
2810,2810,\nWorld famous author Stephen King has a mes...,1
5792,5792,Share This \nNuclear weapons present the great...,1


In [11]:
df.drop(["id"], axis = 1, inplace = True)
df.reset_index(drop=True, inplace = True)

In [12]:
df.columns

Index(['text', 'label'], dtype='object')

In [13]:
df.head()

Unnamed: 0,text,label
0,Tweet \nIn this guest post for The UnReal Time...,1
1,WASHINGTON — After eight years of chafing u...,0
2,Post was not sent - check your email addresses...,1
3,\nWorld famous author Stephen King has a mes...,1
4,Share This \nNuclear weapons present the great...,1


## Creating a function to process the texts

In [14]:
def wordopt(text):
    text = str(text)
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [15]:
df["text"] = df["text"].apply(wordopt)

In [16]:
x = df["text"]
y = df["label"]

## Defining dependent and independent variables

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

## Convert text to vectors

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [19]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...
...,...,...,...,...
5195,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...
5196,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...
5197,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...
5198,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...


In [25]:
prediction = []
test = test.dropna()

for i in test['text'].head(5):
    prediction.append(manual_testing(i))
    
final_df = pd.DataFrame(prediction, columns=["News","LR", "DT", "RFC", "Bagging"])
final_df.to_csv("output.csv")
final_df

Unnamed: 0,News,LR,DT,RFC,Bagging
0,"PALO ALTO, Calif. — After years of scorning...",Fake News,Fake News,Fake News,Fake News
1,Videos #NoDAPL: Native American Leaders Vow to...,Not A Fake News,Not A Fake News,Not A Fake News,Not A Fake News
2,"If at first you don’t succeed, try a different...",Fake News,Fake News,Fake News,Fake News
3,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Not A Fake News,Not A Fake News,Not A Fake News,Not A Fake News
4,"Sunday on NBC’s “Meet the Press,” House Minori...",Fake News,Fake News,Fake News,Fake News


In [26]:
def test_data(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)
    return 0 if [pred_LR[0],pred_DT[0],pred_RFC[0]].count(0) > [pred_LR[0],pred_DT[0],pred_RFC[0]].count(1) else 1

def calculate_accuracy(y_true, y_pred):
    # Ensure that the lengths of y_true and y_pred are the same
    if len(y_true) != len(y_pred):
        raise ValueError("Lengths of y_true and y_pred must be the same")

    # Calculate the number of correct predictions
    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)

    # Calculate the total number of predictions
    total_predictions = len(y_true)

    # Calculate accuracy
    accuracy = correct_predictions / total_predictions

    return accuracy

predicted_y = []
for i in x_test:
    predicted_y.append(test_data(i))

accuracy = calculate_accuracy(y_test, predicted_y)
print("Accuracy:", accuracy)

Accuracy: 0.9270406934745967


## Boosting

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize models
LR = LogisticRegression()
DT = DecisionTreeClassifier()
RFC = RandomForestClassifier(random_state=0)

# Train Logistic Regression model
LR.fit(xv_train, y_train)
pred_lr = LR.predict(xv_train)

# Train Decision Tree model on residuals from Logistic Regression
DT.fit(xv_train, y_train - pred_lr)
pred_dt = DT.predict(xv_train)

# Train Random Forest model on residuals from Decision Tree
RFC.fit(xv_train, y_train - (pred_lr + pred_dt))
pred_rfc = RFC.predict(xv_train)

# Combine predictions on training data
boosted_predictions_train = pred_lr + pred_dt + pred_rfc

# Make predictions on test data
pred_lr_test = LR.predict(xv_test)
pred_dt_test = DT.predict(xv_test)
pred_rfc_test = RFC.predict(xv_test)

# Combine predictions on test data
boosted_predictions_test = pred_lr_test + pred_dt_test + pred_rfc_test

# Calculate accuracy on test data
accuracy = accuracy_score(y_test, boosted_predictions_test)
print("Accuracy:", accuracy)

Accuracy: 0.9241512159884421


In [26]:
rmse = mean_squared_error(y_test, boosted_predictions_test, squared=False)
print("Root Mean Squared Error (Boosted Model):", rmse)

Root Mean Squared Error (Boosted Model): 0.2793132508961468


In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize models
LR = LogisticRegression()
DT = DecisionTreeClassifier()
RFC = RandomForestClassifier(random_state=0)

# Train Logistic Regression model
LR.fit(xv_train, y_train)
pred_lr = LR.predict(xv_train)

# Train Decision Tree model on residuals from Logistic Regression
DT.fit(xv_train, y_train - pred_lr)
pred_dt = DT.predict(xv_train)

# Train Random Forest model on residuals from Decision Tree
RFC.fit(xv_train, y_train - (pred_lr + pred_dt))
pred_rfc = RFC.predict(xv_train)

# Combine predictions on training data
boosted_predictions_train = pred_lr + pred_dt + pred_rfc

# Make predictions on test data
pred_lr_test = LR.predict(xv_test)
pred_dt_test = DT.predict(xv_test)
pred_rfc_test = RFC.predict(xv_test)

# Combine predictions on test data
boosted_predictions_test = pred_lr_test + pred_dt_test + pred_rfc_test

# Calculate accuracy on test data
accuracy = accuracy_score(y_test, boosted_predictions_test)
print("Accuracy:", accuracy)

rmse = mean_squared_error(y_test, boosted_predictions_test, squared=False)
print("Root Mean Squared Error (Boosted Model):", rmse)

Accuracy: 0.9212617385022875
Root Mean Squared Error (Boosted Model): 0.28570568594948736
