<a href="https://colab.research.google.com/github/BhavikRaninga/Fake-Vs.-Real-News/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files this notebook loads are
# read from /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder on the mounted Drive that holds both CSV files. Keeping it in one
# place means a different Drive layout only needs a single edit.
DATA_DIR = '/content/drive/MyDrive/Kaggle News Data'

# One frame per source file; each is labelled by origin in a later cell.
fake = pd.read_csv(f'{DATA_DIR}/Fake.csv')
true = pd.read_csv(f'{DATA_DIR}/True.csv')

In [5]:
# Tag every row with the class of the file it was loaded from.
for frame, tag in ((fake, "FAKE"), (true, "REAL")):
  frame['label'] = tag

In [6]:
# Stack the two labelled frames, shuffle all rows with a fixed seed so the
# order is reproducible, then renumber the index from 0.
combined = pd.concat([fake, true], axis=0)
shuffled = combined.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

In [7]:
# Number of rows per class in the combined frame.
label_counts = df['label'].value_counts()
label_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
FAKE,23481
REAL,21417


In [8]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus; clean_text reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Cache for the NLTK stop-word set so the corpus is read once, not once per row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  if 'english' not in _STOP_WORD_CACHE:
    _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORD_CACHE['english']


def clean_text(text, stop_words=None):
  """Normalise one document for bag-of-words style vectorisation.

  Steps, in order:
    1. Drop every character that is not an ASCII letter or whitespace.
    2. Lower-case the result.
    3. Split on whitespace and discard stop words.
    4. Re-join the surviving words with single spaces.

  Args:
    text: The raw document as a ``str``.
    stop_words: Optional collection of lower-case words to remove. When
      omitted, NLTK's English stop-word list is used (loaded once and cached).

  Returns:
    The cleaned, lower-cased, space-separated string. It is empty when no
    words survive.
  """
  if stop_words is None:
    stop_words = _english_stop_words()
  # The class must be A-Z: the range A-z also spans the ASCII characters
  # between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept as "letters".
  text = re.sub(r'[^a-zA-Z\s]','',text)
  text = text.lower()
  words = [word for word in text.split() if word not in stop_words]
  return ' '.join(words)


In [10]:
# Run clean_text over every article body and store the result back in the
# 'text' column.
cleaned_text = df['text'].map(clean_text)
df['text'] = cleaned_text

In [11]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,st century wire says ben stein reputable profe...,US_News,"February 13, 2017",FAKE
1,Trump drops Steve Bannon from National Securit...,washington reuters us president donald trump r...,politicsNews,"April 5, 2017",REAL
2,Puerto Rico expects U.S. to lift Jones Act shi...,reuters puerto rico governor ricardo rossello ...,politicsNews,"September 27, 2017",REAL
3,OOPS: Trump Just Accidentally Confirmed He Le...,monday donald trump embarrassed country accide...,News,"May 22, 2017",FAKE
4,Donald Trump heads for Scotland to reopen a go...,glasgow scotland reuters us presidential candi...,politicsNews,"June 24, 2016",REAL


In [12]:
# Model input (article text) and target (FAKE / REAL label).
X, y = df['text'], df['label']

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [14]:
# 5-fold stratified splitter, shuffled with a fixed seed for reproducibility.
kf = StratifiedKFold(
  n_splits=5,
  shuffle=True,
  random_state=42,
)
# Running state for the cross-validation loop: the per-fold accuracies and a
# 1-based fold counter used in the printed report.
accuracies, fold = [], 1

In [15]:
# Stratified k-fold evaluation of a TF-IDF + logistic-regression classifier.
#
# Reset the running state here so that re-running this cell on its own does not
# append to scores left over from an earlier run (which would skew the average)
# or continue the previous fold numbering.
accuracies = []
fold = 1
# NOTE(review): the REAL rows shown by df.head() carry a "reuters" dateline near
# the start of the text; confirm the scores are not driven by that token alone.
for train_index,test_index in kf.split(X,y):
  # Split: positional indices from the splitter select this fold's rows.
  X_train,X_test = X.iloc[train_index],X.iloc[test_index]
  y_train,y_test = y.iloc[train_index],y.iloc[test_index]
  # Vectorization: fit the vocabulary and IDF weights on the training fold
  # only, then reuse them for the held-out fold so no test text leaks in.
  vectorizer = TfidfVectorizer(max_features = 5000)
  X_train_vec = vectorizer.fit_transform(X_train)
  X_test_vec = vectorizer.transform(X_test)
  # Train a fresh model for every fold.
  model = LogisticRegression(max_iter = 1000)
  model.fit(X_train_vec,y_train)
  # Predict on the held-out fold and record its accuracy.
  y_pred = model.predict(X_test_vec)
  acc = accuracy_score(y_test,y_pred)
  accuracies.append(acc)
  print(f"Fold {fold} Accuracy: {acc:.4f}")
  print(classification_report(y_test, y_pred, digits=4))
  fold += 1
# Mean accuracy over the folds evaluated in this run.
print(f"Average Accuracy: {np.mean(accuracies):.4f}")

Fold 1 Accuracy: 0.9875
              precision    recall  f1-score   support

        FAKE     0.9897    0.9864    0.9881      4697
        REAL     0.9851    0.9888    0.9869      4283

    accuracy                         0.9875      8980
   macro avg     0.9874    0.9876    0.9875      8980
weighted avg     0.9875    0.9875    0.9875      8980

Fold 2 Accuracy: 0.9884
              precision    recall  f1-score   support

        FAKE     0.9916    0.9862    0.9889      4696
        REAL     0.9849    0.9909    0.9879      4284

    accuracy                         0.9884      8980
   macro avg     0.9883    0.9885    0.9884      8980
weighted avg     0.9884    0.9884    0.9884      8980

Fold 3 Accuracy: 0.9886
              precision    recall  f1-score   support

        FAKE     0.9906    0.9876    0.9891      4696
        REAL     0.9865    0.9897    0.9881      4284

    accuracy                         0.9886      8980
   macro avg     0.9886    0.9887    0.9886      8980
we