## Importing libraries

In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split # To split data into train and test
from sklearn.feature_extraction.text import TfidfVectorizer # To convert text into numbers
from sklearn.linear_model import LogisticRegression # Model
from sklearn.metrics import accuracy_score, classification_report # For evaluation

## Load dataset

In [40]:
# Remote CSV of news articles; kept in a named constant so the data source is easy to spot and change.
DATA_URL = "https://raw.githubusercontent.com/GeorgeMcIntire/fake_real_news_dataset/main/fake_and_real_news_dataset.csv"

# Read the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()


Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,REAL
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,REAL
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL


In [41]:
# The frame is expected to hold exactly these four columns, in this order;
# the names are assigned positionally.
df.columns = ['idd', 'title', 'text', 'label']

# Encode the label column as integers: REAL -> 1, FAKE -> 0.
label_codes = {'REAL': 1, 'FAKE': 0}

# Skip the encoding when the labels are already numeric, so re-running this
# cell does not fail on `.str` or overwrite the encoded labels.
if not pd.api.types.is_numeric_dtype(df['label']):
    # Remove spaces or newline chars; without this the mapping produced NaN labels.
    label_text = df['label'].str.strip()
    label_encoded = label_text.map(label_codes)

    # `.map` silently yields NaN for any value outside `label_codes`; fail loudly
    # here instead of handing NaN targets to the model later on.
    unmapped = label_encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"{int(unmapped.sum())} label(s) are not REAL/FAKE, e.g. "
            f"{label_text[unmapped].unique()[:5].tolist()}"
        )
    df['label'] = label_encoded.astype('int64')

# Split into features (article text) and target (encoded label)
X = df['text']
y = df['label']

df.head()

Unnamed: 0,idd,title,text,label
0,Fq+C96tcx+,‘A target on Roe v. Wade ’: Oklahoma bill maki...,UPDATE: Gov. Fallin vetoed the bill on Friday....,1
1,bHUqK!pgmv,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,1
2,4Y4Ubf%aTi,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",1
3,_CoY89SJ@K,Grand jury in Texas indicts activists behind P...,A Houston grand jury investigating criminal al...,1
4,+rJHoRQVLe,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,1


In [45]:
# Split into train and test sets: 20% of the rows are held out for evaluation,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Vectorize text with TF-IDF

In [46]:
# Configure the TF-IDF vectorizer: drop English stop words and cap the
# document frequency of kept terms with max_df=0.7.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary on the training text only and transform it ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... then only transform the test text with that already-fitted vocabulary.
X_test_tfidf = vectorizer.transform(X_test)



## Train Logistic Regression Model

In [47]:
# Create a logistic-regression classifier; max_iter=1000 sets the solver's iteration limit.
model = LogisticRegression(max_iter=1000)
# Fit on the TF-IDF training matrix and the training labels.
# The fit call is left as the cell's last expression.
model.fit(X_train_tfidf, y_train)

## Evaluate the model

In [49]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy first, then the per-class precision / recall / F1 report.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.9085963003264418
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       450
           1       0.95      0.87      0.91       469

    accuracy                           0.91       919
   macro avg       0.91      0.91      0.91       919
weighted avg       0.91      0.91      0.91       919



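Our fake news detector correctly classifies about 91% of the test articles overall. For real news (class 1), it is very precise (0.95), meaning it rarely mislabels fake articles as real. For fake news (class 0), it has high recall (0.95), meaning it catches most of the fake content. The trade-off is that recall for real news is lower (0.87) and precision for fake news is 0.87, so roughly 13% of real articles are wrongly flagged as fake.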