In [3]:
import pandas as pd

# Create a sample DataFrame since 'data/Fake.csv' does not exist
df = pd.DataFrame({
	'Name': ['Alice', 'Bob', 'Charlie'],
	'Age': [25, 30, 35],
	'City': ['New York', 'Los Angeles', 'Chicago']
})


In [5]:
# The file 'data/True.csv' does not exist, so we will use the existing sample DataFrame 'df'
print(df)


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [6]:
import pandas as pd

# Load data (after downloading and placing files)
fake = pd.read_csv('fake.csv')
real = pd.read_csv('true.csv')

# Add labels: 0 = fake, 1 = real
fake['label'] = 0
real['label'] = 1

# Combine, shuffle
df = pd.concat([fake, real]).sample(frac=1, random_state=42).reset_index(drop=True)

print("Total articles:", df.shape)
print(df['label'].value_counts())


Total articles: (44898, 5)
label
0    23481
1    21417
Name: count, dtype: int64


In [7]:
import pandas as pd

# Load data (after downloading and placing files)
fake = pd.read_csv('fake.csv')
real = pd.read_csv('true.csv')

# Add labels: 0 = fake, 1 = real
fake['label'] = 0
real['label'] = 1

# Combine, shuffle
df = pd.concat([fake, real]).sample(frac=1, random_state=42).reset_index(drop=True)

print("Total articles:", df.shape)
print(df['label'].value_counts())


Total articles: (44898, 5)
label
0    23481
1    21417
Name: count, dtype: int64


In [11]:
# 📘 Fake News Detection using Fake.csv and True.csv

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Load the CSV files
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Step 2: Add labels
fake['label'] = 0  # Fake news
true['label'] = 1  # Real news

# Step 3: Combine the datasets
df = pd.concat([fake, true]).sample(frac=1).reset_index(drop=True)

# Step 4: Preprocessing - drop unneeded columns
df = df[['text', 'label']]  # Using only the article text

# Step 5: Train-test split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 6: TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 7: Train the model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Step 8: Make predictions
y_pred = model.predict(X_test_vec)

# Step 9: Evaluate the model
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("📝 Classification Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 0.9848552338530067
📊 Confusion Matrix:
 [[5763   97]
 [  73 5292]]
📝 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      5860
           1       0.98      0.99      0.98      5365

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225



In [14]:
# 📘 Fake News Detection using Fake.csv and True.csv

# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Use already loaded DataFrames 'fake' and 'true'
# fake and true are already defined in the notebook

# Step 3: Add labels: 0 for fake, 1 for true
fake['label'] = 0
true['label'] = 1

# Step 4: Combine datasets and shuffle
df = pd.concat([fake, true])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 5: Keep only 'text' and 'label' columns
df = df[['text', 'label']]

# Step 6: Train-test split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 7: TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 8: Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Step 9: Evaluate the model
y_pred = model.predict(X_test_vec)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📝 Classification Report:\n", classification_report(y_test, y_pred))

# Step 10: Custom prediction function
def predict_news(text):
    vec = vectorizer.transform([text])
    pred = model.predict(vec)
    return "REAL" if pred[0] == 1 else "FAKE"

# Step 11: Try custom input
sample = "Government launches new policy to improve education."


✅ Accuracy: 0.9824498886414254

📊 Confusion Matrix:
 [[5747  116]
 [  81 5281]]

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      5863
           1       0.98      0.98      0.98      5362

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225



In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9360801781737194


In [17]:
# The variables 'fake', 'true', 'df', 'X', and 'y' are already defined in the notebook.
# You can use them directly here without reloading or redefining.

# Example: Display the first few rows of the combined DataFrame
df.head()


Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,"On Monday, Donald Trump once again embarrassed...",0
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.98      5950
           1       0.98      0.99      0.98      5275

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225



In [25]:
def predict_news(text):
    vec = vectorizer.transform([text])
    if vec.shape[1] == 0 or vec.nnz == 0:
        return "Input text contains no known words."
    pred = model.predict(vec)
    if len(pred) == 0:
        return "Prediction could not be made."
    return "REAL" if pred[0] == 1 else "FAKE"
