Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix





In [None]:
# Read each source file and tag every row with its class: fake -> 1, real -> 0.
df_fake = pd.read_csv("/content/Fake.csv").assign(label=1)
df_real = pd.read_csv("/content/True.csv").assign(label=0)

# Stack the two frames, shuffle all rows with a fixed seed so the order is
# reproducible, and renumber the index from zero.
df = (
    pd.concat([df_fake, df_real], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the combined frame (without the index column) for the later cells.
df.to_csv("/content/merged_news.csv", index=False)
print("Merged dataset saved as 'merged_news.csv'")


Merged dataset saved as 'merged_news.csv'


In [None]:
# Load Fake News dataset
df_fake = pd.read_csv("/content/Fake.csv")
df_fake["label"] = 1  # Assign label 1 for Fake news

# Load True News dataset
df_True = pd.read_csv("/content/True.csv")
df_True["label"] = 0  # Assign label 0 for Real news

# Merge both datasets
df = pd.concat([df_fake, df_True], ignore_index=True)

# Display dataset info
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
None
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk

Load & Preprocess Data

In [None]:
# Fake articles get label 1, real articles get label 0.
df_fake = pd.read_csv("/content/Fake.csv")
df_fake = df_fake.assign(label=1)
df_real = pd.read_csv("/content/True.csv")
df_real = df_real.assign(label=0)

# Row-wise concatenation of the two labelled frames.
df = pd.concat((df_fake, df_real), axis=0)

# Full-sample shuffle with a fixed seed, then a clean 0..n-1 index.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

# Write the merged data to disk without the index column.
df.to_csv("/content/merged_news.csv", index=False)
print("Merged dataset saved as 'merged_news.csv'")



Merged dataset saved as 'merged_news.csv'


In [None]:
# Re-read the merged CSV written by the previous cell into `df`.
df = pd.read_csv(filepath_or_buffer="/content/merged_news.csv")


Preprocess Data

In [None]:
# Read the merged (fake + real) dataset back from disk.
# Point this at your own merged file if it lives somewhere else.
df = pd.read_csv("/content/merged_news.csv")

# Only the article body and its label are used from here on; rows where
# either of the two is missing are discarded.
df = df.loc[:, ['text', 'label']].dropna()

# Row count per class (1 = fake, 0 = real) to see how balanced the data is.
label_counts = df['label'].value_counts()
print(label_counts)


label
1    23481
0    21417
Name: count, dtype: int64


Clean Text Data

In [None]:
def clean_text(text):
    """Normalize raw article text before it is vectorized.

    The steps run in this order: lowercase; remove ``[...]`` spans; remove
    URLs; remove ``<...>`` tags; remove punctuation; remove digits; collapse
    every run of whitespace to a single space and trim both ends.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float NaN of a missing cell) is treated as empty.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # A missing value has no .lower(); treat it as an empty document instead
    # of raising AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets (single line only: '.' skips newlines)
    text = re.sub(r"https?://\S+|www\.\S+", '', text)  # Remove URLs
    text = re.sub(r"<.*?>+", '', text)  # Remove HTML tags
    text = re.sub(r"[^\w\s]", '', text)  # Remove punctuation (underscores count as \w and are kept)
    text = re.sub(r"\d+", '', text)  # Remove numbers
    # The removals above leave gaps behind; squeeze every whitespace run
    # (spaces, tabs, newlines) to one space, then trim the ends.
    text = re.sub(r"\s+", ' ', text).strip()
    return text

# Run every article body through clean_text and store the result back in 'text'.
df['text'] = df['text'].map(clean_text)


Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


Convert Text to Numerical Form (TF-IDF)


In [None]:
# TF-IDF with English stop words removed and the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


Train ML Model


In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so it can be chained).
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Labels predicted for the held-out test matrix
y_pred = model.predict(X_test_tfidf)

# Report overall accuracy followed by per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)


Model Accuracy: 0.9863

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4270
           1       0.99      0.98      0.99      4710

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



Prediction Function

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# NOTE(review): this cell refits on the 'text' column exactly as read from the
# CSV. The clean_text step and the max_features=5000 vectorizer used for the
# accuracy report earlier in this notebook are not used here, so the model
# saved below is not the one that was evaluated.

# Load dataset
df = pd.read_csv("/content/merged_news.csv")  # Change if needed

# Ensure correct columns
required_columns = {'text', 'label'}
if not required_columns.issubset(df.columns):
    print("Dataset must have 'text' and 'label' columns!")
else:
    # read_csv turns empty fields into NaN, and TfidfVectorizer rejects NaN
    # documents, so rows missing either the text or the label are dropped
    # first (the same filtering the preprocessing cell applies).
    df_complete = df.dropna(subset=['text', 'label'])

    # Prepare data
    X = df_complete['text']
    y = df_complete['label']

    # Split data: 20% held out, fixed seed for a repeatable split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert text to TF-IDF features; terms that occur in more than 70% of
    # the training documents are ignored (max_df=0.7)
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train a Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)

    # Save model and vectorizer so the prediction cell can load them from disk
    joblib.dump(model, "/content/fake_news_model.pkl")
    joblib.dump(vectorizer, "/content/tfidf_vectorizer.pkl")

    print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
# Restore the persisted classifier and its TF-IDF vectorizer, in that order.
# joblib.load unpickles the files, so only load artifacts you produced yourself.
model, vectorizer = map(
    joblib.load,
    ("/content/fake_news_model.pkl", "/content/tfidf_vectorizer.pkl"),
)

def predict_news(text):
    """Classify a single news text as fake or real.

    The text is wrapped in a one-element list, transformed with the
    module-level ``vectorizer``, and passed to the module-level ``model``;
    the first (only) predicted label decides the answer.

    Args:
        text (str): The news text to classify.

    Returns:
        str: "Fake News" when the predicted label equals 1, otherwise
        "Real News".
    """
    features = vectorizer.transform([text])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Fake News"
    return "Real News"

# Demo: classify one headline and print the verdict.
# NOTE(review): the model is fitted on the 'text' column, not on titles, so a
# short headline may be outside what it was trained on - confirm this is the
# intended input.
news_text = "FBI Russia probe helped by Australian diplomat tip-off: NYT"
print(predict_news(text=news_text))


Fake News
