In [2]:
import pandas as pd

# Load both CSV files
fake_df = pd.read_csv("Fake.csv")
real_df = pd.read_csv("True.csv")

# Add a label column
fake_df['label'] = 0  # 0 = Fake
real_df['label'] = 1  # 1 = Real

# Combine datasets
data = pd.concat([fake_df, real_df], ignore_index=True)

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

# Display basic info
print("Dataset shape:", data.shape)
data.head()


Dataset shape: (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Sarah Silverman’s Hysterical Video For Bernie...,"On Monday, Sarah Silverman released a hilariou...",News,"March 28, 2016",0
1,BOOM! CLOCK BOY’S DAD LOSES Defamation Case In...,It s hard to be famous for being a victim in...,politics,"Jan 10, 2017",0
2,Trump Makes ANOTHER Racist Joke About Mexican...,Donald Trump has been making racist comments a...,News,"June 30, 2016",0
3,SHERIFF ARPAIO Bombshell: Obama’s Birth Certif...,Maricopa County Sheriff Joe Arpaio says a new ...,left-news,"Dec 16, 2016",0
4,Senate Democrats ask Trump attorney general pi...,WASHINGTON (Reuters) - Nine Democratic senator...,politicsNews,"January 17, 2017",1


In [3]:
import nltk
import string
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Simple text cleaning function
def clean_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply to the 'text' column
data['cleaned_text'] = data['text'].apply(clean_text)

# Preview
data[['text', 'cleaned_text']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fathima\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,cleaned_text
0,"On Monday, Sarah Silverman released a hilariou...",monday sarah silverman released hilarious vide...
1,It s hard to be famous for being a victim in...,hard famous victim arab gulf state settles dis...
2,Donald Trump has been making racist comments a...,donald trump making racist comments hispanic p...
3,Maricopa County Sheriff Joe Arpaio says a new ...,maricopa county sheriff joe arpaio says new vi...
4,WASHINGTON (Reuters) - Nine Democratic senator...,washington reuters nine democratic senators as...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Vectorization
vectorizer = TfidfVectorizer(max_df=0.7)
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9875278396436525

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4694
           1       0.99      0.99      0.99      4286

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [6]:
import os

# Create directory if it doesn't exist
os.makedirs("../saved_models", exist_ok=True)


In [7]:
import joblib

# Save model and vectorizer
joblib.dump(model, "../saved_models/fake_news_model.pkl")
joblib.dump(vectorizer, "../saved_models/tfidf_vectorizer.pkl")


['../saved_models/tfidf_vectorizer.pkl']