In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Ensure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# Shared resources used by the text-cleaning step
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Read each source file and tag every row with its class name
df_real = pd.read_csv('True.csv').assign(label='REAL')
df_fake = pd.read_csv('Fake.csv').assign(label='FAKE')

# Stack both sources under a fresh 0..n-1 index, keep only the article text
# and its class, and expose the text under the column name 'content'
df = (
    pd.concat([df_real, df_fake], ignore_index=True)
    .loc[:, ['text', 'label']]
    .rename(columns={'text': 'content'})
)

# Translation table that deletes every ASCII punctuation character; built once
# here rather than on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalize one document: lowercase, strip punctuation, drop stopwords, stem.

    The text is lowercased, stripped of the ASCII punctuation listed in
    ``string.punctuation`` (non-ASCII punctuation such as curly quotes is
    left untouched), split on whitespace, filtered against the module-level
    ``stop_words`` set, and each remaining token is passed through the
    module-level ``stemmer``.

    Args:
        text: Raw document text. Anything that is not a ``str`` -- for
            example the float NaN that pandas produces for an empty CSV
            cell, or ``None`` -- is treated as an empty document instead
            of raising ``AttributeError``.

    Returns:
        The surviving stemmed tokens joined by single spaces; ``''`` when
        no token survives or the input was not a string.
    """
    # Guard against missing values so one bad row cannot abort a whole
    # Series.apply() pass.
    if not isinstance(text, str):
        return ''

    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    # Filter stopwords before stemming so the lookup sees the unstemmed word.
    return ' '.join(
        stemmer.stem(word)
        for word in cleaned.split()
        if word not in stop_words
    )

# Encode the class names as integers (REAL -> 0, FAKE -> 1)
df['label'] = df['label'].map({'REAL': 0, 'FAKE': 1})

# Run every article through preprocess()
df['content'] = df['content'].apply(preprocess)

# Write the cleaned frame to disk without the row index
df.to_csv('cleaned_fake_news.csv', index=False)

# Optional sanity check: the first rows, then the per-class row counts
print(df.head(), df['label'].value_counts(), sep='\n')
