In [4]:
# Import necessary libraries
import pandas as pd
import string
import nltk
import joblib

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to E:\Python\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to E:\Python\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load dataset with full paths
df_fake = pd.read_csv(r'E:\jupyter notebooks\Task3\Fake.csv')  # Fake news dataset
df_real = pd.read_csv(r'E:\jupyter notebooks\Task3\True.csv')  # Real news dataset

# Add labels
df_fake['label'] = 0  # Fake news label
df_real['label'] = 1  # Real news label

In [7]:
# Sample 1,000 articles from each category
fake_sample = df_fake.sample(n=1000, random_state=42)  # Randomly select 1k fake articles
real_sample = df_real.sample(n=1000, random_state=42)  # Randomly select 1k real articles

# Combine the sampled datasets
df = pd.concat([fake_sample, real_sample], ignore_index=True)  # Merge samples
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle data

In [9]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define text cleaning function
def clean_text(text):
    if isinstance(text, str):  # Ensure it's a string before processing
        # Remove punctuation and convert to lowercase
        text = text.lower().translate(str.maketrans('', '', string.punctuation))
        # Tokenize, remove stopwords, and lemmatize
        words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
        return ' '.join(words)
    else:
        return ''  # Return empty string if not a valid text

# Apply the cleaning function
df['cleaned_text'] = df['text'].apply(clean_text)


In [10]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])  # Transform cleaned text into numerical data
y = df['label']  # Labels (0 for fake, 1 for real)

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train a Naïve Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)  # Train the model

# Make predictions and evaluate accuracy
y_pred = model.predict(X_test)  # Predict on test data
print("Accuracy:", accuracy_score(y_test, y_pred))  # Print accuracy score

Accuracy: 0.935


In [13]:
# Save the trained model
joblib.dump(model, 'model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

print("Model and vectorizer saved!")

Model and vectorizer saved!
