In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load the WELFake_Dataset
file_path1 = r"C:\Users\Armaan\OneDrive\Documents\Projects\FakeNewsDetector\WELFake_Dataset.csv"
data1 = pd.read_csv(file_path1)

# Load the FA-KES-Dataset with encoding specified
file_path2 = r"C:\Users\Armaan\OneDrive\Documents\Projects\FakeNewsDetector\FA-KES-Dataset.csv"
data2 = pd.read_csv(file_path2, encoding='ISO-8859-1')  # Try 'utf-8' if this doesn't work

# Standardize the FA-KES-Dataset column names to match WELFake_Dataset
data2 = data2.rename(columns={
    'article_title': 'title',
    'article_content': 'text',
    'labels': 'label'
})

# Keep only relevant columns
data1 = data1[['title', 'text', 'label']].dropna()
data2 = data2[['title', 'text', 'label']].dropna()

# Combine the datasets
combined_data = pd.concat([data1, data2], ignore_index=True)

# Combine 'title' and 'text' into a single field
combined_data['content'] = combined_data['title'] + " " + combined_data['text']

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply preprocessing to the combined data
combined_data['content'] = combined_data['content'].apply(preprocess_text)

# Define features and labels
X = combined_data['content']
y = combined_data['label']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# After loading data1
print("WELFake_Dataset Loaded:")
print(data1.head())
print(data1.info())

# After loading data2
print("\nFA-KES-Dataset Loaded:")
print(data2.head())
print(data2.info())

# After combining datasets
print("\nCombined Dataset:")
print(combined_data.head())
print(combined_data.info())

# After preprocessing
print("\nPreprocessed Content Sample:")
print(combined_data['content'].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Armaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


WELFake_Dataset Loaded:
                                               title  \
0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3  Bobby Jindal, raised Hindu, uses story of Chri...   
4  SATAN 2: Russia unvelis an image of its terrif...   
5  About Time! Christian Group Sues Amazon and SP...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  
5  All we can say on this one is it s about time ...      1  
<class 'pandas.core.frame.DataFrame'>
Index: 71537 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71537 non-null  object
 1   text    71537 non-null  object
 2   label   71537 n

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Vectorize the WELFake dataset
vectorizer = TfidfVectorizer(max_features=5000)

# Split WELFake dataset into training and testing sets
X_train_welfake, X_test_welfake, y_train_welfake, y_test_welfake = train_test_split(
    data1['text'].apply(preprocess_text),
    data1['label'],
    test_size=0.2,
    random_state=42,
    stratify=data1['label']
)

# Apply TF-IDF vectorization
X_train_welfake_tfidf = vectorizer.fit_transform(X_train_welfake)
X_test_welfake_tfidf = vectorizer.transform(X_test_welfake)

# Initialize Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the WELFake dataset
model.fit(X_train_welfake_tfidf, y_train_welfake)

# Evaluate the model on the WELFake test set
y_pred_welfake = model.predict(X_test_welfake_tfidf)

# Print the evaluation results
print("Evaluation on WELFake Dataset:")
print(classification_report(y_test_welfake, y_pred_welfake))
print("Accuracy on WELFake:", accuracy_score(y_test_welfake, y_pred_welfake))


Evaluation on WELFake Dataset:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      7006
           1       0.84      0.87      0.85      7302

    accuracy                           0.85     14308
   macro avg       0.85      0.85      0.85     14308
weighted avg       0.85      0.85      0.85     14308

Accuracy on WELFake: 0.848406485882024


In [13]:
# Evaluate the model on the WELFake test set
y_pred_welfake = model.predict(X_test_welfake_tfidf)

# Print the evaluation results
print("Evaluation on WELFake Dataset (Test Set):")
print(classification_report(y_test_welfake, y_pred_welfake))
print("Accuracy on WELFake Test Set:", accuracy_score(y_test_welfake, y_pred_welfake))


Evaluation on WELFake Dataset (Test Set):
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      7006
           1       0.84      0.87      0.85      7302

    accuracy                           0.85     14308
   macro avg       0.85      0.85      0.85     14308
weighted avg       0.85      0.85      0.85     14308

Accuracy on WELFake Test Set: 0.848406485882024


In [16]:
# Example new news articles for testing
new_articles = [
    "The government has announced a new plan to combat climate change. The initiative will focus on reducing carbon emissions and promoting renewable energy sources.",
    "Scientists have discovered a new species of dinosaur in Antarctica, which could shed light on the region's prehistoric ecosystem.",
    "Celebrity gossip: A famous singer is reportedly in a relationship with a fellow artist. Fans are excited about the new couple."
]

# Preprocess the new articles (apply the same preprocessing as before)
new_articles_preprocessed = [preprocess_text(article) for article in new_articles]

# Vectorize the new articles using the same TF-IDF vectorizer
new_articles_tfidf = vectorizer.transform(new_articles_preprocessed)

# Make predictions on the new articles
predictions = model.predict(new_articles_tfidf)

# Output the predictions (flip the label interpretation if necessary)
for i, article in enumerate(new_articles):
    print(f"Article {i+1}:")
    print(f"Content: {article}")
    print(f"Prediction: {'Fake' if predictions[i] == 1 else 'Real'}")  # Adjusted for correct output
    print("-" * 50)


Article 1:
Content: The government has announced a new plan to combat climate change. The initiative will focus on reducing carbon emissions and promoting renewable energy sources.
Prediction: Real
--------------------------------------------------
Article 2:
Content: Scientists have discovered a new species of dinosaur in Antarctica, which could shed light on the region's prehistoric ecosystem.
Prediction: Fake
--------------------------------------------------
Article 3:
Content: Celebrity gossip: A famous singer is reportedly in a relationship with a fellow artist. Fans are excited about the new couple.
Prediction: Fake
--------------------------------------------------


In [17]:
import joblib

# Save the trained model to a file
joblib.dump(model, 'fake_news_model_welfake.pkl')

# Save the vectorizer to a file (since you'll need to use the same vectorizer for future predictions)
joblib.dump(vectorizer, 'tfidf_vectorizer_welfake.pkl')

print("Model and Vectorizer saved successfully!")


Model and Vectorizer saved successfully!
