In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Fetch the NLTK resources used by the tokenising / lemmatising cells below.
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence/word tokenizer from the
# 'punkt_tab' package instead of the pickled 'punkt' models; fetching both
# keeps word_tokenize working on old and new installs alike.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\techl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\techl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\techl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\techl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Patterns are compiled once at cell-execution time; the expressions themselves
# are the same ones previously passed inline to re.sub.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_DISALLOWED_CHARS_PATTERN = re.compile(r'[^\w\s.,!?]')

# Filled on first use so the lemmatizer and stop-word list are built once
# instead of once per document.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the shared (lemmatizer, stop-word set), creating them on first call."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _NLP_RESOURCES['lemmatizer'], _NLP_RESOURCES['stop_words']


def advanced_preprocess(text):
    """Normalise one document into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase; strip URL-like runs (``http...``/``www...``);
    strip whitespace-delimited runs containing ``@``; drop every character
    that is not a word character, whitespace or one of ``. , ! ?``; tokenise
    with ``word_tokenize``; discard English stop words and tokens of two
    characters or fewer; lemmatise what remains.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` (e.g. NaN from a
        missing CSV field) is treated as empty.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _EMAIL_PATTERN.sub('', text)
    # '.', ',', '!' and '?' survive this step, but the length filter below
    # still removes any token of one or two characters, punctuation included.
    text = _DISALLOWED_CHARS_PATTERN.sub('', text)

    lemmatizer, stop_words = _get_nlp_resources()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if len(token) > 2 and token not in stop_words
    ]

    return ' '.join(tokens)

In [4]:
def get_text_statistics(text):
    """Compute eight numeric descriptors for one raw document.

    Parameters
    ----------
    text : object
        The document. Non-string input yields eight zeros.

    Returns
    -------
    list
        ``[num_words, num_unique_words, num_chars, num_sentences,
        avg_word_length, avg_sentence_length, sentiment_polarity,
        sentiment_subjectivity]``. Words are whitespace-separated chunks;
        sentences and sentiment come from ``TextBlob``. ``avg_word_length`` is
        the total character count of the string (whitespace and punctuation
        included) divided by the word count.
    """
    if not isinstance(text, str):
        return [0 for _ in range(8)]

    # Whitespace-based counts.
    words = text.split()
    word_count = len(words)
    unique_word_count = len(set(words))
    char_count = len(text)

    # TextBlob-based counts and sentiment.
    blob = TextBlob(text)
    sentence_count = len(blob.sentences)
    sentiment = blob.sentiment

    # max(..., 1) keeps the ratios defined when there are no words/sentences.
    chars_per_word = char_count / max(word_count, 1)
    words_per_sentence = word_count / max(sentence_count, 1)

    return [
        word_count,
        unique_word_count,
        char_count,
        sentence_count,
        chars_per_word,
        words_per_sentence,
        sentiment.polarity,
        sentiment.subjectivity,
    ]

In [6]:
# Read the two source files; paths are relative to the notebook's working directory.
print("Loading datasets...")
true_csv_path, fake_csv_path = 'True.csv', 'Fake.csv'
true_news = pd.read_csv(true_csv_path)
fake_news = pd.read_csv(fake_csv_path)

Loading datasets...


In [7]:
# Tag each frame with its class id before merging:
# rows from True.csv get 0, rows from Fake.csv get 1.
for news_frame, class_id in ((true_news, 0), (fake_news, 1)):
    news_frame['label'] = class_id

In [8]:
# Stack the two labelled frames row-wise and renumber the rows from 0.
df = pd.concat((true_news, fake_news), ignore_index=True)

In [9]:
# Quick sanity check: overall size and how many rows each class contributes.
label_counts = df['label'].value_counts()
print("Dataset shape:", df.shape)
print("\nLabel distribution:")
print(label_counts)

Dataset shape: (44898, 5)

Label distribution:
label
1    23481
0    21417
Name: count, dtype: int64


In [10]:
# Combine title and text into one document per row.
# Missing values are replaced with '' first: with a plain `+`, a NaN in either
# column turns the whole concatenation into NaN, so a row with only a title
# (or only a body) would lose the text it does have.
df['full_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')


In [11]:
# Clean, tokenise and lemmatise every document (element-wise over the column).
print("\nPreprocessing text...")
df['processed_text'] = df['full_text'].map(advanced_preprocess)


Preprocessing text...


In [12]:
# Extract per-document statistics from the raw (un-preprocessed) text.
print("Extracting text features...")

# Column names, in the order get_text_statistics returns its values.
TEXT_STAT_COLUMNS = [
    'num_words', 'num_unique_words', 'num_chars', 'num_sentences',
    'avg_word_length', 'avg_sentence_length',
    'sentiment_polarity', 'sentiment_subjectivity',
]

text_stats = df['full_text'].apply(get_text_statistics)
# Reuse the source index so each statistics row stays aligned with its
# document when the frames are concatenated column-wise later, even if df
# is ever filtered or re-indexed before this cell.
text_stats_df = pd.DataFrame(
    text_stats.tolist(),
    columns=TEXT_STAT_COLUMNS,
    index=text_stats.index,
)

Extracting text features...


In [13]:
# Feature matrix: the cleaned text column followed by the eight statistic columns.
# Target: the 0/1 class label.
y = df['label']
X = pd.concat((df['processed_text'], text_stats_df), axis='columns')

In [14]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# class proportions, and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [15]:
# TF-IDF over word 1- to 3-grams, vocabulary capped at 10,000 terms; terms seen
# in fewer than 2 documents or in more than 95% of documents are ignored.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
)

# Wrapped in a single-step pipeline so it can be plugged into a ColumnTransformer.
text_transformer = Pipeline(steps=[('tfidf', tfidf_vectorizer)])

In [16]:
# Route each column to its transformer: the cleaned text goes through TF-IDF,
# while the numeric statistic columns are passed through unchanged.
stat_columns = list(text_stats_df.columns)
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'processed_text'),
        ('stats', 'passthrough', stat_columns),
    ]
)

In [17]:
# Full pipeline: column-wise feature extraction followed by a gradient-boosted
# tree classifier.
# `use_label_encoder` is intentionally not passed: it is deprecated in XGBoost
# and ignored by current releases (which only warn about the unused
# parameter); the labels here are already the integers 0 and 1.
classifier = XGBClassifier(
    n_estimators=200,
    max_depth=12,
    learning_rate=0.1,
    subsample=0.8,          # each tree sees 80% of the rows
    colsample_bytree=0.8,   # ...and 80% of the feature columns
    random_state=42,
    eval_metric='logloss',
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [18]:
# 5-fold cross-validated accuracy of the whole pipeline (refit inside each fold).
# Note: this runs on the full X / y, i.e. it also includes the rows that the
# earlier split set aside as X_test.
print("\nPerforming cross-validation...")
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # two standard deviations across folds
print(f"Cross-validation accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")


Performing cross-validation...
Cross-validation accuracy: 0.9979 (+/- 0.0017)


In [19]:
# Fit the complete pipeline (feature extraction + classifier) on the training
# split only; the test split stays untouched for the evaluation cell.
print("\nTraining final model...")
model.fit(X=X_train, y=y_train)


Training final model...


In [20]:
# Score the fitted pipeline on the held-out split: overall accuracy first,
# then per-class precision / recall / F1 to four decimals.
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

report = classification_report(y_test, predictions, digits=4)
print("\nClassification Report:")
print(report)


Test Set Accuracy: 0.9984

Classification Report:
              precision    recall  f1-score   support

           0     0.9984    0.9984    0.9984      4284
           1     0.9985    0.9985    0.9985      4696

    accuracy                         0.9984      8980
   macro avg     0.9984    0.9984    0.9984      8980
weighted avg     0.9984    0.9984    0.9984      8980



In [21]:
# Single-article inference helper
def predict_news(title, text):
    """Classify one article with the fitted global ``model``.

    The title and body are joined with a single space, then run through the
    same two feature builders used for training (``advanced_preprocess`` and
    ``get_text_statistics``) to form a one-row frame with the training
    column layout.

    Parameters
    ----------
    title, text
        Headline and body of the article; both are interpolated into one string.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the first element of ``model.predict``
        and the first row of ``model.predict_proba`` for the one-row input.
    """
    raw_text = f"{title} {text}"

    # Names of the statistic columns, in the order get_text_statistics emits them.
    stat_names = (
        'num_words',
        'num_unique_words',
        'num_chars',
        'num_sentences',
        'avg_word_length',
        'avg_sentence_length',
        'sentiment_polarity',
        'sentiment_subjectivity',
    )

    # Assemble the single-row frame: text column first, then the statistics.
    row = {'processed_text': [advanced_preprocess(raw_text)]}
    for stat_name, stat_value in zip(stat_names, get_text_statistics(raw_text)):
        row[stat_name] = [stat_value]
    input_df = pd.DataFrame(row)

    predicted_labels = model.predict(input_df)
    class_probabilities = model.predict_proba(input_df)

    return predicted_labels[0], class_probabilities[0]

In [22]:
# Two hand-written samples: one written like conventional science reporting,
# one written like sensational clickbait.
# NOTE(review): in the recorded run both samples come back as "Fake" with
# >0.999 confidence, despite the ~0.998 hold-out accuracy. The model may be
# keying on dataset-specific cues rather than content; worth investigating
# before relying on these predictions.
test_cases = [
    {
        "title": "New Scientific Study Reveals Breakthrough in Cancer Research",
        "text": "Researchers at a leading university have discovered a novel mechanism that could potentially lead to more effective cancer treatments. The peer-reviewed study, published in Nature, demonstrates significant results in laboratory trials."
    },
    {
        "title": "SHOCKING: You Won't Believe What Scientists Found on Mars!",
        "text": "Anonymous sources claim NASA is hiding evidence of alien structures discovered on Mars. Conspiracy theorists say government agencies are covering up the truth about extraterrestrial life."
    }
]

print("\nTest Predictions:")
for case in test_cases:
    headline, body = case['title'], case['text']
    prediction, probability = predict_news(headline, body)
    # Class 1 is the label assigned to the Fake.csv rows.
    verdict = 'Fake' if prediction == 1 else 'Real'
    print(f"\nTitle: {headline}")
    print(f"Text: {body}")
    print(f"Prediction: {verdict}")
    print(f"Confidence: {max(probability):.4f}")


Test Predictions:

Title: New Scientific Study Reveals Breakthrough in Cancer Research
Text: Researchers at a leading university have discovered a novel mechanism that could potentially lead to more effective cancer treatments. The peer-reviewed study, published in Nature, demonstrates significant results in laboratory trials.
Prediction: Fake
Confidence: 0.9996

Title: SHOCKING: You Won't Believe What Scientists Found on Mars!
Text: Anonymous sources claim NASA is hiding evidence of alien structures discovered on Mars. Conspiracy theorists say government agencies are covering up the truth about extraterrestrial life.
Prediction: Fake
Confidence: 0.9997
