# Fake News Detector

## Installing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import time
import os

print("=" * 70)
print("    FAKE NEWS DETECTION - TRAINING WITH TITLE + TEXT COMBINED")
print("=" * 70)

os.makedirs("model", exist_ok=True)

    FAKE NEWS DETECTION - TRAINING WITH TITLE + TEXT COMBINED


## Loading the data

In [2]:
print("\n[1/7] Loading datasets...")
start_time = time.time()

data_fake = pd.read_csv('Datasets/Fake.csv')
data_true = pd.read_csv('Datasets/True.csv')

print(f"‚úì Fake news: {len(data_fake):,} articles")
print(f"‚úì True news: {len(data_true):,} articles")

data_fake["class"] = 0
data_true['class'] = 1


[1/7] Loading datasets...
‚úì Fake news: 23,481 articles
‚úì True news: 21,417 articles


### Data Preparation

In [3]:
print("\n[2/7] Preparing data...")

# Remove last 10 for manual testing
data_fake = data_fake[:-10]
data_true = data_true[:-10]

# Merge datasets
data_merge = pd.concat([data_fake, data_true], axis=0)

# Keep title, text, and class
data = data_merge[['title', 'text', 'class']].copy()

# Handle missing values
data['title'] = data['title'].fillna('')
data['text'] = data['text'].fillna('')

# Shuffle
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"‚úì Total samples: {len(data):,}")
print(f"‚úì Fake: {(data['class']==0).sum():,}, Real: {(data['class']==1).sum():,}")

# ============================================================
# 3. TEXT PREPROCESSING
# ============================================================
print("\n[3/7] Preprocessing text...")
preprocess_start = time.time()

def wordopt(text):
    """Clean and normalize text"""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\s+', ' ', text)
    return text.strip()

# Clean both title and text
data['title_clean'] = data['title'].apply(wordopt)
data['text_clean'] = data['text'].apply(wordopt)

# COMBINE TITLE AND TEXT - This is the key change!
# Title gets more weight by being mentioned first
data['combined'] = data['title_clean'] + ' ' + data['title_clean'] + ' ' + data['text_clean']

# Remove empty entries
data = data[data['combined'].str.len() > 10]

print(f"‚úì Preprocessing done in {time.time()-preprocess_start:.2f}s")
print(f"‚úì Final dataset: {len(data):,} articles")



[2/7] Preparing data...
‚úì Total samples: 44,878
‚úì Fake: 23,471, Real: 21,407

[3/7] Preprocessing text...


  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)
  text = re.sub('\s+', ' ', text)


‚úì Preprocessing done in 30.53s
‚úì Final dataset: 44,878 articles


#### Train And Test Split

In [4]:
print("\n[4/7] Splitting data...")

X = data['combined']  # Use combined title+text
y = data['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print(f"‚úì Train: {len(X_train):,}, Test: {len(X_test):,}")



[4/7] Splitting data...
‚úì Train: 33,658, Test: 11,220


## Vectorization

In [5]:
print("\n[5/7] Vectorizing (2-4 minutes)...")
vec_start = time.time()

vectorizer = TfidfVectorizer(
    max_features=50000,
    min_df=5,
    max_df=0.8,
    ngram_range=(1, 2),
    sublinear_tf=True
)

xv_train = vectorizer.fit_transform(X_train)
xv_test = vectorizer.transform(X_test)

print(f"‚úì Features: {xv_train.shape[1]:,}")
print(f"‚úì Time: {time.time()-vec_start:.2f}s")

# Save vectorizer
joblib.dump(vectorizer, "model/tfidf_vectorizer.pkl")
print("‚úì Vectorizer saved")


[5/7] Vectorizing (2-4 minutes)...
‚úì Features: 50,000
‚úì Time: 32.02s
‚úì Vectorizer saved


## Training models

In [6]:
print("\n[6/7] Training models...")
print("-" * 70)

models = {
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, max_depth=100, random_state=42, n_jobs=-1
    ),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=100, random_state=42
    )
}

results = {}

for name, model in models.items():
    print(f"\nüîÑ Training {name}...")
    t_start = time.time()
    
    model.fit(xv_train, y_train)
    
    train_acc = model.score(xv_train, y_train)
    test_acc = model.score(xv_test, y_test)
    t_time = time.time() - t_start
    
    print(f"  Train: {train_acc:.4f}, Test: {test_acc:.4f}, Time: {t_time:.2f}s")
    
    # Save model
    filename = f"model/{name.lower().replace(' ', '_')}_model.pkl"
    joblib.dump(model, filename)
    print(f"  üíæ Saved to {filename}")
    
    results[name] = {
        'model': model,
        'test_acc': test_acc,
        'train_acc': train_acc
    }


[6/7] Training models...
----------------------------------------------------------------------

üîÑ Training Gradient Boosting...
  Train: 1.0000, Test: 0.9982, Time: 1306.58s
  üíæ Saved to model/gradient_boosting_model.pkl

üîÑ Training Random Forest...
  Train: 1.0000, Test: 0.9947, Time: 7.50s
  üíæ Saved to model/random_forest_model.pkl

üîÑ Training Decision Tree...
  Train: 1.0000, Test: 0.9969, Time: 32.64s
  üíæ Saved to model/decision_tree_model.pkl


In [7]:
print("\n[7/7] Summary")
print("=" * 70)
print(f"‚è±Ô∏è  Total time: {(time.time()-start_time)/60:.2f} minutes\n")

print("üìä Performance:")
for name, data in results.items():
    print(f"  {name:20} Test: {data['test_acc']:.4f}")

best = max(results.items(), key=lambda x: x[1]['test_acc'])
print(f"\nüèÜ Best: {best[0]} ({best[1]['test_acc']:.4f})")

# ============================================================
# TESTING FUNCTION
# ============================================================
print("\n" + "=" * 70)
print("TESTING FUNCTION")
print("=" * 70)

def test_news(title, text, model_name="Gradient Boosting"):
    """
    Test news with title and text
    
    Args:
        title: News title (can be empty string)
        text: News content
        model_name: Which model to use
    """
    # Clean inputs
    title_clean = wordopt(title)
    text_clean = wordopt(text)
    
    # Combine same way as training (title appears twice for emphasis)
    combined = f"{title_clean} {title_clean} {text_clean}"
    
    # Vectorize
    vec = vectorizer.transform([combined])
    
    # Predict
    model = results[model_name]['model']
    pred = model.predict(vec)[0]
    
    result = "FAKE NEWS ‚ùå" if pred == 0 else "REAL NEWS ‚úÖ"
    
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(vec)[0]
        print(f"\nüîç Model: {model_name}")
        print(f"üì∞ Prediction: {result}")
        print(f"üìä Confidence: Fake={proba[0]:.1%}, Real={proba[1]:.1%}")
    else:
        print(f"\nüîç Model: {model_name}")
        print(f"üì∞ Prediction: {result}")
    
    return pred

print("\n‚úÖ Testing function ready!")
print("\nüìù Usage:")
print('   test_news("Breaking News Title", "Full article text...", "Gradient Boosting")')
print("\nüí° You can now test with BOTH title and content!")

print("\n" + "=" * 70)
print("‚ú® TRAINING COMPLETE! ‚ú®")
print("=" * 70)
print("\nüéØ Your models now work with:")
print("   1. Just text (if title is empty)")
print("   2. Title + text (for better accuracy)")
print("   3. Just title (if text is empty, but less accurate)")
print("\nüöÄ Ready for Streamlit deployment!")
print("=" * 70)


[7/7] Summary
‚è±Ô∏è  Total time: 23.53 minutes

üìä Performance:
  Gradient Boosting    Test: 0.9982
  Random Forest        Test: 0.9947
  Decision Tree        Test: 0.9969

üèÜ Best: Gradient Boosting (0.9982)

TESTING FUNCTION

‚úÖ Testing function ready!

üìù Usage:
   test_news("Breaking News Title", "Full article text...", "Gradient Boosting")

üí° You can now test with BOTH title and content!

‚ú® TRAINING COMPLETE! ‚ú®

üéØ Your models now work with:
   1. Just text (if title is empty)
   2. Title + text (for better accuracy)
   3. Just title (if text is empty, but less accurate)

üöÄ Ready for Streamlit deployment!
