<a href="https://colab.research.google.com/github/2303A51529/NLP-LAB/blob/main/NLP_LAB_TEST_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

def run_robust_baseline():
    """Tune and evaluate a TF-IDF + Multinomial Naive Bayes text classifier.

    Steps:
      1. Load the full 20 Newsgroups dataset with headers, footers and quoted
         replies stripped from every post.
      2. Hold out 20% of the documents as a test set (fixed random_state=42).
      3. Grid-search the Naive Bayes smoothing strength and the TF-IDF
         vocabulary size with 3-fold cross-validation on the training split.
      4. Print the best parameters, the held-out accuracy and the total
         elapsed time.

    Returns:
        None. All results are reported through ``print``.
    """
    # perf_counter() is intended for measuring elapsed intervals; unlike
    # time.time() it is not affected by system clock adjustments mid-run.
    start_time = time.perf_counter()
    print("--- Starting Robust Baseline (Target: ~1 min) ---")

    # 1. Load Data (Full Dataset)
    # Removing metadata (headers, footers AND quoted replies) makes the task
    # harder and more realistic: the model must rely on the message body.
    print("Loading dataset...")
    data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    # 2. Split Data
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

    # 3. Create a Pipeline
    # TfidfVectorizer: weights terms so that those appearing in fewer documents count more
    # ngram_range=(1, 2): looks at single words AND pairs of words (bi-grams)
    pipeline = make_pipeline(
        TfidfVectorizer(stop_words='english', ngram_range=(1, 2)),
        MultinomialNB()
    )

    # 4. Grid Search for Hyperparameters (This consumes the most time)
    # Keys follow make_pipeline's auto-generated step names (lower-cased class
    # name) joined to the parameter with a double underscore.
    parameters = {
        'multinomialnb__alpha': [0.01, 0.1, 1.0],
        'tfidfvectorizer__max_features': [5000, 10000]
    }

    print("Starting Grid Search (Cross-Validation)...")
    # cv=3 means it trains the model 3 times for EVERY parameter combination
    grid_search = GridSearchCV(pipeline, parameters, cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print(f"Best Parameters found: {grid_search.best_params_}")

    # 5. Evaluate
    print("Evaluating on Test Set...")
    # Predict once and derive accuracy from those predictions. Calling
    # grid_search.score() in addition would vectorize and classify the whole
    # test set a second time just to compute the same fraction of correct labels.
    predicted = grid_search.predict(X_test)
    accuracy = (predicted == y_test).mean()

    duration = time.perf_counter() - start_time

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Total Execution Time: {duration:.2f} seconds")

# Entry-point guard: run the experiment when this code executes as the main
# module (script or notebook cell); it is skipped if the code is imported.
if __name__ == "__main__":
    run_robust_baseline()

--- Starting Robust Baseline (Target: ~1 min) ---
Loading dataset...
Starting Grid Search (Cross-Validation)...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Parameters found: {'multinomialnb__alpha': 0.1, 'tfidfvectorizer__max_features': 10000}
Evaluating on Test Set...

Accuracy: 0.7122
Total Execution Time: 145.99 seconds


In [2]:
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def run_fast_baseline():
    """Train and evaluate a bag-of-words + Multinomial Naive Bayes classifier.

    Steps:
      1. Load the full 20 Newsgroups dataset with all metadata left in place.
      2. Hold out 20% of the documents as a test set (fixed random_state=42).
      3. Fit a CountVectorizer + MultinomialNB(alpha=0.1) pipeline in a single
         pass, with no hyperparameter search.
      4. Print the held-out accuracy and the total elapsed time.

    Note:
        Because headers and footers are kept, the reported accuracy is not
        directly comparable to a run on the metadata-stripped dataset.

    Returns:
        None. All results are reported through ``print``.
    """
    # perf_counter() is intended for measuring elapsed intervals; unlike
    # time.time() it is not affected by system clock adjustments mid-run.
    start_time = time.perf_counter()
    print("--- Starting Fast Baseline (Target: < 30 sec) ---")

    # 1. Load Data
    # We keep headers/footers here as they often contain "giveaway" keywords that make classification easier/faster
    print("Loading dataset...")
    data = fetch_20newsgroups(subset='all')

    # 2. Split Data
    X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

    # 3. Create Simple Pipeline
    # CountVectorizer: Just counts word frequency (faster than TF-IDF)
    model = make_pipeline(
        CountVectorizer(stop_words='english'),
        MultinomialNB(alpha=0.1) # Using a static alpha usually works well enough
    )

    # 4. Fit Model (Single pass, very fast)
    print("Training Model...")
    model.fit(X_train, y_train)

    # 5. Evaluate
    print("Evaluating...")
    # Predict once and derive accuracy from those predictions. Calling
    # model.score() in addition would vectorize and classify the whole test
    # set a second time just to compute the same fraction of correct labels.
    predicted = model.predict(X_test)
    accuracy = (predicted == y_test).mean()

    duration = time.perf_counter() - start_time

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Total Execution Time: {duration:.2f} seconds")

# Entry-point guard: run the experiment when this code executes as the main
# module (script or notebook cell); it is skipped if the code is imported.
if __name__ == "__main__":
    run_fast_baseline()

--- Starting Fast Baseline (Target: < 30 sec) ---
Loading dataset...
Training Model...
Evaluating...

Accuracy: 0.8918
Total Execution Time: 7.36 seconds
