# 01 Data Loading and Naive Baseline

In [None]:

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
from tqdm import tqdm


In [None]:

# Load the "ag_news" dataset through the `datasets` library.
dataset = load_dataset("ag_news")

# Keep a fixed-seed random subset of 5,000 rows from the "train" split so the
# rest of the notebook works on the same small sample on every run.
df = pd.DataFrame(dataset["train"]).sample(n=5000, random_state=42)

# Targets are the "label" column; inputs are the lower-cased "text" column.
labels = df["label"]
texts = df["text"].str.lower()

# Hold out 20% of the sample for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


In [None]:

# Hand-picked cue words for each class id.
# NOTE(review): the ids presumably follow the AG News label order
# (0 World, 1 Sports, 2 Business, 3 Sci/Tech) — confirm against the dataset card.
KEYWORDS = {
    0: ["war", "government", "president", "election", "country"],
    1: ["game", "team", "match", "season", "score", "player"],
    2: ["market", "company", "stock", "bank", "economy", "business"],
    3: ["technology", "science", "software", "ai", "computer", "chip"]
}


def keyword_baseline_predict(text: str) -> int:
    """Predict a class id for *text* by counting keyword hits.

    For every label in ``KEYWORDS`` the score is the number of that label's
    keywords that occur in *text* as whole words; a keyword counts at most
    once no matter how often it appears. Matching is case-insensitive, so the
    result does not depend on the caller lower-casing the text beforehand.

    Args:
        text: The document to classify.

    Returns:
        The label with the highest score. Ties — including the case where no
        keyword matches at all — go to the label listed first in ``KEYWORDS``.
    """
    scores = {
        label: sum(
            1
            for word in words
            # Escape the keyword so it is always matched literally, even if a
            # future entry contains regex metacharacters. ``re`` caches
            # compiled patterns, so this is not recompiled on every call.
            if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE)
        )
        for label, words in KEYWORDS.items()
    }
    # max() returns the first maximal key in iteration order, which gives the
    # "first label in KEYWORDS wins" tie-break described above.
    return max(scores, key=scores.get)


In [None]:

# Apply the keyword baseline to every held-out text; tqdm wraps the iterable
# to show progress.
baseline_preds = list(map(keyword_baseline_predict, tqdm(X_test)))

# Score the predictions against the held-out labels.
baseline_acc = accuracy_score(y_true=y_test, y_pred=baseline_preds)

print(f"Baseline Accuracy: {baseline_acc:.4f}")
