In [1]:
import json
import numpy as np
import joblib

from sklearn.metrics import classification_report, accuracy_score, f1_score


In [2]:
tfidf_data = joblib.load("../data/processed/tfidf_data.joblib")

X_train_tfidf = tfidf_data["X_train"]
y_train = np.array(tfidf_data["y_train"])

X_valid_tfidf = tfidf_data["X_valid"]
y_valid = np.array(tfidf_data["y_valid"])

X_test_tfidf = tfidf_data["X_test"]
y_test = np.array(tfidf_data["y_test"])

X_train_tfidf.shape, X_valid_tfidf.shape, X_test_tfidf.shape

((102000, 20000), (18000, 20000), (7600, 20000))

## Model klasyczny - regresja logistyczna

Pierwszym modelem jest klasyczny algorytm regresji logistycznej
zastosowany do wektorowej reprezentacji tekstu TF-IDF.
Model ten pełni rolę baseline’u, do którego porównywane będą
bardziej zaawansowane podejścia oparte na sieciach neuronowych.


In [3]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(
    solver="saga",
    penalty="l2",
    max_iter=300,
    n_jobs=-1,
    verbose=1
)

logreg.fit(X_train_tfidf, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.


Epoch 1, change: 1
Epoch 2, change: 0.1812539
Epoch 3, change: 0.084554396
Epoch 4, change: 0.054966523
Epoch 5, change: 0.055527235
Epoch 6, change: 0.034622331
Epoch 7, change: 0.029089694
Epoch 8, change: 0.021870258
Epoch 9, change: 0.015244889
Epoch 10, change: 0.0087437035
Epoch 11, change: 0.018006587
Epoch 12, change: 0.0048211208
Epoch 13, change: 0.0072323736
Epoch 14, change: 0.0022022674
Epoch 15, change: 0.0012978492
Epoch 16, change: 0.00067422805
Epoch 17, change: 0.0016327541
Epoch 18, change: 0.0014647031
Epoch 19, change: 0.0005096835
Epoch 20, change: 0.00019382424
Epoch 21, change: 0.00027994963
Epoch 22, change: 0.00018145925
Epoch 23, change: 0.00010387255
convergence after 24 epochs took 2 seconds


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,300


## Ewaluacja modelu klasycznego

Model oceniany jest na zbiorach walidacyjnym oraz testowym
z wykorzystaniem miar Accuracy oraz F1-score (macro).

In [4]:
y_pred_val = logreg.predict(X_valid_tfidf)
print("VALID")
print(classification_report(y_valid, y_pred_val, digits=4))
print("Accuracy:", accuracy_score(y_valid, y_pred_val))
print("F1 macro:", f1_score(y_valid, y_pred_val, average="macro"))

y_pred_test = logreg.predict(X_test_tfidf)
print("\nTEST")
print(classification_report(y_test, y_pred_test, digits=4))
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("F1 macro:", f1_score(y_test, y_pred_test, average="macro"))

VALID
              precision    recall  f1-score   support

           0     0.9300    0.9038    0.9167      4500
           1     0.9496    0.9800    0.9646      4500
           2     0.8924    0.8904    0.8914      4500
           3     0.9010    0.8996    0.9003      4500

    accuracy                         0.9184     18000
   macro avg     0.9183    0.9184    0.9182     18000
weighted avg     0.9183    0.9184    0.9182     18000

Accuracy: 0.9184444444444444
F1 macro: 0.9182428092281933

TEST
              precision    recall  f1-score   support

           0     0.9311    0.9032    0.9169      1900
           1     0.9500    0.9795    0.9645      1900
           2     0.8859    0.8784    0.8821      1900
           3     0.8918    0.8984    0.8951      1900

    accuracy                         0.9149      7600
   macro avg     0.9147    0.9149    0.9147      7600
weighted avg     0.9147    0.9149    0.9147      7600

Accuracy: 0.9148684210526316
F1 macro: 0.9146671699448687


In [5]:
logreg_metrics = {
    "val_accuracy": float(accuracy_score(y_valid, y_pred_val)),
    "val_f1_macro": float(f1_score(y_valid, y_pred_val, average="macro")),
    "test_accuracy": float(accuracy_score(y_test, y_pred_test)),
    "test_f1_macro": float(f1_score(y_test, y_pred_test, average="macro")),
}

with open("../results/logreg_metrics.json", "w") as f:
    json.dump(logreg_metrics, f, indent=2)

joblib.dump(logreg, "../models/logreg_tfidf.joblib")

['../models/logreg_tfidf.joblib']

## Sieć neuronowa zbudowana od zera

W tej części zdefiniowana została prosta architektura sieci neuronowej
zbudowanej od podstaw w bibliotece PyTorch.
Model wykorzystuje warstwę embeddingów do reprezentacji słów,
a następnie warstwy w pełni połączone do klasyfikacji tekstu.


In [6]:
import torch
import joblib

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)
print("Torch:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

seq_data = joblib.load("../data/processed/sequence_data.joblib")

X_train_seq = torch.tensor(seq_data["X_train_seq"], dtype=torch.long)
y_train_t   = torch.tensor(seq_data["y_train"], dtype=torch.long)

X_valid_seq = torch.tensor(seq_data["X_valid_seq"], dtype=torch.long)
y_valid_t   = torch.tensor(seq_data["y_valid"], dtype=torch.long)

X_test_seq  = torch.tensor(seq_data["X_test_seq"], dtype=torch.long)
y_test_t    = torch.tensor(seq_data["y_test"], dtype=torch.long)

X_train_seq.shape, X_valid_seq.shape, X_test_seq.shape

Device: mps
Torch: 2.2.1
MPS available: True


(torch.Size([102000, 200]), torch.Size([18000, 200]), torch.Size([7600, 200]))

In [7]:
from torch.utils.data import TensorDataset, DataLoader

batch_size = 1024

train_loader = DataLoader(TensorDataset(X_train_seq, y_train_t), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(TensorDataset(X_valid_seq, y_valid_t), batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(TensorDataset(X_test_seq,  y_test_t),  batch_size=batch_size, shuffle=False)

len(train_loader), len(valid_loader), len(test_loader)

(100, 18, 8)

In [8]:
from torch import nn

vocab_size = int(X_train_seq.max().item()) + 1
num_classes = 4

print("vocab_size:", vocab_size)

class SimpleTextNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        emb = self.embedding(x)
        pooled = emb.mean(dim=1)
        h = torch.relu(self.fc1(pooled))
        h = self.dropout(h)
        logits = self.fc2(h)
        return logits

nn_model = SimpleTextNN(vocab_size=vocab_size, embed_dim=128, hidden_dim=128, num_classes=num_classes).to(device)

nn_model

vocab_size: 20000


SimpleTextNN(
  (embedding): Embedding(20000, 128, padding_idx=0)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=128, out_features=4, bias=True)
)

## Trening sieci neuronowej

Sieć neuronowa trenowana jest z użyciem funkcji straty
CrossEntropyLoss oraz optymalizatora Adam.
Zastosowano prosty mechanizm early stopping, aby ograniczyć
ryzyko przeuczenia modelu.

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=1e-3)

def run_epoch(model, loader, train: bool):
    model.train() if train else model.eval()

    total_loss = 0.0
    total_correct = 0
    total = 0

    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            logits = model(xb)
            loss = criterion(logits, yb)

            if train:
                loss.backward()
                optimizer.step()

        total_loss += loss.item() * xb.size(0)
        preds = logits.argmax(dim=1)
        total_correct += (preds == yb).sum().item()
        total += xb.size(0)

    return total_loss / total, total_correct / total

best_val_loss = float("inf")
best_state = None
patience = 2
patience_left = patience

for epoch in range(1, 11):
    train_loss, train_acc = run_epoch(nn_model, train_loader, train=True)
    val_loss, val_acc     = run_epoch(nn_model, valid_loader, train=False)

    print(f"Epoch {epoch}: "
          f"train_loss={train_loss:.4f}, train_acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

    if val_loss < best_val_loss - 1e-4:
        best_val_loss = val_loss
        best_state = {k: v.detach().cpu().clone() for k, v in nn_model.state_dict().items()}
        patience_left = patience
    else:
        patience_left -= 1
        if patience_left == 0:
            print("Early stopping.")
            break

if best_state is not None:
    nn_model.load_state_dict(best_state)

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1: train_loss=1.2506, train_acc=0.5365 | val_loss=0.9033, val_acc=0.7387
Epoch 2: train_loss=0.6164, train_acc=0.8017 | val_loss=0.4370, val_acc=0.8567
Epoch 3: train_loss=0.3912, train_acc=0.8714 | val_loss=0.3497, val_acc=0.8861
Epoch 4: train_loss=0.3226, train_acc=0.8962 | val_loss=0.3122, val_acc=0.8981
Epoch 5: train_loss=0.2856, train_acc=0.9078 | val_loss=0.2923, val_acc=0.9051
Epoch 6: train_loss=0.2589, train_acc=0.9161 | val_loss=0.2784, val_acc=0.9081
Epoch 7: train_loss=0.2388, train_acc=0.9225 | val_loss=0.2700, val_acc=0.9097
Epoch 8: train_loss=0.2224, train_acc=0.9275 | val_loss=0.2630, val_acc=0.9122
Epoch 9: train_loss=0.2088, train_acc=0.9322 | val_loss=0.2615, val_acc=0.9124
Epoch 10: train_loss=0.1975, train_acc=0.9363 | val_loss=0.2555, val_acc=0.9139


## Ewaluacja sieci neuronowej

Po zakończeniu treningu sieć neuronowa oceniana jest
na zbiorze walidacyjnym oraz testowym.

In [10]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
import json

def predict_all(model, loader):
    model.eval()
    ys, ps = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            logits = model(xb)
            preds = logits.argmax(dim=1).cpu().numpy()
            ys.append(yb.numpy())
            ps.append(preds)
    return np.concatenate(ys), np.concatenate(ps)

y_val_true, y_val_pred = predict_all(nn_model, valid_loader)
print("VALID")
print(classification_report(y_val_true, y_val_pred, digits=4))
val_acc = accuracy_score(y_val_true, y_val_pred)
val_f1 = f1_score(y_val_true, y_val_pred, average="macro")
print("Accuracy:", val_acc)
print("F1 macro:", val_f1)

y_test_true, y_test_pred = predict_all(nn_model, test_loader)
print("\nTEST")
print(classification_report(y_test_true, y_test_pred, digits=4))
test_acc = accuracy_score(y_test_true, y_test_pred)
test_f1 = f1_score(y_test_true, y_test_pred, average="macro")
print("Accuracy:", test_acc)
print("F1 macro:", test_f1)

nn_metrics = {
    "val_accuracy": float(val_acc),
    "val_f1_macro": float(val_f1),
    "test_accuracy": float(test_acc),
    "test_f1_macro": float(test_f1),
}

torch.save(nn_model.state_dict(), "../models/simple_text_nn.pt")
with open("../results/nn_metrics.json", "w") as f:
    json.dump(nn_metrics, f, indent=2)

nn_metrics

VALID
              precision    recall  f1-score   support

           0     0.9213    0.8969    0.9089      4500
           1     0.9561    0.9724    0.9642      4500
           2     0.8944    0.8787    0.8864      4500
           3     0.8840    0.9078    0.8957      4500

    accuracy                         0.9139     18000
   macro avg     0.9139    0.9139    0.9138     18000
weighted avg     0.9139    0.9139    0.9138     18000

Accuracy: 0.9139444444444444
F1 macro: 0.91382117444005

TEST
              precision    recall  f1-score   support

           0     0.9235    0.9021    0.9127      1900
           1     0.9560    0.9732    0.9645      1900
           2     0.8878    0.8742    0.8809      1900
           3     0.8819    0.9000    0.8909      1900

    accuracy                         0.9124      7600
   macro avg     0.9123    0.9124    0.9122      7600
weighted avg     0.9123    0.9124    0.9122      7600

Accuracy: 0.9123684210526316
F1 macro: 0.9122478495119385


{'val_accuracy': 0.9139444444444444,
 'val_f1_macro': 0.91382117444005,
 'test_accuracy': 0.9123684210526316,
 'test_f1_macro': 0.9122478495119385}

## Model transformerowy – DistilBERT

Ostatnim modelem jest transformer DistilBERT, który został
poddany fine-tuningowi na zbiorze AG News.
Model ten wykorzystuje mechanizm uwagi i kontekstową
reprezentację tekstu, co pozwala na uchwycenie
bardziej złożonych zależności semantycznych.

In [11]:
import re
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

dataset = load_dataset("sh0416/ag_news")

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

def prep(ex):
    ex["text_clean"] = clean_text(ex["title"] + " " + ex["description"])
    ex["labels"] = ex["label"] - 1
    return ex

dataset = dataset.map(prep)

df = dataset["train"].to_pandas()

train_df, valid_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["labels"],
    random_state=42
)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df, preserve_index=False)
test_ds  = dataset["test"]

len(train_ds), len(valid_ds), len(test_ds)

(102000, 18000, 7600)

In [12]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tok = AutoTokenizer.from_pretrained(model_name)

def tok_fn(batch):
    return tok(
        batch["text_clean"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_tok = train_ds.map(tok_fn, batched=True)
valid_tok = valid_ds.map(tok_fn, batched=True)
test_tok  = test_ds.map(tok_fn,  batched=True)

remove_cols = [c for c in ["title", "description", "text_clean", "label"] if c in train_tok.column_names]
train_tok = train_tok.remove_columns(remove_cols)
valid_tok = valid_tok.remove_columns(remove_cols)
test_tok  = test_tok.remove_columns(remove_cols)

train_tok.column_names

Map: 100%|██████████| 102000/102000 [00:03<00:00, 29131.66 examples/s]
Map: 100%|██████████| 18000/18000 [00:00<00:00, 27617.31 examples/s]


['labels', 'input_ids', 'attention_mask']

In [13]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

acc_metric = evaluate.load("accuracy")
f1_metric  = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": acc_metric.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

bert = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

args = TrainingArguments(
    output_dir="../models/distilbert_agnews",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    logging_steps=100
)

trainer = Trainer(
    model=bert,
    args=args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    tokenizer=tok,
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1667,0.179174,0.943944,0.943846
2,0.1224,0.177595,0.948722,0.948735


TrainOutput(global_step=12750, training_loss=0.18469649741228888, metrics={'train_runtime': 3219.8933, 'train_samples_per_second': 63.356, 'train_steps_per_second': 3.96, 'total_flos': 6756078292992000.0, 'train_loss': 0.18469649741228888, 'epoch': 2.0})

In [15]:
bert_eval = trainer.evaluate(test_tok)
bert_eval

{'eval_loss': 0.19547230005264282,
 'eval_accuracy': 0.9455263157894737,
 'eval_f1_macro': 0.9455794274047234,
 'eval_runtime': 33.5397,
 'eval_samples_per_second': 226.597,
 'eval_steps_per_second': 7.096,
 'epoch': 2.0}

In [16]:
import json

bert_eval = trainer.evaluate(test_tok)
print("DistilBERT - TEST")
print("Accuracy:", bert_eval["eval_accuracy"])
print("F1 macro:", bert_eval["eval_f1_macro"])

bert_metrics = {
    "val_accuracy": float(trainer.evaluate(valid_tok)["eval_accuracy"]),
    "val_f1_macro": float(trainer.evaluate(valid_tok)["eval_f1_macro"]),
    "test_accuracy": float(bert_eval["eval_accuracy"]),
    "test_f1_macro": float(bert_eval["eval_f1_macro"]),
}

with open("../results/distilbert_metrics.json", "w") as f:
    json.dump(bert_metrics, f, indent=2)

trainer.save_model("../models/distilbert_finetuned_agnews")
tok.save_pretrained("../models/distilbert_finetuned_agnews")

bert_metrics


=== DistilBERT – TEST ===
Accuracy: 0.9455263157894737
F1 macro: 0.9455794274047234


{'val_accuracy': 0.9487222222222222,
 'val_f1_macro': 0.9487353919043173,
 'test_accuracy': 0.9455263157894737,
 'test_f1_macro': 0.9455794274047234}