# Przetwarzanie wstępne

Importy i wczytywanie danych

In [18]:
from datasets import load_dataset
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import AutoTokenizer

# Load the AG News dataset by its hub identifier. The cell output further down
# shows a DatasetDict with "train" and "test" splits whose raw columns are
# 'label', 'title' and 'description'.
dataset = load_dataset("sh0416/ag_news")

# Human-readable class names keyed by the dataset's original 1-based label ids.
label2name = dict(
    enumerate(("World", "Sports", "Business", "Sci/Tech"), start=1)
)

Czyszczenie i scalanie tytułu z opisem

In [19]:
def clean_text(text: str) -> str:
    """Normalize a piece of text for the classical (non-transformer) pipelines.

    The text is lower-cased, every run of whitespace (spaces, tabs, newlines)
    is collapsed into a single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    return re.sub(r"\s+", " ", text.lower()).strip()

def prepare_text(example):
    """Add merged-text and zero-based-label fields to a single example.

    The example is modified in place and also returned, which is the shape
    expected by the ``dataset.map(prepare_text)`` call that uses it.

    Args:
        example: Mapping with "title", "description" and "label" entries.

    Returns:
        The same mapping with three extra entries:
        "text" (title and description joined by a single space),
        "text_clean" (``clean_text`` applied to "text") and
        "label_zero_based" ("label" minus one).
    """
    merged = " ".join((example["title"], example["description"]))
    example["text"] = merged
    example["text_clean"] = clean_text(merged)
    # Shift labels down by one so they start at 0 instead of 1.
    example["label_zero_based"] = example["label"] - 1
    return example

# Apply prepare_text to every example, adding the 'text', 'text_clean' and
# 'label_zero_based' columns. The bare expression on the last line displays
# the resulting DatasetDict as the cell output.
dataset = dataset.map(prepare_text)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'description', 'text', 'text_clean', 'label_zero_based'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['label', 'title', 'description', 'text', 'text_clean', 'label_zero_based'],
        num_rows: 7600
    })
})

In [20]:
from datasets import Dataset

# Carve a validation set out of the training split: 15% of the rows,
# stratified on the zero-based label, with a fixed seed for reproducibility.
df_train = dataset["train"].to_pandas()
train_df, valid_df = train_test_split(
    df_train,
    random_state=42,
    stratify=df_train["label_zero_based"],
    test_size=0.15,
)

# Convert both pandas frames back to Dataset objects; preserve_index=False
# keeps the shuffled pandas index from being stored as an extra column.
train_ds, valid_ds = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, valid_df)
)
test_ds = dataset["test"]

# Displayed as the cell output: (train size, validation size, test size).
tuple(len(split) for split in (train_ds, valid_ds, test_ds))


(102000, 18000, 7600)

Przygotowanie list i etykiet

In [21]:
def _texts_and_labels(split):
    """Return (cleaned texts, zero-based labels) of a split as two plain lists.

    Whole columns are read at once instead of iterating the Dataset example by
    example: row iteration builds a dict with every column for each row, and
    the original code did that twice per split just to read a single field.
    ``list(...)`` guarantees a plain Python list regardless of what container
    type column access returns.

    Args:
        split: A Dataset with "text_clean" and "label_zero_based" columns.

    Returns:
        A ``(texts, labels)`` tuple of lists, in the split's row order.
    """
    return list(split["text_clean"]), list(split["label_zero_based"])


X_train_text, y_train = _texts_and_labels(train_ds)
X_valid_text, y_valid = _texts_and_labels(valid_ds)
X_test_text, y_test = _texts_and_labels(test_ds)

Reprezentacja 1: TF-IDF

In [22]:
# Unigram + bigram TF-IDF features, capped at 20000 vocabulary entries.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)

# The vectorizer is fitted on the training texts only; validation and test
# texts are transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_valid_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_valid_text, X_test_text)
)

# Displayed as the cell output: one shape tuple per split.
tuple(matrix.shape for matrix in (X_train_tfidf, X_valid_tfidf, X_test_tfidf))


((102000, 20000), (18000, 20000), (7600, 20000))

In [23]:
import os

# Make sure both output directories exist before anything is written.
for out_dir in ("../data/processed", "../models"):
    os.makedirs(out_dir, exist_ok=True)

# The fitted vectorizer is stored separately from the transformed matrices.
joblib.dump(tfidf, "../models/tfidf_vectorizer.joblib")

# Feature matrices and labels of all three splits go into a single file.
tfidf_bundle = {
    "X_train": X_train_tfidf,
    "y_train": y_train,
    "X_valid": X_valid_tfidf,
    "y_valid": y_valid,
    "X_test":  X_test_tfidf,
    "y_test":  y_test,
}
joblib.dump(tfidf_bundle, "../data/processed/tfidf_data.joblib")


['../data/processed/tfidf_data.joblib']

Reprezentacja 2: Sekwencje + Embedding

In [24]:
max_words = 20000
max_len   = 200

# The word index is built from the training texts only; "<OOV>" is the
# placeholder token configured for words the tokenizer does not keep.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)


def texts_to_padded_sequences(texts):
    """Encode texts as integer sequences of length ``max_len``.

    Uses the fitted ``tokenizer`` and a single shared padding configuration
    (pad and truncate at the end of the sequence), so the train, validation
    and test splits cannot drift apart as they could with three copy-pasted
    ``pad_sequences`` calls.

    Args:
        texts: List of cleaned text strings.

    Returns:
        The padded 2-D array produced by ``pad_sequences``, with ``max_len``
        columns.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding="post",
        truncating="post",
    )


X_train_seq = texts_to_padded_sequences(X_train_text)
X_valid_seq = texts_to_padded_sequences(X_valid_text)
X_test_seq = texts_to_padded_sequences(X_test_text)

X_train_seq.shape, X_valid_seq.shape, X_test_seq.shape


((102000, 200), (18000, 200), (7600, 200))

In [25]:
# The fitted Keras tokenizer is stored separately from the encoded data.
joblib.dump(tokenizer, "../models/tokenizer_keras.joblib")

# Padded sequences and labels of all three splits go into a single file.
sequence_bundle = {
    "X_train_seq": X_train_seq,
    "y_train": y_train,
    "X_valid_seq": X_valid_seq,
    "y_valid": y_valid,
    "X_test_seq":  X_test_seq,
    "y_test":  y_test,
}
joblib.dump(sequence_bundle, "../data/processed/sequence_data.joblib")

['../data/processed/sequence_data.joblib']

Reprezentacja 3: Tokeny transformera

In [26]:
model_name = "distilbert-base-uncased"
hf_tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(batch):
    """Tokenize the "text_clean" entries of a batch with ``hf_tokenizer``.

    Sequences are truncated and padded using ``max_length=128`` with the
    "max_length" padding strategy.

    Args:
        batch: Mapping whose "text_clean" entry holds the texts to encode.

    Returns:
        Whatever ``hf_tokenizer`` returns for those texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return hf_tokenizer(batch["text_clean"], **encoding_options)

def tokenize_split(split):
    """Tokenize one split and reduce it to the tokenizer outputs plus "labels".

    The same four steps were previously copy-pasted for the train, validation
    and test splits; keeping them in one place guarantees that all splits end
    up with the same schema and output format.

    Args:
        split: Dataset with the columns "title", "description", "text",
            "text_clean", "label" and "label_zero_based".

    Returns:
        A new Dataset with the raw text columns and the 1-based "label"
        removed, "label_zero_based" renamed to "labels", and the output format
        set to "torch".
    """
    tokenized = split.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(
        ["title", "description", "text", "text_clean", "label"]
    )
    # NOTE(review): presumably renamed because the downstream model/trainer
    # expects the target column to be called "labels" - confirm against the
    # training code.
    tokenized = tokenized.rename_column("label_zero_based", "labels")
    tokenized.set_format("torch")
    return tokenized


tokenized_train = tokenize_split(train_ds)
tokenized_valid = tokenize_split(valid_ds)
tokenized_test = tokenize_split(test_ds)


Map: 100%|██████████| 102000/102000 [00:03<00:00, 25734.78 examples/s]
Map: 100%|██████████| 18000/18000 [00:00<00:00, 28571.61 examples/s]


In [27]:
# Write the tokenizer files to ../models/distilbert_tokenizer; the call's
# return value (the saved file paths) is displayed as the cell output.
hf_tokenizer.save_pretrained("../models/distilbert_tokenizer")


('../models/distilbert_tokenizer/tokenizer_config.json',
 '../models/distilbert_tokenizer/special_tokens_map.json',
 '../models/distilbert_tokenizer/vocab.txt',
 '../models/distilbert_tokenizer/added_tokens.json',
 '../models/distilbert_tokenizer/tokenizer.json')