# Label Analysis — Legal Text Decoder

Címkedisztribúció és minta token/bigram top listák címkénként.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer

# Load the final train/test splits from disk, then count how many rows
# carry each label (counts are ordered by label value, not by frequency).
data_dir = Path("../data/final")
train, test = (
    pd.read_csv(data_dir / csv_name) for csv_name in ("train.csv", "test.csv")
)

train_counts, test_counts = (
    split["label"].value_counts().sort_index() for split in (train, test)
)

# Show both count tables in the notebook output.
display(train_counts)
display(test_counts)

In [1]:
# Side-by-side bar charts of the label counts: train on the left,
# test on the right, with identical axis labels on both panels.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
panels = (
    (train_counts, "steelblue", "Train labels"),
    (test_counts, "darkorange", "Test labels"),
)
for a, (label_counts, bar_color, heading) in zip(ax, panels):
    label_counts.plot(kind="bar", ax=a, color=bar_color, title=heading)
    a.set_xlabel("Label")
    a.set_ylabel("Count")
plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Label proportions, train vs. test.
# Each value is the label's count divided by the number of rows in that
# split, rounded to 3 decimals (a fraction, despite the "_pct" column names).
train_share = (train_counts / len(train)).round(3)
test_share = (test_counts / len(test)).round(3)
ratio = pd.DataFrame({"train_pct": train_share, "test_pct": test_share})
ratio

In [None]:
# Text length per label (train split).
# Adds a "text_len" column to `train` holding the character length of
# each text, then summarises that column separately for every label.
train["text_len"] = train["text"].str.len()
lengths_grouped = train.groupby("label")["text_len"]
len_by_label = lengths_grouped.describe()
len_by_label

In [None]:
# One short example per label: the first train text carrying that label,
# truncated to its first 250 characters. Labels are visited in sorted order.
samples = [
    {
        "label": lbl,
        "sample": train.loc[train["label"] == lbl, "text"].iloc[0][:250],
    }
    for lbl in sorted(train["label"].unique())
]
samples

In [None]:
# Top tokens per label (at most 500 rows per label, quick tokenization)
def tokenize(text):
    """Return the lowercase word tokens of *text*, in order of appearance.

    The text is lowercased first; every run of word characters found by
    the regular expression is then emitted as one token.
    """
    lowered = text.lower()
    return [hit.group(0) for hit in re.finditer(r"\b\w+\b", lowered)]

# For every label (in sorted order), count tokens over the first 500
# train texts with that label and keep the 10 most frequent
# (token, count) pairs.
top_tokens_by_label = {}
for lbl in sorted(train["label"].unique()):
    label_texts = train.loc[train["label"] == lbl, "text"].head(500)
    token_freq = Counter(
        token for doc in label_texts for token in tokenize(doc)
    )
    top_tokens_by_label[lbl] = token_freq.most_common(10)
top_tokens_by_label

In [None]:
# Top bigrams per label (CountVectorizer, at most 10 features).
# For each label, a fresh vectorizer is fitted on the first 500 train
# texts of that label; the per-bigram totals over those texts are stored
# as a DataFrame sorted by descending count.
bigram_by_label = {}
for lbl in sorted(train["label"].unique()):
    label_mask = train["label"] == lbl
    label_texts = train.loc[label_mask, "text"].head(500)
    vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=10)
    doc_term = vectorizer.fit_transform(label_texts)
    # Column sums of the document-term matrix, flattened to 1-D.
    totals = doc_term.sum(axis=0).A1
    table = pd.DataFrame(
        {"bigram": vectorizer.get_feature_names_out(), "count": totals}
    )
    bigram_by_label[lbl] = table.sort_values("count", ascending=False)
bigram_by_label