In [1]:
!pip install transformers datasets scikit-learn pandas --quiet
!pip install --upgrade transformers --quiet
!pip install sacremoses --quiet
!pip install transformers datasets sentencepiece


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import os
import torch

wczytanie danych

In [None]:
# === Wczytaj dane XML (np. MAMS) ===
def load_xml_dataset(path):
    tree = ET.parse(path)
    root = tree.getroot()
    parsed = []

    for sentence in root.findall("sentence"):
        text = sentence.find("text").text
        aspects = sentence.find("aspectCategories")
        if aspects is not None:
            for aspect in aspects.findall("aspectCategory"):
                parsed.append({
                    "text": text,
                    "aspect": aspect.attrib["category"],
                    "label": aspect.attrib["polarity"]
                })

    df = pd.DataFrame(parsed)
    return df[df["label"].isin(["positive", "negative", "neutral", "conflict"])]

# === Wczytaj dane CSV (SemEval 2014) ===
def load_csv_dataset(path):
    df = pd.read_csv(path)
    df = df[df["polarity"].isin(["positive", "negative", "neutral", "conflict"])]
    df = df.rename(columns={
        "Sentence": "text",
        "Aspect Term": "aspect",
        "polarity": "label"
    })
    return df

# === Połącz dane treningowe z XML i CSV ===
df_xml2 = load_xml_dataset("/content/data/train/val.xml")
df_xml = load_xml_dataset("/content/data/train/train.xml")
df_csv = load_csv_dataset("/content/data/train/Restaurants_Train_v2.csv")
df_combined = pd.concat([df_csv, df_xml, df_xml2], ignore_index=True).sample(frac=1).reset_index(drop=True)
print(f"Dane treningowe: {len(df_combined)} przykładów")

# === Wczytaj dane testowe z XML ===
df_test = load_xml_dataset("/content/data/test/test.xml")

print(f"Dane testowe: {len(df_test)} przykładów")


Dane treningowe: 11671 przykładów
Dane testowe: 901 przykładów


Przygotowanie danych do treningu


In [None]:
# Mapowanie etykiet tekstowych (polarity) na liczby
label2id = {"negative": 0, "neutral": 1, "positive": 2, "conflict": 3}
id2label = {v: k for k, v in label2id.items()}

# Przygotowanie danych treningowych: "text [SEP] aspect" + etykieta liczbowa
dataset = [
    {
        "text": f"{row['text']} [SEP] {row['aspect']}",
        "label": label2id[row["label"]]
    }
    for _, row in df_combined.iterrows()
]

# Tworzymy HuggingFace Dataset i dzielimy go
hf_dataset = Dataset.from_list(dataset).train_test_split(test_size=0.15)

# Przygotowanie danych testowych
df_test["input_text"] = df_test["text"] + " [SEP] " + df_test["aspect"]
df_test["label_id"] = df_test["label"].map(label2id)

hf_test_dataset = Dataset.from_pandas(
    df_test[["input_text", "label_id"]].rename(columns={"input_text": "text", "label_id": "label"})
)

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

# Tokenizacja (wspólna funkcja)
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

encoded_test_dataset = hf_test_dataset.map(tokenize)
encoded_dataset = hf_dataset.map(tokenize)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)eb5afc8b2b397fe5e04beabb9b1ef355255ade81:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/901 [00:00<?, ? examples/s]

Map:   0%|          | 0/9920 [00:00<?, ? examples/s]

Map:   0%|          | 0/1751 [00:00<?, ? examples/s]

Trenowanie modelu do rozpoznawania *sentymentu*

In [None]:



# Ustawienia treningowe
training_args = TrainingArguments(
    output_dir="./absa-roberta",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    report_to="none"  # zamiast os.environ["WANDB_DISABLED"]
)

# Trener
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
)

# Trening
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6877,0.635042
2,0.5314,0.536536
3,0.4204,0.58096
4,0.3657,0.627326


TrainOutput(global_step=2480, training_loss=0.4816324069615333, metrics={'train_runtime': 969.7409, 'train_samples_per_second': 40.918, 'train_steps_per_second': 2.557, 'total_flos': 2610108538552320.0, 'train_loss': 0.4816324069615333, 'epoch': 4.0})

In [None]:
import shutil
from IPython.display import FileLink

# 1. Zapisz model i tokenizer do folderu
model.save_pretrained("./absa-roberta")
tokenizer.save_pretrained("./absa-roberta")

# 2. Spakuj folder do ZIP
shutil.make_archive("absa-roberta", 'zip', "./absa-roberta")

# 3. Stwórz link do pobrania
display(FileLink("absa-roberta.zip"))

model_path = "./absa-roberta"

# Wczytaj tokenizer i model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

KeyboardInterrupt: 

In [None]:
import shutil
from IPython.display import FileLink
model_path = "./absa-roberta"

# Wczytaj tokenizer i model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

wczytanie danych rozpoznanie aspektu

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer
from datasets import Dataset

# === Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

# === BIO labels ===
label_list = ["O", "B-ASP", "I-ASP"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


# === Wczytaj dane CSV (SemEval 2014) ===
def load_csv_dataset(path):
    df = pd.read_csv(path)
    df = df[df["polarity"].isin(["positive", "negative", "neutral", "conflict"])]
    df = df.rename(columns={
        "Sentence": "text",
        "Aspect Term": "aspect",
        "polarity": "label"
    })
    return df
# === Load XML with aspect terms (from-to offsets) ===
def load_term_based_xml_dataset(path):
    tree = ET.parse(path)
    root = tree.getroot()
    parsed = []

    for sentence in root.findall(".//sentence"):
        text_el = sentence.find("text")
        if text_el is None:
            continue
        text = text_el.text
        opinions = sentence.find("Opinions")
        aspects = []
        if opinions is not None:
            for opinion in opinions.findall("Opinion"):
                term = opinion.attrib.get("target")
                try:
                    start = int(opinion.attrib["from"])
                    end = int(opinion.attrib["to"])
                    if term == "NULL" or start == -1:
                        continue
                    aspects.append({"term": term, "start": start, "end": end})
                except:
                    continue
        if aspects:
            parsed.append({"Sentence": text, "Aspects": aspects})

    return pd.DataFrame(parsed)

# === Load category-based XML with fake offsets (optional fallback) ===
def load_category_based_xml_dataset(path):
    tree = ET.parse(path)
    root = tree.getroot()
    parsed = []

    for sentence in root.findall(".//sentence"):
        text_el = sentence.find("text")
        if text_el is None:
            continue
        text = text_el.text
        aspect_cats = sentence.find("aspectCategories")
        aspects = []
        if aspect_cats is not None:
            for aspect in aspect_cats.findall("aspectCategory"):
                term = aspect.attrib.get("category")
                try:
                    start = text.lower().index(term.lower())
                    end = start + len(term)
                    aspects.append({"term": term, "start": start, "end": end})
                except:
                    continue
        if aspects:
            parsed.append({"Sentence": text, "Aspects": aspects})

    return pd.DataFrame(parsed)

# === Tokenize and align BIO labels ===
def tokenize_and_align_labels(batch):
    texts = batch["Sentence"]
    aspects_batch = batch["Aspects"]

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_offsets_mapping=True
    )

    all_labels = []
    for i in range(len(texts)):
        labels = ["O"] * len(tokenized["offset_mapping"][i])
        for asp in aspects_batch[i]:
            start_idx = asp["start"]
            end_idx = asp["end"]
            for j, (start, end) in enumerate(tokenized["offset_mapping"][i]):
                if start == end:
                    continue
                if start >= start_idx and end <= end_idx:
                    labels[j] = "B-ASP" if start == start_idx else "I-ASP"
        label_ids = [label2id[l] for l in labels]
        all_labels.append(label_ids)

    tokenized.pop("offset_mapping")
    tokenized["labels"] = all_labels
    return tokenized

# === Load and merge datasets ===
df1 = load_term_based_xml_dataset("/content/data/train/ABSA16_Restaurants_Train_v2.xml")
df2 = load_term_based_xml_dataset("/content/data/train/Restaurants_Train.xml")
df3 = load_term_based_xml_dataset("/content/data/train/Restaurants_Test_Data_phaseB.xml")
df4 = load_term_based_xml_dataset("/content/data/train/train.xml")
df5 = load_term_based_xml_dataset("/content/data/train/val.xml")

df_combined = pd.concat([df1, df2, df3], ignore_index=True).sample(frac=1).reset_index(drop=True)

# === Clean and convert ===
df_combined = df_combined.dropna(subset=["Sentence", "Aspects"])
df_combined = df_combined[df_combined["Aspects"].map(lambda x: len(x) > 0)]

hf_dataset = Dataset.from_pandas(df_combined)
encoded_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)
encoded_dataset = encoded_dataset.train_test_split(test_size=0.15)

print(f"Ztokenizowano {len(encoded_dataset['train']) + len(encoded_dataset['test'])} przykładów.")


Map:   0%|          | 0/1234 [00:00<?, ? examples/s]

Ztokenizowano 1234 przykładów.


Generowanie parafraz z zachowwaniem aspektów

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
paraphrase_model = T5ForConditionalGeneration.from_pretrained("ramsrigouthamg/t5_paraphraser")
paraphrase_tokenizer = T5Tokenizer.from_pretrained("ramsrigouthamg/t5_paraphraser")

def generate_paraphrase(sentence, max_length=128):
    text = f"paraphrase: {sentence} </s>"
    encoding = paraphrase_tokenizer.encode_plus(text, padding="max_length", return_tensors="pt", max_length=max_length, truncation=True)
    input_ids, attention_masks = encoding["input_ids"], encoding["attention_mask"]

    outputs = paraphrase_model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=max_length,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    paraphrased = paraphrase_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrased

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
import copy

def augment_with_paraphrases(df, n_samples=100):
    augmented_rows = []

    for i, row in df.sample(n=n_samples, random_state=42).iterrows():
        original_sentence = row["Sentence"]
        aspects = row["Aspects"]
        try:
            paraphrased = generate_paraphrase(original_sentence)
        except Exception as e:
            print(f"Błąd w zdaniu {i}: {e}")
            continue

        # Szukamy nowych offsetów aspektów (po nazwie)
        new_aspects = []
        for asp in aspects:
            term = asp["term"]
            try:
                start = paraphrased.lower().index(term.lower())
                end = start + len(term)
                new_aspects.append({"term": term, "start": start, "end": end})
            except ValueError:
                continue  # pomijamy, jeśli nie ma aspektu w parafrazie

        if new_aspects:
            augmented_rows.append({"Sentence": paraphrased, "Aspects": new_aspects})

    return pd.DataFrame(augmented_rows)


In [None]:
df_augmented = augment_with_paraphrases(df_combined, n_samples=500)  # np. 500 parafraz
df_all = pd.concat([df_combined, df_augmented]).reset_index(drop=True)

# Dalej jak wcześniej:
hf_dataset = Dataset.from_pandas(df_all)
encoded_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)
encoded_dataset = encoded_dataset.train_test_split(test_size=0.15)


przygotowanie danych

In [None]:
from datasets import Dataset

hf_dataset = Dataset.from_pandas(df_combined)
encoded_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)
encoded_dataset = encoded_dataset.train_test_split(test_size=0.15)
# Usuń niepotrzebne kolumny tekstowe (mogą przeszkadzać Trainerowi)
encoded_dataset = encoded_dataset.remove_columns(
    [col for col in encoded_dataset["train"].column_names if col not in ["input_ids", "attention_mask", "labels"]]
)
# Szybki podgląd etykiet i tokenów w danych treningowych
from transformers import AutoTokenizer

tokenizer_check = AutoTokenizer.from_pretrained("bert-base-cased")

for i in range(3):
    tokens = tokenizer_check.convert_ids_to_tokens(encoded_dataset["train"][i]["input_ids"])
    labels = encoded_dataset["train"][i]["labels"]
    decoded_labels = [id2label[l] for l in labels]
    print("==== PRZYKŁAD ====")
    for token, label in zip(tokens, decoded_labels):
        print(f"{token:15} -> {label}")
    print()




Map:   0%|          | 0/1234 [00:00<?, ? examples/s]

==== PRZYKŁAD ====
[CLS]           -> O
Our             -> O
family          -> O
never           -> O
expected        -> O
such            -> O
incredible      -> O
entertainment   -> O
in              -> O
a               -> O
restaurant      -> B-ASP
.               -> O
[SEP]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]           -> O
[PAD]      

trening aspekt

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./bert-aspect-ner",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    eval_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0236,0.018878
2,0.0122,0.014942
3,0.0049,0.017793
4,0.0024,0.019344


TrainOutput(global_step=264, training_loss=0.02182378702458333, metrics={'train_runtime': 119.7016, 'train_samples_per_second': 35.02, 'train_steps_per_second': 2.205, 'total_flos': 273841476820992.0, 'train_loss': 0.02182378702458333, 'epoch': 4.0})

zapisanie modelu

In [None]:
import shutil
from IPython.display import FileLink
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# 1. Zapisz model i tokenizer do folderu
trainer.save_model("./bert-aspect-ner")
tokenizer.save_pretrained("./bert-aspect-ner")

# Ścieżka do wytrenowanego modelu
model_path = "./bert-aspect-ner"

tokenizer_en = AutoTokenizer.from_pretrained(model_path)
model_en = AutoModelForTokenClassification.from_pretrained(model_path)
model_en.eval().to("cuda" if torch.cuda.is_available() else "cpu")

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

testowanie wykrywania aspektu

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Wczytaj tokenizer i model NER
model_path = "./bert-aspect-ner"  # Ścieżka do wytrenowanego modelu
tokenizer_en = AutoTokenizer.from_pretrained(model_path)
model_en = AutoModelForTokenClassification.from_pretrained(model_path)
model_en.eval().to("cuda" if torch.cuda.is_available() else "cpu")

# Funkcja do wykrywania aspektów w angielskim tekście
def extract_aspects(text):
    inputs = tokenizer_en(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model_en.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model_en(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    tokens = tokenizer_en.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [model_en.config.id2label[pred] for pred in predictions]

    aspects = []
    current_aspect = []
    for token, label in zip(tokens, labels):
        if label == "B-ASP":
            if current_aspect:
                aspects.append(" ".join(current_aspect))
                current_aspect = []
            current_aspect.append(token)
        elif label == "I-ASP" and current_aspect:
            current_aspect.append(token)
        else:
            if current_aspect:
                aspects.append(" ".join(current_aspect))
                current_aspect = []
    if current_aspect:
        aspects.append(" ".join(current_aspect))

    # Oczyść tokeny (usuń ##)
    aspects = [aspect.replace("##", "") for aspect in aspects]
    return aspects


In [None]:
def extract_aspects_en(text):
    tokens = tokenizer_en(text, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = tokens["input_ids"].to(model_en.device)
    attention_mask = tokens["attention_mask"].to(model_en.device)
    offset_mapping = tokens["offset_mapping"][0]

    with torch.no_grad():
        outputs = model_en(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

    aspects = []
    current = ""
    for i, (label_id, (start, end)) in enumerate(zip(predictions, offset_mapping)):
        if start == end:
            continue
        word = text[start:end]
        label = model_en.config.id2label[label_id]

        if label == "B-ASP":
            if current:
                aspects.append(current.strip())
            current = word
        elif label == "I-ASP" and current:
            current += " " + word
        else:
            if current:
                aspects.append(current.strip())
                current = ""

    if current:
        aspects.append(current.strip())

    return list(set(aspects))


In [None]:
def analyze_comment(text_pl):
    print(f"POLSKI: {text_pl}")

    # 1. Tłumaczenie komentarza
    text_en = translate_pl_to_en([text_pl])[0]
    print(f"EN: {text_en}")

    # 2. Wykrywanie aspektów
    aspects_en = extract_aspects_en(text_en)
    print(f"Aspekty (EN): {aspects_en}")

    # 3. Tłumaczenie aspektów z powrotem
    aspects_pl = translate_en_to_pl(aspects_en)
    print(f"Aspekty (PL): {aspects_pl}")

    return aspects_pl


In [None]:
# Przykładowe polskie komentarze
examples = [
    "Obsługa była tragiczna, ale pizza naprawdę świetna.",
    "Czas oczekiwania był bardzo długi, choć jedzenie pyszne.",
    "Nieprzyjemny zapach w lokalu i kelner był nieuprzejmy.",
    "The food was great but the service was terrible."
]

# Tłumaczenie komentarzy z polskiego na angielski
translated_comments = translate_pl_to_en(examples)
("The food was great but the service was terrible.")

# Wykrywanie aspektów w angielskich komentarzach
for pl_comment, en_comment in zip(examples, translated_comments):
    aspects_en = extract_aspects(en_comment)
    print(f"POLSKI: {pl_comment}")
    print(f"EN: {en_comment}")
    print(f"Aspekty (EN): {aspects_en}")
    print()


POLSKI: Obsługa była tragiczna, ale pizza naprawdę świetna.
EN: The service was tragic, but the pizza was really great.
Aspekty (EN): ['service', 'pizza']

POLSKI: Czas oczekiwania był bardzo długi, choć jedzenie pyszne.
EN: The waiting time was very long, although the food was delicious.
Aspekty (EN): ['waiting time', 'food']

POLSKI: Nieprzyjemny zapach w lokalu i kelner był nieuprzejmy.
EN: The unpleasant smell at the place and the waiter was rude.
Aspekty (EN): ['smell', 'waiter']

POLSKI: The food was great but the service was terrible.
EN: The food was great but the service was terrible.
Aspekty (EN): ['food', 'service']



testowanie wykrywania sentymentu

In [None]:
# Twoje przykłady z etykietami (aspekt + sentyment znany)
examples = [
    ("Serwowali jedzenie w Ślesinie na turnieju szachowym w dowód food tracku i czas oczekiwania to jakiś żart", "czas oczekiwania", "negative"),
]

# Połączone dane (wszystkie jako (text, aspect) — bez etykiety w tym momencie)
input_data = [(text, aspect) for text, aspect, *_ in examples]


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Model tłumaczenia PL -> EN
model_name = "Helsinki-NLP/opus-mt-pl-en"
tokenizer_pl_en = MarianTokenizer.from_pretrained(model_name)
model_pl_en = MarianMTModel.from_pretrained(model_name)

def translate_pl_to_en(sentences):
    inputs = tokenizer_pl_en(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = model_pl_en.generate(**inputs, max_length=128)
    return [tokenizer_pl_en.decode(t, skip_special_tokens=True) for t in translated]

# Oddziel teksty do tłumaczenia
texts_pl = [x[0] for x in input_data]
aspects_pl = [x[1] for x in input_data]

# Tłumaczenie
translated_texts_en = translate_pl_to_en(texts_pl)
translated_aspects_en = translate_pl_to_en(aspects_pl)


In [None]:
def predict(text, aspect):
    input_text = f"{text} [SEP] {aspect}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

    # Wykryj, gdzie jest model (CPU lub GPU)
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predykcja
    outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=-1).item()

    id2label = {0: 'negatywny', 1: 'neutralny', 2: 'pozytywny', 3: 'sprzeczny'}
    return id2label[pred]


In [None]:
for pl_text, pl_aspect, en_text, en_aspect in zip(texts_pl, aspects_pl, translated_texts_en, translated_aspects_en):
    wynik = predict(en_text, en_aspect)
    print(f"KOMENTARZ: {pl_text}\n ASPEKT: {pl_aspect}\n OCENA: {wynik}\n")


KOMENTARZ: Serwowali jedzenie w Ślesinie na turnieju szachowym w dowód food tracku i czas oczekiwania to jakiś żart
 ASPEKT: czas oczekiwania
 OCENA: negatywny

