In [None]:
!pip install -U transformers datasets accelerate


In [None]:
!pip install -q transformers torch sentencepiece

In [None]:
from transformers import pipeline

In [None]:
# Sample input for the summarizer: the opening of Jane Austen's "Pride and Prejudice".
# The curly quotes are real Unicode characters (U+201C / U+201D); they had previously been
# stored as mis-decoded byte sequences, which fed garbage tokens to the model.
long_text = """
It is a truth universally acknowledged, that a single man in possession of a good fortune,
must be in want of a wife. However little known the feelings or views of such a man may be
on his first entering a neighbourhood, this truth is so well fixed in the minds of the
surrounding families, that he is considered the rightful property of some one or other of
their daughters.

“My dear Mr. Bennet,” said his lady to him one day, “have you heard that Netherfield Park
is let at last?” Mr. Bennet replied that he had not. “But it is,” returned she; “for Mrs.
Long has just been here, and she told me all about it.” Mr. Bennet made no answer.

“Do you not want to know who has taken it?” cried his wife impatiently.
“You want to tell me, and I have no objection to hearing it.” This was invitation enough.

Why, my dear, you must know, Mrs. Long says that Netherfield is taken by a young man of large
fortune from the north of England; that he came down on Monday in a chaise and four to see
the place, and was so much delighted with it, that he agreed with Mr. Morris immediately;
that he is to take possession before Michaelmas, and some of his servants are to be in the
house by the end of next week.
"""



# Build a summarization pipeline around the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize the passage with sampling disabled; the summary length is bounded
# to the range [40, 130] via min_length / max_length.
summary = summarizer(long_text, max_length=130, min_length=40, do_sample=False)

# The pipeline returns a list with one dict per input; show the text of the first one.
print("SUMMARY:\n")
print(summary[0]["summary_text"])

In [None]:
from datasets import load_dataset
# Fetch the restaurant ABSA dataset from the Hugging Face Hub, then display the
# returned object (the bare expression on the last line is the cell output).
dataset = load_dataset("tomaarsen/setfit-absa-semeval-restaurants")
dataset

In [None]:
# Look at the first training record to see which fields each row carries.
sample = dataset["train"][0]
sample

In [None]:
# Print the first five training rows, each followed by a 30-dash separator line.
for i in range(5):
    print(dataset["train"][i], "-" * 30, sep="\n")

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import torch
from collections import defaultdict

dataset = load_dataset("tomaarsen/setfit-absa-semeval-restaurants")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# BIO tag set for aspect terms: O = outside, B-ASP = first token of an aspect,
# I-ASP = any following token of the same aspect.
label2id = {"O": 0, "B-ASP": 1, "I-ASP": 2}

# Inverse lookup (id -> tag name), derived from label2id so the two cannot drift apart.
id2label = dict((tag_id, tag) for tag, tag_id in label2id.items())

# ========== FIX: Aggregate all aspects for each unique text ==========
def aggregate_aspects(dataset_split):
    """Collapse per-aspect rows into one record per distinct sentence.

    Rows that share the same "text" are merged so that a sentence is later
    labelled once with all of its aspects, instead of several times with
    conflicting single-aspect labels.

    Args:
        dataset_split: Iterable of mapping-like rows, each providing a "text"
            and a "span" entry.

    Returns:
        A ``Dataset`` built from records of the form
        ``{"text": str, "aspects": list[str]}``. Texts appear in first-seen
        order; each aspect list is de-duplicated and also in first-seen order.
    """
    text_to_aspects = defaultdict(list)
    for example in dataset_split:
        text_to_aspects[example["text"]].append(example["span"])

    # dict.fromkeys() removes duplicates while keeping insertion order.
    # list(set(...)) would also de-duplicate, but its order changes from run to
    # run (string hash randomization), which makes the generated labels
    # irreproducible whenever aspects of one sentence overlap.
    aggregated = [
        {"text": text, "aspects": list(dict.fromkeys(aspects))}
        for text, aspects in text_to_aspects.items()
    ]
    return Dataset.from_list(aggregated)

# Build one-row-per-sentence versions of both splits (train first, then test).
train_dataset, test_dataset = (
    aggregate_aspects(dataset[split_name]) for split_name in ("train", "test")
)

def tokenize_and_align_labels(example):
    """Tokenize one sentence and produce a BIO label id for every token.

    Args:
        example: Mapping with "text" (the sentence) and "aspects" (list of
            aspect strings expected to occur in the sentence).

    Returns:
        The tokenizer output with an added "labels" list (same length as the
        token sequence) and with "offset_mapping" removed. Tokens with a
        zero-width character span get -100, the value PyTorch's cross-entropy
        loss ignores by default; all other tokens get O, B-ASP or I-ASP ids
        taken from the module-level ``label2id``.

    Notes:
        * Only the first occurrence of each aspect string (``str.find``) is
          labelled; aspects that are not found are skipped.
        * Aspects are applied shortest-first, so when one aspect is nested in
          another ("rolls" inside "sushi rolls") the longer span is written
          last and wins. The result therefore does not depend on the order of
          ``example["aspects"]``.
    """
    text = example["text"]
    aspects = example["aspects"]

    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_offsets_mapping=True
    )

    offsets = tokenized["offset_mapping"]

    # Zero-width spans (start == end) carry no text, so they are masked out
    # with -100; every real token starts as "O".
    labels = [
        -100 if start == end else label2id["O"]
        for start, end in offsets
    ]

    # Stable sort by length: shorter aspects first, longer (enclosing) ones last.
    for aspect in sorted(aspects, key=len):
        aspect_start = text.find(aspect)
        if aspect_start == -1:
            continue
        aspect_end = aspect_start + len(aspect)

        is_first_token = True
        for i, (start, end) in enumerate(offsets):
            if start == end:  # masked token, keep -100
                continue
            # Token lies entirely inside the aspect's character span
            if start >= aspect_start and end <= aspect_end:
                if is_first_token:
                    labels[i] = label2id["B-ASP"]
                    is_first_token = False
                else:
                    labels[i] = label2id["I-ASP"]

    tokenized["labels"] = labels
    tokenized.pop("offset_mapping")  # only needed for alignment, not by the model
    return tokenized

# Tokenize every aggregated sentence and attach its token-level BIO labels.
train_dataset = train_dataset.map(tokenize_and_align_labels)
test_dataset = test_dataset.map(tokenize_and_align_labels)

# Return only the three model inputs, as torch tensors, when the datasets are indexed.
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)


from transformers import BertForTokenClassification

# Load bert-base-uncased with a 3-way token-classification head (O / B-ASP / I-ASP).
# Passing both label maps stores the tag names in the saved model config.
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,          # output labels per token
    id2label=id2label,
    label2id=label2id
)


from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./aspect_model",          # where checkpoints & final model go
    eval_strategy="epoch",           # evaluate after each epoch
    learning_rate=2e-5,                    # standard BERT fine-tuning LR
    per_device_train_batch_size=16,        # training batch size
    per_device_eval_batch_size=16,         # evaluation batch size
    num_train_epochs=3,                    # how many full passes over training data
    weight_decay=0.01,                     # regularization
    logging_steps=50,                      # log training loss every 50 steps
    save_strategy="epoch",                 # save model after each epoch
    load_best_model_at_end=True            # keep best checkpoint (based on eval loss)
)


from transformers import Trainer

# NOTE(review): eval_dataset is the test split and load_best_model_at_end=True, so the
# test split is used for checkpoint selection; its scores are not a clean held-out estimate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in Trainer and removed in transformers v5 (which
    # `pip install -U transformers` will pull in); `processing_class=` is its replacement.
    processing_class=tokenizer
)

trainer.train()

# Persist the final model and its tokenizer side by side so the directory can be
# reloaded later with from_pretrained("./aspect_model").
trainer.save_model("./aspect_model")
tokenizer.save_pretrained("./aspect_model")

In [None]:
# Print the installed transformers version (useful when keyword arguments such as
# eval_strategy are rejected by an older or newer release).
import transformers
print(transformers.__version__)


In [None]:
# Debug aid: print the TrainingArguments class object to confirm which one is in scope.
print(TrainingArguments)


In [None]:
from transformers import BertForTokenClassification, AutoTokenizer
import torch

# Load the model and tokenizer from the folder written by trainer.save_model /
# tokenizer.save_pretrained ("./aspect_model").
model = BertForTokenClassification.from_pretrained("./aspect_model")
tokenizer = AutoTokenizer.from_pretrained("./aspect_model")

# Put the model in eval mode (inference behaviour for layers such as dropout).
# eval() does not itself disable gradients; that is done with torch.no_grad() at call time.
# As the last expression of the cell, this also displays the model's repr.
model.eval()


In [None]:
def extract_aspects(text):
    """Extract aspect terms from ``text`` with the module-level ``model`` / ``tokenizer``.

    The sentence is tokenized (truncated to 128 tokens), every token is tagged
    with its highest-scoring label, and the B-ASP / I-ASP runs are stitched
    back into strings.

    Args:
        text: Sentence to analyse.

    Returns:
        List of aspect strings in order of appearance. The strings are rebuilt
        from tokenizer tokens, so they follow the tokenizer's normalisation
        rather than the exact surface form of ``text``.

    Notes:
        A word-piece continuation token ("##...") is always appended to the
        aspect that is currently open, whatever label it was given. A word can
        therefore no longer be cut in half ("su" from "su ##shi") or split into
        two aspects when the model tags its pieces inconsistently.
    """
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # shape: [batch_size, seq_len, num_labels]

    # Highest-scoring label id per token
    predictions = torch.argmax(logits, dim=2)  # shape: [batch_size, seq_len]

    # Convert ids to token strings / label names (single sentence -> batch index 0)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [model.config.id2label[pred.item()] for pred in predictions[0]]

    aspects = []
    current_aspect = ""
    for token, label in zip(tokens, labels):
        # Skip special tokens
        if token in ("[CLS]", "[SEP]", "[PAD]"):
            continue

        # A "##" piece belongs to the word before it: glue it onto the open
        # aspect regardless of its own predicted label.
        if token.startswith("##") and current_aspect:
            current_aspect += token[2:]
            continue

        if label == "B-ASP":
            # Close the previous aspect (if any) and start a new one
            if current_aspect:
                aspects.append(current_aspect.strip())
            current_aspect = token.replace("##", "")
        elif label == "I-ASP":
            # A "##" piece only reaches this branch when no aspect is open
            if token.startswith("##"):
                current_aspect += token.replace("##", "")
            else:
                current_aspect += " " + token
        else:  # O label on a whole-word token ends the open aspect
            if current_aspect:
                aspects.append(current_aspect.strip())
                current_aspect = ""

    # Flush an aspect that runs up to the end of the sequence
    if current_aspect:
        aspects.append(current_aspect.strip())

    return aspects

In [None]:
# Smoke test: run the aspect extractor on a sentence with two aspect candidates.
text = "The sushi was fresh and the drinks were cold."
extracted_aspects = extract_aspects(text)
print("Extracted Aspects:", extracted_aspects)


In [None]:
#sentiment model

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
import torch

# Load the dataset again so this cell does not depend on earlier cells; rows are
# read below through their 'text', 'span' and 'label' fields.
dataset = load_dataset("tomaarsen/setfit-absa-semeval-restaurants")

# ============================================
# FIXED: Better data preparation
# ============================================
def prepare_sentiment_data(dataset_split):
    """Turn aspect-level rows into examples for 3-way sentiment classification.

    Each kept row becomes ``{"text": "<span>: <text>", "label": <int>}`` with
    negative -> 0, neutral -> 1, positive -> 2; "conflict" is folded into
    neutral (1). Rows whose label is empty or not one of those four names are
    dropped. The number of kept rows (and of dropped rows, if any) is printed.

    Args:
        dataset_split: Iterable of rows exposing 'label', 'span' and 'text'.

    Returns:
        A ``Dataset`` built from the kept examples, in input order.
    """
    accepted_labels = ("positive", "neutral", "negative", "conflict")
    label_to_id = {
        "positive": 2,
        "neutral": 1,
        "negative": 0,
        "conflict": 1,  # conflict is treated as neutral
    }

    examples = []
    skipped = 0

    for row in dataset_split:
        raw_label = row['label']

        # Keep only rows with a non-empty, recognised label
        if raw_label and raw_label in accepted_labels:
            examples.append({
                # The aspect is prepended so the classifier knows which target to judge
                "text": f"{row['span']}: {row['text']}",
                "label": label_to_id[raw_label],
            })
        else:
            skipped += 1

    print(f"  Kept: {len(examples)} examples")
    if skipped > 0:
        print(f"  Skipped: {skipped} examples (invalid labels)")

    return Dataset.from_list(examples)

# Convert the raw training split into "<aspect>: <sentence>" classification examples
print("Processing training data...")
full_train = prepare_sentiment_data(dataset["train"])

# Hold out 20% of those examples for validation; the fixed seed makes the split repeatable
print("\nSplitting into train/validation...")
train_test_split = full_train.train_test_split(test_size=0.2, seed=42)
train_sentiment, test_sentiment = train_test_split["train"], train_test_split["test"]

print(f"\nFinal split:")
print(f"  Training: {len(train_sentiment)} examples")
print(f"  Validation: {len(test_sentiment)} examples")

# Count how often each label id occurs in the training portion
print("\nLabel distribution in training set:")
label_counts = {}
for example in train_sentiment:
    label = example['label']
    label_counts.setdefault(label, 0)
    label_counts[label] += 1

# Report the classes in ascending id order together with their share of the training set
for label_id in sorted(label_counts):
    count = label_counts[label_id]
    label_name = {0: "negative", 1: "neutral", 2: "positive"}[label_id]
    print(f"  {label_name}: {count} ({count/len(train_sentiment)*100:.1f}%)")

print(f"\nSample:", train_sentiment[0])

# Tokenizer matching the bert-base-uncased checkpoint used for the classifier
print("\nLoading tokenizer...")
sentiment_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``sentiment_tokenizer``.

    Args:
        examples: Mapping whose "text" entry holds the input string(s).

    Returns:
        The tokenizer output, with every sequence truncated and padded to
        exactly 128 tokens.
    """
    texts = examples["text"]
    encoded = sentiment_tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded

# Tokenize datasets (batched=True hands tokenize_function whole batches at a time)
print("Tokenizing datasets...")
train_sentiment = train_sentiment.map(tokenize_function, batched=True)
test_sentiment = test_sentiment.map(tokenize_function, batched=True)

# Return only the model inputs and the label, as torch tensors
train_sentiment.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_sentiment.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Initialize sentiment model: bert-base-uncased with a 3-class sequence-classification head
print("Loading BERT model...")
sentiment_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2}
)

# Training arguments: evaluate and checkpoint once per epoch, then restore the
# checkpoint with the lowest eval_loss at the end.
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,  # One more epoch than the aspect model
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    warmup_steps=100  # learning-rate warmup over the first 100 steps
)

# Train sentiment model
trainer = Trainer(
    model=sentiment_model,
    args=training_args,
    train_dataset=train_sentiment,
    eval_dataset=test_sentiment,
    # `tokenizer=` is deprecated in Trainer and removed in transformers v5 (which
    # `pip install -U transformers` will pull in); `processing_class=` is its replacement.
    processing_class=sentiment_tokenizer
)

print("\n" + "="*60)
print("TRAINING SENTIMENT CLASSIFIER")
print("="*60)
trainer.train()

# Save sentiment model and tokenizer into the same directory so it can be reloaded
# with from_pretrained("./sentiment_model")
print("\nSaving model...")
trainer.save_model("./sentiment_model")
sentiment_tokenizer.save_pretrained("./sentiment_model")
print("✓ Sentiment model saved to ./sentiment_model/")
print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)

In [None]:
from transformers import BertForTokenClassification, BertForSequenceClassification, AutoTokenizer
import torch

# Load both fine-tuned models and their tokenizers from the directories written
# by the two training cells.
print("Loading models...")
aspect_model = BertForTokenClassification.from_pretrained("./aspect_model")
aspect_tokenizer = AutoTokenizer.from_pretrained("./aspect_model")
sentiment_model = BertForSequenceClassification.from_pretrained("./sentiment_model")
sentiment_tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

# Inference mode for both models (affects layers such as dropout)
aspect_model.eval()
sentiment_model.eval()
print("✓ Models loaded!\n")

# Extract aspects function
def extract_aspects(text, model, tokenizer):
    """Extract aspect terms from ``text`` using a BIO token-classification model.

    Args:
        text: Sentence to analyse (truncated to 128 tokens).
        model: Token-classification model; called as ``model(**inputs)``, must
            expose ``.logits`` on its output and ``config.id2label`` with the
            tag names "O", "B-ASP" and "I-ASP".
        tokenizer: Tokenizer matching ``model``; must provide
            ``convert_ids_to_tokens``.

    Returns:
        List of aspect strings in order of appearance, rebuilt from tokenizer
        tokens (so they follow the tokenizer's normalisation, not the exact
        surface form of ``text``).

    Notes:
        Word-piece continuation tokens ("##...") are always attached to the
        aspect that is currently open, independent of their own predicted tag.
        This keeps a word whole when the model tags its pieces inconsistently
        (e.g. "su" = B-ASP, "##shi" = O now yields "sushi", not "su").
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)  # best label id per token

    # Single sentence, so only batch index 0 is read
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [model.config.id2label[pred.item()] for pred in predictions[0]]

    aspects = []
    current_aspect = ""
    for token, label in zip(tokens, labels):
        if token in ("[CLS]", "[SEP]", "[PAD]"):
            continue

        is_continuation = token.startswith("##")
        if is_continuation and current_aspect:
            # Same word as the previous token: extend the open aspect unconditionally
            current_aspect += token[2:]
        elif label == "B-ASP":
            # New aspect begins; emit the one that was open, if any
            if current_aspect:
                aspects.append(current_aspect.strip())
            current_aspect = token.replace("##", "")
        elif label == "I-ASP":
            # Whole-word tokens are joined with a space; a "##" piece can only
            # get here when no aspect is open yet
            if is_continuation:
                current_aspect += token.replace("##", "")
            else:
                current_aspect += " " + token
        else:
            # "O" on a whole-word token closes the open aspect
            if current_aspect:
                aspects.append(current_aspect.strip())
                current_aspect = ""

    # Emit an aspect that extends to the end of the sequence
    if current_aspect:
        aspects.append(current_aspect.strip())
    return aspects

# Analyze sentiment function
def analyze_sentiment(aspect, text, model, tokenizer):
    """Classify the sentiment expressed towards ``aspect`` in ``text``.

    The classifier input is the string "<aspect>: <text>", truncated to 128 tokens.

    Args:
        aspect: Aspect term to judge.
        text: Full sentence containing the aspect.
        model: Sequence-classification model exposing ``.logits`` on its output
            and ``config.id2label``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Tuple ``(sentiment, confidence)``: the name of the highest-probability
        class and its softmax probability as a Python float.
    """
    encoded = tokenizer(
        f"{aspect}: {text}",
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )

    with torch.no_grad():
        class_probs = torch.softmax(model(**encoded).logits, dim=1)
        best_idx = torch.argmax(class_probs, dim=1).item()

    # Single input, so row 0 holds its class probabilities
    return model.config.id2label[best_idx], class_probs[0][best_idx].item()

# TEST EXAMPLES: short reviews run through the full pipeline
test_texts = [
    "The sushi was fresh and the drinks were cold.",
    "Great food but terrible service.",
    "The atmosphere was cozy but the prices were too high."
]

for text in test_texts:
    print("="*60)
    print(f"TEXT: {text}")
    print("="*60)

    # Stage 1: find the aspect terms
    aspects = extract_aspects(text, aspect_model, aspect_tokenizer)
    print(f"Found aspects: {aspects}\n")

    # Stage 2: classify the sentiment towards each aspect within the same sentence
    for aspect in aspects:
        sentiment, confidence = analyze_sentiment(aspect, text, sentiment_model, sentiment_tokenizer)
        # Bullet and arrow are real Unicode characters (previously stored mis-decoded)
        print(f"  • {aspect:15} → {sentiment:8} ({confidence:.1%})")
    print()

In [None]:
# Run this to see the problem:
from datasets import load_dataset
dataset = load_dataset("tomaarsen/setfit-absa-semeval-restaurants")

# Tally how many training rows carry each raw label value
label_counts = {}
for item in dataset["train"]:
    label = item['label']
    label_counts.setdefault(label, 0)
    label_counts[label] += 1

# Most frequent label first, with its share of all training rows
print("Label distribution in dataset:")
for label, count in sorted(label_counts.items(), key=lambda pair: -pair[1]):
    print(f"  {label}: {count} ({count/len(dataset['train'])*100:.1f}%)")

In [None]:
# More test examples, grouped by the kind of behaviour they probe
test_texts = [
    # Mixed sentiments
    "The pasta was delicious but the waiter was rude.",
    "Loved the ambiance, hated the music.",
    "The steak was overcooked but the wine was excellent.",

    # All positive
    "Amazing pizza, friendly staff, and great location!",
    "The dessert was heavenly and the coffee was perfect.",

    # All negative
    "Terrible food, slow service, and dirty tables.",
    "The soup was cold and the bread was stale.",

    # Neutral-ish
    "The menu had many options and the restaurant was busy.",

    # Multi-word aspects
    "The fish tacos were fresh but the french fries were soggy.",
    "The ice cream was amazing but the apple pie was disappointing."
]

for text in test_texts:
    print("="*60)
    print(f"TEXT: {text}")
    print("="*60)

    # Stage 1: find the aspect terms
    aspects = extract_aspects(text, aspect_model, aspect_tokenizer)
    print(f"Found aspects: {aspects}\n")

    # Stage 2: classify the sentiment towards each aspect within the same sentence
    for aspect in aspects:
        sentiment, confidence = analyze_sentiment(aspect, text, sentiment_model, sentiment_tokenizer)
        # Bullet and arrow are real Unicode characters (previously stored mis-decoded)
        print(f"  • {aspect:15} → {sentiment:8} ({confidence:.1%})")
    print()

In [None]:
!pip install -q huggingface_hub

# Login to Hugging Face
from huggingface_hub import login
login()

In [None]:
from huggingface_hub import HfApi

# Replace with your Hugging Face username!
USERNAME = "AnasAhmadz"  # ⚠️ CHANGE THIS!

# Upload Aspect Model: model and tokenizer go to the same repository
print("Uploading Aspect Extraction Model...")
aspect_model.push_to_hub(f"{USERNAME}/aspect-extraction-bert")
aspect_tokenizer.push_to_hub(f"{USERNAME}/aspect-extraction-bert")
print("✓ Aspect model uploaded!")

# Upload Sentiment Model: model and tokenizer go to the same repository
print("\nUploading Sentiment Model...")
sentiment_model.push_to_hub(f"{USERNAME}/aspect-sentiment-bert")
sentiment_tokenizer.push_to_hub(f"{USERNAME}/aspect-sentiment-bert")
print("✓ Sentiment model uploaded!")

# Print the resulting repository URLs
print("\n" + "="*60)
print("🎉 BOTH MODELS UPLOADED!")
print("="*60)
print(f"Aspect Model: https://huggingface.co/{USERNAME}/aspect-extraction-bert")
print(f"Sentiment Model: https://huggingface.co/{USERNAME}/aspect-sentiment-bert")