<a href="https://colab.research.google.com/github/Abhinanda24/LLM_lab/blob/main/Experiment_no_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import torch
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Checkpoint shared by every prompting strategy in this experiment.
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Use the GPU when one is available, and switch the model to eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# IMDB test split: shuffle with a fixed seed, then keep the first 100 reviews.
dataset = load_dataset("imdb", split="test").shuffle(seed=42).select(range(100))

# Review texts, plus their labels as strings (label 1 -> "Positive", anything else -> "Negative").
inputs = dataset["text"]
true_labels = np.array(
    ["Positive" if y == 1 else "Negative" for y in dataset["label"]]
)

# Show how many reviews of each class ended up in the sample.
print("\nClass Distribution:")
print(np.unique(true_labels, return_counts=True))
def zero_shot_prompt(text):
    """Return a prompt with an instruction and the review only (no worked examples).

    The prompt begins and ends with a newline and leaves "Answer:" open for
    the model to complete.
    """
    prompt_lines = (
        "",
        "Classify the sentiment of this movie review as Positive or Negative.",
        "Answer with only one word.",
        "",
        f"Review: {text}",
        "Answer:",
        "",
    )
    return "\n".join(prompt_lines)

def one_shot_prompt(text):
    """Return a prompt with one solved (positive) example followed by the review.

    The prompt begins and ends with a newline and leaves "Answer:" open for
    the model to complete.
    """
    prompt_lines = (
        "",
        "Classify the sentiment.",
        "",
        "Review: I loved this movie. It was fantastic.",
        "Answer: Positive",
        "",
        f"Review: {text}",
        "Answer:",
        "",
    )
    return "\n".join(prompt_lines)
def few_shot_prompt(text):
    """Return a prompt with three solved examples followed by the review.

    The examples are, in order, Positive, Negative, Positive. The prompt
    begins and ends with a newline and leaves "Answer:" open for the model
    to complete.
    """
    prompt_lines = (
        "",
        "Classify the sentiment.",
        "",
        "Review: I loved this movie.",
        "Answer: Positive",
        "",
        "Review: This movie was terrible.",
        "Answer: Negative",
        "",
        "Review: Amazing acting and great story.",
        "Answer: Positive",
        "",
        f"Review: {text}",
        "Answer:",
        "",
    )
    return "\n".join(prompt_lines)
def cot_prompt(text):
    """Return a chain-of-thought prompt: one example with reasoning, then the review.

    Unlike the other prompt builders, this one ends on an open "Reasoning:"
    line rather than "Answer:". The prompt begins and ends with a newline.
    """
    prompt_lines = (
        "",
        "Classify the sentiment step by step.",
        "",
        "Review: I loved this movie.",
        "Reasoning: The phrase shows strong positive emotion.",
        "Answer: Positive",
        "",
        f"Review: {text}",
        "Reasoning:",
        "",
    )
    return "\n".join(prompt_lines)


def generate_answer(prompt):
    """
    Run one prompt through the model and return the decoded reply.

    The prompt is tokenized (truncated to at most 512 tokens) and moved to
    `device`; the model then generates up to 15 new tokens. Only the first
    generated sequence is decoded, with special tokens dropped and
    surrounding whitespace stripped.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)

    generated = model.generate(**encoded, max_new_tokens=15)

    first_sequence = generated[0]
    reply = tokenizer.decode(first_sequence, skip_special_tokens=True)
    return reply.strip()

def _parse_sentiment(output):
    """Map raw model text to "Positive"/"Negative", or None if neither word occurs."""
    lowered = output.lower()
    # "positive" is tested first, so it wins when both words occur in the output.
    if "positive" in lowered:
        return "Positive"
    if "negative" in lowered:
        return "Negative"
    return None


def evaluate_prompting(strategy_function, name, max_chars=300):
    """
    Evaluate one prompting strategy on the sampled reviews and print its metrics.

    Reads the module-level `inputs` (review texts) and `true_labels`
    ("Positive"/"Negative" strings) and calls `generate_answer` once per review.

    Args:
        strategy_function: Callable that takes the (truncated) review text and
            returns the prompt string to send to the model.
        name: Human-readable strategy name used in the printed output.
        max_chars: Number of leading characters of each review passed to
            `strategy_function`. Defaults to 300.

    Returns:
        Tuple `(accuracy, f1)` with "Positive" as the positive class.
    """
    print(f"\nEvaluating {name}...\n")

    predictions = []
    unparsed = 0
    for text in tqdm(inputs):
        prompt = strategy_function(text[:max_chars])
        label = _parse_sentiment(generate_answer(prompt))
        if label is None:
            # The model named neither class. Keep the "Negative" fallback so the
            # metrics stay comparable, but count it so the bias is visible.
            unparsed += 1
            label = "Negative"
        predictions.append(label)

    # Binary encoding for scikit-learn: 1 = Positive, 0 = Negative.
    y_true = np.array([1 if x == "Positive" else 0 for x in true_labels])
    y_pred = np.array([1 if x == "Positive" else 0 for x in predictions])

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    # Pin the label order so the matrix stays 2x2 even when only one class
    # appears in both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    print(f"\n{name} Results:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print(f"Unparsed outputs (counted as Negative): {unparsed}")

    return acc, f1

# Run each prompting strategy over the same sampled reviews; every call prints
# its own metrics and returns an (accuracy, F1) pair.
zero_acc, zero_f1 = evaluate_prompting(zero_shot_prompt, "Zero-Shot")
one_acc, one_f1 = evaluate_prompting(one_shot_prompt, "One-Shot")
few_acc, few_f1 = evaluate_prompting(few_shot_prompt, "Few-Shot")
cot_acc, cot_f1 = evaluate_prompting(cot_prompt, "Chain-of-Thought")



# Side-by-side recap of accuracy and F1 for the four strategies.
print("\n============================")
print("FINAL SUMMARY")
print("============================")

print(f"Zero-Shot  → Acc: {zero_acc:.4f}, F1: {zero_f1:.4f}")
print(f"One-Shot   → Acc: {one_acc:.4f}, F1: {one_f1:.4f}")
print(f"Few-Shot   → Acc: {few_acc:.4f}, F1: {few_f1:.4f}")
print(f"CoT        → Acc: {cot_acc:.4f}, F1: {cot_f1:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/190 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]


Class Distribution:
(array(['Negative', 'Positive'], dtype='<U8'), array([53, 47]))

Evaluating Zero-Shot...



100%|██████████| 100/100 [00:43<00:00,  2.31it/s]



Zero-Shot Results:
Accuracy:  0.8500
Precision: 0.9000
Recall:    0.7660
F1-score:  0.8276
Confusion Matrix:
 [[49  4]
 [11 36]]

Evaluating One-Shot...



100%|██████████| 100/100 [00:33<00:00,  2.97it/s]



One-Shot Results:
Accuracy:  0.8500
Precision: 0.9000
Recall:    0.7660
F1-score:  0.8276
Confusion Matrix:
 [[49  4]
 [11 36]]

Evaluating Few-Shot...



100%|██████████| 100/100 [00:40<00:00,  2.49it/s]



Few-Shot Results:
Accuracy:  0.8500
Precision: 0.8810
Recall:    0.7872
F1-score:  0.8315
Confusion Matrix:
 [[48  5]
 [10 37]]

Evaluating Chain-of-Thought...



100%|██████████| 100/100 [00:54<00:00,  1.85it/s]


Chain-of-Thought Results:
Accuracy:  0.8500
Precision: 0.8636
Recall:    0.8085
F1-score:  0.8352
Confusion Matrix:
 [[47  6]
 [ 9 38]]

FINAL SUMMARY
Zero-Shot  → Acc: 0.8500, F1: 0.8276
One-Shot   → Acc: 0.8500, F1: 0.8276
Few-Shot   → Acc: 0.8500, F1: 0.8315
CoT        → Acc: 0.8500, F1: 0.8352



