In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers accelerate bitsandbytes

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

In [4]:
merged_model_path = "/content/drive/MyDrive/Colab Notebooks/fine_tuning/qwen-merged"

In [5]:
print("üìÅ Files in merged model directory:")
if os.path.exists(merged_model_path):
    for f in sorted(os.listdir(merged_model_path))[:10]:
        print(f"  {f}")
else:
    print("‚ùå Directory not found! Check the path.")
    base_path = "/content/drive/MyDrive/Colab Notebooks/fine_tuning"
    if os.path.exists(base_path):
        print(f"\nüìÇ Available folders in {base_path}:")
        for item in os.listdir(base_path):
            print(f"  {item}")

üìÅ Files in merged model directory:
  chat_template.jinja
  config.json
  generation_config.json
  model.safetensors
  tokenizer.json
  tokenizer_config.json


In [6]:
print("\n‚è≥ Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    merged_model_path,
    padding_side="left",
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("‚ö†Ô∏è Set pad_token = eos_token")


‚è≥ Loading Tokenizer...


In [7]:
print("\n‚è≥ Loading Merged Model... (this may take 2-3 minutes)")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"üñ•Ô∏è Using device: {device}")

merged_model = AutoModelForCausalLM.from_pretrained(
    merged_model_path,
    torch_dtype=torch.float16,
)

`torch_dtype` is deprecated! Use `dtype` instead!



‚è≥ Loading Merged Model... (this may take 2-3 minutes)


Loading weights:   0%|          | 0/291 [00:01<?, ?it/s]



In [8]:
merged_model.config.use_cache = True
merged_model.eval()

print("‚úÖ Model loaded successfully!")
print(f"üìä Model device map: {merged_model.device if hasattr(merged_model, 'device') else 'See device_map above'}")

‚úÖ Model loaded successfully!
üìä Model device map: cpu


In [9]:
import pandas as pd
from datasets import Dataset

file_path = '/content/drive/MyDrive/Colab Notebooks/fine_tuning/amazon_product_details.csv'
df = pd.read_csv(file_path)

df['category'] = df['category'].apply(lambda x: x.split('|')[-1])

products = df[['category', 'product_name']].rename(columns={'product_name': 'text'})
description = df[['category', 'about_product']].rename(columns={'about_product': 'text'})

products['task_type'] = 'Product Name'
description['task_type'] = 'Product Description'

df_combined = pd.concat([products, description], ignore_index=True)

dataset = Dataset.from_pandas(df_combined)
dataset = dataset.shuffle(seed=0)
dataset = dataset.train_test_split(test_size=0.25)

print(f"‚úÖ Dataset loaded!")
print(f"   Train: {len(dataset['train'])} samples")
print(f"   Test:  {len(dataset['test'])} samples")
print(f"\nüìå Sample: {dataset['test'][0]}")

‚úÖ Dataset loaded!
   Train: 2197 samples
   Test:  733 samples

üìå Sample: {'category': 'InstantWaterHeaters', 'text': "Easy to Install, does not need space like other Geysers.|3-5 Seconds Instant Hot Water.|Heating tube: high-purity copper liner heating element Rated voltage: 220V/50HZ Rated Power: 3000W|The display will real-time display the current temperature.|A must kitchen accessory for housewives: your hands won't feel cold when washing dishes", 'task_type': 'Product Description'}


In [10]:
test_prompt = """Given the product category, you need to generate a 'Product Name'.
### Category: Smartphones
### Product Name:"""

inputs = tokenizer(test_prompt, return_tensors='pt', truncation=True,
                   max_length=400, padding="max_length")

with torch.no_grad():
    output = merged_model.generate(**inputs, max_new_tokens=100,
                                    repetition_penalty=1.15)

print("üß™ Quick Test Result:")
print(tokenizer.decode(output[0], skip_special_tokens=True))
print("‚úÖ Model is working!")

üß™ Quick Test Result:
Given the product category, you need to generate a 'Product Name'.
### Category: Smartphones
### Product Name:
Samsung Galaxy A71
‚úÖ Model is working!


In [11]:
def generate_prediction(model, tokenizer, sample, max_new_tokens=200):
    eval_prompt = f"""Given the product category, you need to generate a '{sample['task_type']}'.
### Category: {sample['category']}
### {sample['task_type']}:"""

    model_input = tokenizer(
        eval_prompt, return_tensors='pt', truncation=True,
        max_length=400, padding="max_length"
    )
    model_input = {k: v.to(model.device) for k, v in model_input.items()}

    with torch.no_grad():
        output = model.generate(
            **model_input,
            max_new_tokens=max_new_tokens,
            repetition_penalty=1.15,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    full_output = tokenizer.decode(output[0], skip_special_tokens=True)
    pred_text = full_output[len(eval_prompt):].strip()
    # Clean: remove anything after next ### if exists
    pred_text = pred_text.split("###")[0].strip()

    return pred_text, eval_prompt

In [14]:
import numpy as np
from tqdm import tqdm

In [15]:
num_test_samples = min(20, len(dataset['test']))
num_train_samples = min(10, len(dataset['train']))

def collect_predictions(data_split, num_samples, split_name):
    samples = data_split.select(range(num_samples))
    results = {'category': [], 'task_type': [], 'reference': [],
               'prediction': [], 'prompt': []}

    print(f"\nüîÑ Generating predictions for {split_name} ({num_samples} samples)...")
    for i in tqdm(range(num_samples)):
        sample = samples[i]
        pred, prompt = generate_prediction(merged_model, tokenizer, sample)
        results['category'].append(sample['category'])
        results['task_type'].append(sample['task_type'])
        results['reference'].append(sample['text'])
        results['prediction'].append(pred)
        results['prompt'].append(prompt)
    return results

test_results = collect_predictions(dataset['test'], num_test_samples, "Test")
train_results = collect_predictions(dataset['train'], num_train_samples, "Train")
print("‚úÖ All predictions generated!")


üîÑ Generating predictions for Test (20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [14:05<00:00, 42.26s/it]



üîÑ Generating predictions for Train (10 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [07:47<00:00, 46.80s/it]

‚úÖ All predictions generated!





In [17]:
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict
import os
from google.colab import drive

In [18]:
!pip install rouge-score nltk bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.1/61.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=dcc8dec09c74d8634e9b4304623b37c852567d3fb987111a95cf4cc595f1ad62
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [19]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score as nltk_meteor
from bert_score import score as bert_score_fn

In [21]:
import torch

def calculate_rouge(references, predictions):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for ref, pred in zip(references, predictions):
        if not pred.strip():
            pred = "empty"
        result = scorer.score(ref, pred)
        scores['rouge1'].append(result['rouge1'].fmeasure)
        scores['rouge2'].append(result['rouge2'].fmeasure)
        scores['rougeL'].append(result['rougeL'].fmeasure)
    return {k: np.mean(v) for k, v in scores.items()}, scores

def calculate_bleu(references, predictions):
    smoother = SmoothingFunction().method1
    bleu_scores = {'bleu1': [], 'bleu2': [], 'bleu3': [], 'bleu4': []}
    for ref, pred in zip(references, predictions):
        ref_tokens = nltk.word_tokenize(ref.lower())
        pred_tokens = nltk.word_tokenize(pred.lower()) if pred.strip() else ["empty"]
        bleu_scores['bleu1'].append(sentence_bleu([ref_tokens], pred_tokens,
                                    weights=(1, 0, 0, 0), smoothing_function=smoother))
        bleu_scores['bleu2'].append(sentence_bleu([ref_tokens], pred_tokens,
                                    weights=(0.5, 0.5, 0, 0), smoothing_function=smoother))
        bleu_scores['bleu3'].append(sentence_bleu([ref_tokens], pred_tokens,
                                    weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoother))
        bleu_scores['bleu4'].append(sentence_bleu([ref_tokens], pred_tokens,
                                    weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoother))
    return {k: np.mean(v) for k, v in bleu_scores.items()}, bleu_scores

def calculate_meteor(references, predictions):
    scores = []
    for ref, pred in zip(references, predictions):
        ref_tokens = nltk.word_tokenize(ref.lower())
        pred_tokens = nltk.word_tokenize(pred.lower()) if pred.strip() else ["empty"]
        score = nltk_meteor([ref_tokens], pred_tokens)
        scores.append(score)
    return {'meteor': np.mean(scores)}, scores

def calculate_bertscore(references, predictions):
    clean_preds = [p if p.strip() else "empty" for p in predictions]
    P, R, F1 = bert_score_fn(clean_preds, references, lang="en",
                              verbose=True, batch_size=16, device="cpu")
    return {
        'bertscore_precision': P.mean().item(),
        'bertscore_recall': R.mean().item(),
        'bertscore_f1': F1.mean().item()
    }, {'precision': P.tolist(), 'recall': R.tolist(), 'f1': F1.tolist()}

def calculate_perplexity(model, tokenizer, texts, max_length=400):
    device = next(model.parameters()).device
    model.eval()
    perplexities = []
    for text in tqdm(texts, desc="Calculating Perplexity"):
        if not text.strip():
            continue
        encodings = tokenizer(text, return_tensors='pt', truncation=True,
                              max_length=max_length)
        input_ids = encodings['input_ids'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            ppl = torch.exp(outputs.loss).item()
            if ppl < 10000:
                perplexities.append(ppl)
    return {
        'perplexity_mean': np.mean(perplexities),
        'perplexity_median': np.median(perplexities),
        'perplexity_std': np.std(perplexities)
    }, perplexities




merged_model = merged_model.to("cpu")

print("\n" + "="*70)
print("üìä CALCULATING METRICS FOR TEST SET")
print("="*70)

print("\nüìè ROUGE...")
test_rouge_avg, test_rouge_all = calculate_rouge(test_results['reference'], test_results['prediction'])

print("üìè BLEU...")
test_bleu_avg, test_bleu_all = calculate_bleu(test_results['reference'], test_results['prediction'])

print("üìè METEOR...")
test_meteor_avg, test_meteor_all = calculate_meteor(test_results['reference'], test_results['prediction'])

print("üìè BERTScore...")
test_bert_avg, test_bert_all = calculate_bertscore(test_results['reference'], test_results['prediction'])

print("üìè Perplexity...")
test_ppl_avg, test_ppl_all = calculate_perplexity(merged_model, tokenizer, test_results['prediction'])

print("\n" + "="*70)
print("üìä CALCULATING METRICS FOR TRAIN SET")
print("="*70)

print("\nüìè ROUGE...")
train_rouge_avg, train_rouge_all = calculate_rouge(train_results['reference'], train_results['prediction'])

print("üìè BLEU...")
train_bleu_avg, train_bleu_all = calculate_bleu(train_results['reference'], train_results['prediction'])

print("üìè METEOR...")
train_meteor_avg, train_meteor_all = calculate_meteor(train_results['reference'], train_results['prediction'])

print("üìè BERTScore...")
train_bert_avg, train_bert_all = calculate_bertscore(train_results['reference'], train_results['prediction'])

print("üìè Perplexity...")
train_ppl_avg, train_ppl_all = calculate_perplexity(merged_model, tokenizer, train_results['prediction'])

print("\n‚úÖ All metrics calculated!")


üìä CALCULATING METRICS FOR TEST SET

üìè ROUGE...
üìè BLEU...
üìè METEOR...
üìè BERTScore...


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 57.40 seconds, 0.35 sentences/sec
üìè Perplexity...


Calculating Perplexity: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [01:42<00:00,  5.13s/it]



üìä CALCULATING METRICS FOR TRAIN SET

üìè ROUGE...
üìè BLEU...
üìè METEOR...
üìè BERTScore...


Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 56.58 seconds, 0.18 sentences/sec
üìè Perplexity...


Calculating Perplexity: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:56<00:00,  5.69s/it]


‚úÖ All metrics calculated!





In [23]:
import pandas as pd

metrics_data = {
    'Metric': [
        'ROUGE-1', 'ROUGE-2', 'ROUGE-L',
        'BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4',
        'METEOR',
        'BERTScore Precision', 'BERTScore Recall', 'BERTScore F1',
        'Perplexity (Mean)', 'Perplexity (Median)', 'Perplexity (Std)'
    ],
    'Test Set': [
        test_rouge_avg['rouge1'], test_rouge_avg['rouge2'], test_rouge_avg['rougeL'],
        test_bleu_avg['bleu1'], test_bleu_avg['bleu2'], test_bleu_avg['bleu3'], test_bleu_avg['bleu4'],
        test_meteor_avg['meteor'],
        test_bert_avg['bertscore_precision'], test_bert_avg['bertscore_recall'], test_bert_avg['bertscore_f1'],
        test_ppl_avg['perplexity_mean'], test_ppl_avg['perplexity_median'], test_ppl_avg['perplexity_std']
    ],
    'Train Set': [
        train_rouge_avg['rouge1'], train_rouge_avg['rouge2'], train_rouge_avg['rougeL'],
        train_bleu_avg['bleu1'], train_bleu_avg['bleu2'], train_bleu_avg['bleu3'], train_bleu_avg['bleu4'],
        train_meteor_avg['meteor'],
        train_bert_avg['bertscore_precision'], train_bert_avg['bertscore_recall'], train_bert_avg['bertscore_f1'],
        train_ppl_avg['perplexity_mean'], train_ppl_avg['perplexity_median'], train_ppl_avg['perplexity_std']
    ]
}

df_metrics = pd.DataFrame(metrics_data)

df_metrics['Test Set'] = df_metrics['Test Set'].apply(lambda x: f"{x:.4f}" if x < 10 else f"{x:.2f}")
df_metrics['Train Set'] = df_metrics['Train Set'].apply(lambda x: f"{x:.4f}" if x < 10 else f"{x:.2f}")

print("\n" + "=" * 70)
print("üìä COMPLETE EVALUATION RESULTS")
print("=" * 70)
print(df_metrics.to_string(index=False))

summary_data = {
    'Metric Group': ['ROUGE-L', 'BLEU-4', 'METEOR', 'BERTScore F1', 'Perplexity (Median)'],
    'Test': [
        f"{test_rouge_avg['rougeL']:.4f}",
        f"{test_bleu_avg['bleu4']:.4f}",
        f"{test_meteor_avg['meteor']:.4f}",
        f"{test_bert_avg['bertscore_f1']:.4f}",
        f"{test_ppl_avg['perplexity_median']:.2f}"
    ],
    'Train': [
        f"{train_rouge_avg['rougeL']:.4f}",
        f"{train_bleu_avg['bleu4']:.4f}",
        f"{train_meteor_avg['meteor']:.4f}",
        f"{train_bert_avg['bertscore_f1']:.4f}",
        f"{train_ppl_avg['perplexity_median']:.2f}"
    ],
    'Interpretation': [
        '‚¨ÜÔ∏è Higher = Better (word overlap)',
        '‚¨ÜÔ∏è Higher = Better (n-gram match)',
        '‚¨ÜÔ∏è Higher = Better (flexible match)',
        '‚¨ÜÔ∏è Higher = Better (semantic similarity)',
        '‚¨áÔ∏è Lower = Better (model confidence)'
    ]
}

df_summary = pd.DataFrame(summary_data)

print("\n" + "=" * 70)
print("üìã KEY METRICS SUMMARY")
print("=" * 70)
print(df_summary.to_string(index=False))

print("\n" + "=" * 70)
print("üîç ANALYSIS")
print("=" * 70)
print(f"""
üìå Key Findings:
   ‚Ä¢ BERTScore F1 (Test): {test_bert_avg['bertscore_f1']:.4f} ‚Üí Model captures MEANING well
   ‚Ä¢ ROUGE-L (Test):      {test_rouge_avg['rougeL']:.4f} ‚Üí Low word-level overlap (different wording)
   ‚Ä¢ BLEU-4 (Test):       {test_bleu_avg['bleu4']:.4f} ‚Üí Low exact n-gram match
   ‚Ä¢ Perplexity Median:   {test_ppl_avg['perplexity_median']:.2f} ‚Üí Model is confident on most outputs

üìå Gap Analysis (Train vs Test):
   ‚Ä¢ BERTScore F1 gap:    {abs(train_bert_avg['bertscore_f1'] - test_bert_avg['bertscore_f1']):.4f} ‚Üí {'‚úÖ Small gap (good generalization)' if abs(train_bert_avg['bertscore_f1'] - test_bert_avg['bertscore_f1']) < 0.05 else '‚ö†Ô∏è Notable gap'}
   ‚Ä¢ ROUGE-L gap:         {abs(train_rouge_avg['rougeL'] - test_rouge_avg['rougeL']):.4f} ‚Üí {'‚úÖ Small gap' if abs(train_rouge_avg['rougeL'] - test_rouge_avg['rougeL']) < 0.1 else '‚ö†Ô∏è Notable gap'}

üìå Conclusion:
   The model generates semantically relevant responses (high BERTScore)
   but uses different wording than references (low ROUGE/BLEU).
   This is COMMON for generative models ‚Äî they paraphrase rather than copy.
""")


üìä COMPLETE EVALUATION RESULTS
             Metric Test Set Train Set
            ROUGE-1   0.1179    0.2268
            ROUGE-2   0.0134    0.0270
            ROUGE-L   0.0773    0.1490
             BLEU-1   0.0556    0.0781
             BLEU-2   0.0127    0.0217
             BLEU-3   0.0047    0.0074
             BLEU-4   0.0024    0.0036
             METEOR   0.0605    0.0865
BERTScore Precision   0.8598    0.8643
   BERTScore Recall   0.7932    0.8049
       BERTScore F1   0.8247    0.8334
  Perplexity (Mean)   228.84   1847.86
Perplexity (Median)   5.7125    6.5539
   Perplexity (Std)   545.29   2712.24

üìã KEY METRICS SUMMARY
       Metric Group   Test  Train                           Interpretation
            ROUGE-L 0.0773 0.1490        ‚¨ÜÔ∏è Higher = Better (word overlap)
             BLEU-4 0.0024 0.0036        ‚¨ÜÔ∏è Higher = Better (n-gram match)
             METEOR 0.0605 0.0865      ‚¨ÜÔ∏è Higher = Better (flexible match)
       BERTScore F1 0.8247 0.8334 ‚¨ÜÔ∏è Hi