# Gemma-2-9b Model Evaluation

<hr> <hr>


### Installing Required Libraries
This cell installs the libraries necessary for model evaluation, such as `datasets`, `transformers`, and `evaluate`.
Ensure that you have an active internet connection while running this cell.


In [None]:
%%capture
!pip install datasets
!pip install bitsandbytes peft trl accelerate transformers huggingface_hub
!pip install gradio
!pip install evaluate
!pip install sacrebleu rouge_score jiwer cer

## Importing Model from Huggingface Hub


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "roger33303/gemma-2-9b-Instruct-Finetune-website-QnA"

### Initializing Tokenizer and Model
Here, we initialize the tokenizer and load the fine-tuned Gemma-2-9b model. The model is configured to utilize GPU resources
(if available) for faster inference and better performance.


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,device_map="cuda:0")

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Testing Model

In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

from transformers import TextStreamer
def chatml(question,model):
    inputs = tokenizer([alpaca_prompt.format(question, "", "", )], return_tensors = "pt").to("cuda")

    text_streamer = TextStreamer(tokenizer,skip_special_tokens=True,
                                 skip_prompt=True)

    return model.generate(**inputs, streamer = text_streamer,
                          max_new_tokens = 512,
                          do_sample = True,
                          temperature = 0.9,
                          top_p = 0.5,
                          top_k = 20,
                          repetition_penalty = 1.1,
                          eos_token_id = tokenizer.eos_token_id,
                          use_cache = True,
                          )


In [None]:
question = "Which course is related to AI and Communication at westminster?"
x = chatml(question,model)

Course title: AI and Communication MA, Collage: University of Westminster, Campus: Harrow, North-West London. For full-time course: UK fees is £9,700 and international fees is £17,500. Duration of the course is 1 year, link: https://www.westminster.ac.uk/media-and-communication-courses/2025-26/september/full-time/ai-and-communication-ma.


## Evaluating Model Performance


In [None]:
import torch
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm
import pandas as pd

In [None]:
eval_data = load_dataset("csv", data_files='/content/drive/MyDrive/msc_project/gemma_test_data.csv', split='train')

Generating train split: 0 examples [00:00, ? examples/s]

## Loading test dataset and formating it and generating output from llm for evaluation

In [None]:
y_true =[]
y_pred=[]
model.eval()
eval_loader = DataLoader(eval_data, batch_size=1, shuffle=False)
for batch in tqdm(eval_loader):
        query = batch["instructions"][0]
        ot = batch['output'][0]
        y_true.append(alpaca_prompt.format(query, "", ot,))
        tklen = len(query) + 4
        inputs = tokenizer([alpaca_prompt.format(query, "","",)], return_tensors = "pt")
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=512)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        y_pred.append(generated_text)


100%|██████████| 69/69 [32:49<00:00, 28.55s/it]


In [None]:
gemma_y_pred_y_test = pd.DataFrame({"y_pred":y_pred,"y_test":y_true})

In [None]:
gemma_y_pred_y_test.to_csv("/content/drive/MyDrive/msc_project/gemma_y_pred_y_test.csv",index=False)

# Evaluation

These cells evaluates the performance of the fine-tuned model using various metrics such as BLEU, ROUGE, and CER.
The goal is to measure the model's accuracy and the relevance of its outputs.

In [None]:
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
import torch
import evaluate
from evaluate import load
from tqdm import tqdm
import pandas as pd

gemma_y_pred_y_test = pd.read_csv("/content/drive/MyDrive/msc_project/gemma_y_pred_y_test.csv")

y_pred  = gemma_y_pred_y_test["y_pred"].tolist()
y_true = gemma_y_pred_y_test["y_test"].tolist()


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

{'score': 49.93856940340726,
 'counts': [10799, 10156, 9744, 9447],
 'totals': [20176, 20107, 20038, 19969],
 'precisions': [53.523988897700235,
  50.50977271596956,
  48.62760754566324,
  47.3083279082578],
 'bp': 1.0,
 'sys_len': 20176,
 'ref_len': 11292}

## Rouge Score evaluation

In [None]:
rouge = evaluate.load('rouge')
result1 = rouge.compute(predictions=y_pred, references=y_true)
result1

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.6943897345402694,
 'rouge2': 0.6542542984542257,
 'rougeL': 0.686169149696205,
 'rougeLsum': 0.6923101244243446}

In [None]:
scoreboard = {}
scoreboard.update({'bleu_score':results['score']})

## CER Score Evaluation

In [None]:
exact_match_metric = load("character")
results2 = exact_match_metric.compute(predictions=y_pred, references=y_true)
results2

Downloading builder script:   0%|          | 0.00/7.99k [00:00<?, ?B/s]

{'cer_score': 0.45013484791015}

## Meteor Score Evaluation

In [None]:
meteor = evaluate.load('meteor')
result3 = meteor.compute(predictions=y_pred, references=y_true)
result3

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


{'meteor': 0.7964187834026633}

## Roge Score Evaluation

In [None]:
rouge = evaluate.load('rouge')
result4 = rouge.compute(predictions=y_pred, references=y_true)
result4

{'rouge1': 0.6943897345402694,
 'rouge2': 0.6542542984542257,
 'rougeL': 0.686169149696205,
 'rougeLsum': 0.6923101244243446}

In [None]:
scoreboard.update(results2)
scoreboard.update(result3)
scoreboard.update(result4)
scoreboard.update({'model':'gemma'})

## Final evaluation Metrics

In [None]:
scoreboard

{'bleu_score': 49.93856940340726,
 'cer_score': 0.45013484791015,
 'meteor': 0.7964187834026633,
 'rouge1': 0.6943897345402694,
 'rouge2': 0.6542542984542257,
 'rougeL': 0.686169149696205,
 'rougeLsum': 0.6923101244243446,
 'model': 'gemma'}

In [None]:
x = pd.DataFrame.from_dict([scoreboard])

### Saving the model performance

In [None]:
import pandas as pd
try:
    df = pd.read_csv("/content/drive/MyDrive/msc_project/model_results.csv")
    df = pd.concat([df,x])
    df.reset_index(drop=True, inplace=True)
    df.to_csv("/content/drive/MyDrive/msc_project/model_results.csv",index=False)
except:
    x.to_csv("/content/drive/MyDrive/msc_project/model_results.csv",index=False)
finally:
    df = pd.read_csv("/content/drive/MyDrive/msc_project/model_results.csv")

## DataFrame for Model Evaluation

In [None]:
df

Unnamed: 0,bleu_score,cer_score,meteor,rouge1,rouge2,rougeL,rougeLsum,model
0,49.938569,0.450135,0.796419,0.69439,0.654254,0.686169,0.69231,gemma
