# Mistral-7B Model Evaluation

<hr> <hr>


### Installing Required Libraries
This cell installs the libraries necessary for model evaluation, such as `datasets`, `transformers`, and `evaluate`.
Ensure that you have an active internet connection while running this cell.


In [None]:
%%capture
!pip install datasets
!pip install bitsandbytes peft trl accelerate transformers huggingface_hub
!pip install evaluate
!pip install sacrebleu rouge_score jiwer cer

## Importing Model from Huggingface Hub


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name = "roger33303/mistral-7b-Instruct-Finetune-website-QnA"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,device_map="cuda:0")

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

# Testing Model

In [None]:
from transformers import TextStreamer
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
def chatml(question,model):
    # FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer([alpaca_prompt.format(question, "", "", )],
                       return_tensors = "pt").to("cuda")
    text_streamer = TextStreamer(tokenizer,skip_special_tokens=True,
                                 skip_prompt=True)
    return model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512)

In [None]:
question = "Which course is related to AI and Communication at westminster?"
x = chatml(question,model)

Yes, the University of Westminster provides a course on AI and Communication:
Course title: AI and Communication, Collage: University of Westminster, Campus: Regent, Central London. For full-time course: UK fees is £9,250 and international fees is £17,000. Duration of the course is 1 year, link: https://www.westminster.ac.uk/computer-science-and-engineering-courses/2025-26/september/full-time/ai-and-communication.



## Evaluating Model Performance


In [None]:
import torch
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm
import pandas as pd

In [None]:
eval_data = load_dataset("csv", data_files='/content/drive/MyDrive/msc_project/mistral_test_data.csv', split='train')

Generating train split: 0 examples [00:00, ? examples/s]

## Loading test dataset and formating it and generating output from llm for evaluation

In [None]:
y_true =[]
y_pred=[]
model.eval()
eval_loader = DataLoader(eval_data, batch_size=1, shuffle=False)
for batch in tqdm(eval_loader):
        query = batch["instructions"][0]
        ot = batch['output'][0]
        y_true.append(alpaca_prompt.format(query, "", ot,))
        tklen = len(query) + 4
        inputs = tokenizer([alpaca_prompt.format(query, "","",)], return_tensors = "pt")
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=512)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        y_pred.append(generated_text)


100%|██████████| 69/69 [08:04<00:00,  7.02s/it]


In [None]:
mistral_y_pred_y_test = pd.DataFrame({"y_pred":y_pred,"y_test":y_true})

In [None]:
mistral_y_pred_y_test.to_csv("/content/drive/MyDrive/msc_project/mistral_y_pred_y_test.csv",index=False)

# Evaluation

In [None]:
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
import torch
import evaluate
from evaluate import load
from tqdm import tqdm
import pandas as pd

gemma_y_pred_y_test = pd.read_csv("/content/drive/MyDrive/msc_project/mistral_y_pred_y_test.csv")

y_pred  = gemma_y_pred_y_test["y_pred"].tolist()
y_true = gemma_y_pred_y_test["y_test"].tolist()

sacrebleu = evaluate.load("sacrebleu")
results = sacrebleu.compute(predictions=y_pred, references=y_true)
results

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

{'score': 76.66157402657194,
 'counts': [9552, 8807, 8326, 8001],
 'totals': [10700, 10631, 10562, 10493],
 'precisions': [89.27102803738318,
  82.84263004421032,
  78.82976708956637,
  76.25083388925951],
 'bp': 0.9388645004566223,
 'sys_len': 10700,
 'ref_len': 11375}

## Rouge Score

In [None]:
rouge = evaluate.load('rouge')
result1 = rouge.compute(predictions=y_pred, references=y_true)
result1

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.8790879423566391,
 'rouge2': 0.8156524117241757,
 'rougeL': 0.8728056902463668,
 'rougeLsum': 0.8784102527524325}

In [None]:
scoreboard = {}
scoreboard.update({'bleu_score':results['score']})

## CER Score

In [None]:
exact_match_metric = evaluate.load("character")
results2 = exact_match_metric.compute(predictions=y_pred, references=y_true)
results2

Downloading builder script:   0%|          | 0.00/7.99k [00:00<?, ?B/s]

{'cer_score': 0.23424067418286332}

## Meteor Score

In [None]:
meteor = evaluate.load('meteor')
result3 = meteor.compute(predictions=y_pred, references=y_true)
result3

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


{'meteor': 0.8079513016029687}

## Rouge Score

In [None]:
rouge = evaluate.load('rouge')
result4 = rouge.compute(predictions=y_pred, references=y_true)
result4

{'rouge1': 0.8790879423566391,
 'rouge2': 0.8156524117241757,
 'rougeL': 0.8728056902463668,
 'rougeLsum': 0.8784102527524325}

In [None]:
scoreboard.update(results2)
scoreboard.update(result3)
scoreboard.update(result4)
scoreboard.update({'model':'mistral'})

In [None]:
scoreboard

{'bleu_score': 76.66157402657194,
 'cer_score': 0.23424067418286332,
 'meteor': 0.8079513016029687,
 'rouge1': 0.8790879423566391,
 'rouge2': 0.8156524117241757,
 'rougeL': 0.8728056902463668,
 'rougeLsum': 0.8784102527524325,
 'model': 'mistral'}

In [None]:
x = pd.DataFrame.from_dict([scoreboard])

In [None]:
x

Unnamed: 0,bleu_score,cer_score,meteor,rouge1,rouge2,rougeL,rougeLsum,model
0,76.661574,0.234241,0.807951,0.879088,0.815652,0.872806,0.87841,mistral


## Saving Score Data

In [None]:
import pandas as pd
try:
    df = pd.read_csv("/content/drive/MyDrive/msc_project/model_results.csv")
    df = pd.concat([df,x])
    df.reset_index(drop=True, inplace=True)
    df.to_csv("/content/drive/MyDrive/msc_project/model_results.csv",index=False)
except:
    x.to_csv("/content/drive/MyDrive/msc_project/model_results.csv",index=False)
finally:
    df = pd.read_csv("/content/drive/MyDrive/msc_project/model_results.csv")

## Score Card

In [None]:
df

Unnamed: 0,bleu_score,cer_score,meteor,rouge1,rouge2,rougeL,rougeLsum,model
0,49.938569,0.450135,0.796419,0.69439,0.654254,0.686169,0.69231,gemma
1,76.661574,0.234241,0.807951,0.879088,0.815652,0.872806,0.87841,mistral
