### Group 100 -  Conversational AI Assignment 2


```
1. Amit Kumar Sharma      2023ac05454       100%

2. Mohammed Faisal Sait   2023aa05525       100%

3. Chachiya Faiz Arif     2023ac05420       100%

4. Parveen Kumar          2023ac05467       100%

5. Sachchinda Nand Singh  2023ac05002       100%

In [1]:
%pip install transformers datasets peft accelerate evaluate

Note: you may need to restart the kernel to use updated packages.


### 1. Imports

In [None]:
# ============================================
## importing libraries
# ============================================
import time
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    TrainingArguments, Trainer, pipeline,
    DataCollatorForSeq2Seq,EarlyStoppingCallback
)
from datasets import Dataset
import evaluate
from peft import LoraConfig, get_peft_model, PeftModel
import os

### Generate df from Q&A csv

In [4]:
# List of CSV files
csv_paths = [
    "../data/Q&A/qa_dataset.csv",
    "../data/Q&A/new_financial_qa.csv"
]

# Read and concatenate all CSV files
df_list = []
for path in csv_paths:
    if os.path.exists(path):
        df = pd.read_csv(path)
        df_list.append(df)
    else:
        print(f"Warning: File not found: {path}")

# Concatenate all dataframes into one
df = pd.concat(df_list, axis=0, ignore_index=True)

print(f"Loaded combined dataset with {len(df)} samples.")
print(df.head())

Loaded combined dataset with 98 samples.
                                            Question         Answer
0              What was SAP's total revenue in 2023?  €31.2 billion
1              What was SAP's cloud revenue in 2023?  €13.6 billion
2  What was SAP's software licenses revenue in 2023?   €2.7 billion
3           What was SAP's services revenue in 2023?   €4.3 billion
4           What was SAP's operating profit in 2023?   €5.8 billion


### Create dataset for fine tuning

In [5]:
dataset = Dataset.from_pandas(df)

### Generate Train and Test dataset 20%

In [6]:
# Train/test split
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

### 3.2 Model Selection

In [None]:
# ============================================
# Tokenizer & Model Selection
# ============================================
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],  # for T5
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(base_model, lora_config)

In [None]:
# ============================================
# Preprocessing function (mask pad tokens)
# ============================================
def preprocess_function(examples):
    inputs = ["question: " + q.strip() for q in examples["Question"]]
    targets = [a.strip() for a in examples["Answer"]]

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding="max_length"
        ).input_ids

    # Mask pad tokens in labels
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels
    ]
    model_inputs["labels"] = labels
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 78/78 [00:00<00:00, 718.05 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 1594.40 examples/s]


In [None]:
# ==========================================
# 5. Data collator
# =======================================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### 3.3 Baseline Benchmarking (Pre-Fine-Tuning)

In [None]:
# ============================================
# Benchmarking function
# ========================================
def benchmark_model(model, tokenizer, test_df, n=10):
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

    logs = []
    correct = 0

    for _, row in test_df.head(n).iterrows():
        q = row["Question"]
        a_true = str(row["Answer"]).strip()

        prompt = "question: " + q

        start_time = time.time()
        output = pipe(prompt, max_new_tokens=32, do_sample=False)
        end_time = time.time()

        a_pred = output[0]['generated_text'].strip()

        # Confidence proxy
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=32,
                return_dict_in_generate=True,
                output_scores=True
            )
        conf = None
        if outputs.scores:
            probs = torch.nn.functional.softmax(outputs.scores[0][0], dim=-1)
            conf = probs.max().item()

        # Manual accuracy: case-insensitive substring match
        acc = 1 if a_true.lower() in a_pred.lower() else 0
        correct += acc

        logs.append({
            "Question": q,
            "True Answer": a_true,
            "Predicted Answer": a_pred,
            "Accuracy": acc,
            "Confidence": round(conf, 4) if conf else None,
            "Inference Time (s)": round(end_time - start_time, 3)
        })

    avg_accuracy = correct / n
    df_log = pd.DataFrame(logs)
    print("\nAverage Accuracy:", round(avg_accuracy, 4))
    return df_log


# =========================================
# Baseline before finetuning
# ============================================
print("Baseline Model Evaluation...")
baseline_results = benchmark_model(model, tokenizer, test_dataset.to_pandas(), n=10)
print(baseline_results)


Device set to use cpu


Baseline Model Evaluation...

Average Accuracy: 0.0
                                            Question  \
0  What is SAP’s capital stock as of December 31,...   
1          What is SAP Business Technology Platform?   
2  What percentage of revenue came from the Ameri...   
3       How many customers does SAP serve worldwide?   
4  How much does SAP plan to invest in AI over th...   
5  What percentage of the world's GDP touches SAP...   
6              How much was support revenue in 2023?   
7                            What is SAP Datasphere?   
8                 When did Christian Klein join SAP?   
9        What is SAP's Intelligent Spend Management?   

                                         True Answer  \
0  SAP’s capital stock as of December 31, 2023, w...   
1     A platform integrating data, AI, and analytics   
2  The Americas contributed 41% to total revenue ...   
3                             Over 400,000 customers   
4                                    Over €1 billio

### 3.4 Fine-Tuning

In [None]:

training_args = TrainingArguments(
    output_dir="./finetuned_model",
    eval_strategy="epoch",
    metric_for_best_model="eval_loss",
    greater_is_better=False,          # Because lower loss is better
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=80,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    save_strategy="epoch",
    fp16=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()


  trainer = Trainer(
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss
1,3.7787,3.182318
2,3.6893,3.124519
3,3.8798,3.05159
4,3.5733,2.971312
5,3.3188,2.895342
6,3.2395,2.837496
7,3.2165,2.799684
8,3.04,2.772446
9,3.4777,2.749204
10,3.6043,2.732127


TrainOutput(global_step=1600, training_loss=2.8769822406768797, metrics={'train_runtime': 781.8814, 'train_samples_per_second': 7.981, 'train_steps_per_second': 2.046, 'total_flos': 293286916915200.0, 'train_loss': 2.8769822406768797, 'epoch': 80.0})

### Benchmarking model

In [None]:
def benchmark_model(model, tokenizer, test_df, n=10):
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

    logs = []
    correct = 0
    total_time = 0

    for i, row in test_df.head(n).iterrows():
        q = row["Question"]
        a_true = str(row["Answer"]).strip()

        start_time = time.time()
        output = pipe(q, max_length=256, num_return_sequences=1)  # removed return_full_text
        end_time = time.time()

        a_pred = output[0]['generated_text'].strip()
        total_time += (end_time - start_time)

        # Normalize for comparison
        norm_true = a_true.lower().replace(",", "")
        norm_pred = a_pred.lower().replace(",", "")

        acc = 1 if norm_true in norm_pred else 0
        correct += acc

        logs.append({
            "Question": q,
            "True Answer": a_true,
            "Predicted Answer": a_pred,
            "Match": bool(acc),
            "Inference Time (s)": round(end_time - start_time, 3)
        })

    accuracy = correct / n
    avg_time = total_time / n

    print(f"Average Accuracy: {accuracy*100:.2f}%")
    print(f"Average Inference Time: {avg_time:.3f}s")

    return pd.DataFrame(logs)


# ==========================================
# Post-finetuning evaluation
# ============================================
print("Post-Finetuning Model Evaluation...")
finetuned_results = benchmark_model(model, tokenizer, test_dataset.to_pandas(), n=10)
print(finetuned_results)


Device set to use mps:0


Post-Finetuning Model Evaluation...


  test_elements = torch.tensor(test_elements)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/trans

Average Accuracy: 0.00%
Average Inference Time: 3.709s
                                            Question  \
0  What is SAP’s capital stock as of December 31,...   
1          What is SAP Business Technology Platform?   
2  What percentage of revenue came from the Ameri...   
3       How many customers does SAP serve worldwide?   
4  How much does SAP plan to invest in AI over th...   
5  What percentage of the world's GDP touches SAP...   
6              How much was support revenue in 2023?   
7                            What is SAP Datasphere?   
8                 When did Christian Klein join SAP?   
9        What is SAP's Intelligent Spend Management?   

                                         True Answer  \
0  SAP’s capital stock as of December 31, 2023, w...   
1     A platform integrating data, AI, and analytics   
2  The Americas contributed 41% to total revenue ...   
3                             Over 400,000 customers   
4                                    Over €1 bil

### Save FT model 

In [None]:
# ============================================
### Save fine-tuned adapter + tokenizer
# ============================================
save_path = "./qa_finetuned_model_saved"
model = model.merge_and_unload()

# Save full model + tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./qa_finetuned_model_saved/tokenizer_config.json',
 './qa_finetuned_model_saved/special_tokens_map.json',
 './qa_finetuned_model_saved/tokenizer.json')

### Test fine tuned model

In [17]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

save_path = "./qa_finetuned_model_saved"

# Reload merged model
model_ft = AutoModelForSeq2SeqLM.from_pretrained(save_path)
tokenizer_ft = AutoTokenizer.from_pretrained(save_path)

qa_pipeline_ft = pipeline("text2text-generation", model=model_ft, tokenizer=tokenizer_ft, device=-1)

question = "What was SAP's total revenue in 2023?"
prompt = "question: " + question
print(qa_pipeline_ft(prompt, max_new_tokens=32, do_sample=False)[0]["generated_text"])


Device set to use cpu


Total revenue in 2023 was €12.8 billion.


### Testing FT model

In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
m = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small", low_cpu_mem_usage=True)
pipe = pipeline("text2text-generation", model=m, tokenizer=tok, device=-1)
print(pipe("What was SAP's cloud revenue in 2023?", max_new_tokens=16))

Device set to use cpu


[{'generated_text': 'net profit'}]


## Testing FT model

In [24]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
path = "qa_finetuned_model_saved"

tokenizer = AutoTokenizer.from_pretrained(path)

# Try loading with conservative settings
model = AutoModelForSeq2SeqLM.from_pretrained(
    path,
    torch_dtype="auto",            # let HF pick; change to torch.float32 if needed
    low_cpu_mem_usage=True,        # reduces memory peak on CPU
    device_map=None                # ensure CPU
)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)
print(pipe("What was SAP's cloud revenue in 2024?", max_new_tokens=32))

Device set to use cpu


[{'generated_text': "SAP's cloud revenue in 2024 accounted for 36% of its cloud revenue in 2024."}]


In [20]:
finetuned_model_path = "qa_finetuned_model_saved"
tokenizer_ft = AutoTokenizer.from_pretrained(finetuned_model_path)
model_ft = AutoModelForSeq2SeqLM.from_pretrained(
    finetuned_model_path,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    device_map=None
)
qa_pipeline_ft = pipeline("text2text-generation", model=model_ft, tokenizer=tokenizer_ft, device=-1)
print("Fine-tuned model loaded successfully.")
print(qa_pipeline_ft("What was SAP's cloud revenue in 2023?", max_new_tokens=64))

Device set to use cpu


Fine-tuned model loaded successfully.
[{'generated_text': "SAP's cloud revenue in 2023 was €13.6 billion."}]
