In [None]:
#### Baseline: test Qwen's knowledge of cricket before fine-tuning ####

In [None]:
# Block 1: Install and import libraries and load model
!pip install transformers accelerate sentencepiece
!pip install -q transformers accelerate peft bitsandbytes trl datasets
!pip install -U bitsandbytes


from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the Qwen2.5 1.5B Instruct checkpoint and its tokenizer from the Hugging Face Hub.
# device_map="auto" leaves weight placement to the library.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # update to exact repo if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Block 2: Cricket fact prompts for the baseline evaluation.
# Ten factual questions (records, winners, venues) used to check what the
# base Qwen model answers before any fine-tuning.
cricket_prompts = [
    "Who won the ICC Cricket World Cup in 2019?",
    "Who holds the record for the highest individual score in Test cricket?",
    "Which country has won the most ICC T20 World Cups?",
    "Who is the all-time leading run scorer in One Day Internationals (ODIs)?",
    "Name the player with the most wickets in Test cricket history.",
    "Which cricket ground is known as the 'Home of Cricket'?",
    "Who captained India in the 2011 Cricket World Cup?",
    "Which team won the inaugural ICC Champions Trophy?",
    "Who scored the fastest century in ODI cricket?",
    "Which country hosted the 2007 ICC World Twenty20?"
]


In [None]:
# Block 3: Generate model outputs for evaluation
def generate_answer(prompt, max_tokens=100):
    """Generate the model's answer to ``prompt`` and return only the new text.

    Args:
        prompt: Question text, passed to the tokenizer as-is (no chat template).
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not part of the result.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Greedy decoding keeps the baseline reproducible from run to run; the
    # checkpoint's default generation settings may otherwise enable sampling.
    output = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)
    # The generated sequence starts with the prompt tokens; slice them off so the
    # printed answer does not repeat the question.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()

# Ask the model every benchmark question and print a Q/A transcript,
# one dashed rule between entries.
for prompt in cricket_prompts:
    answer = generate_answer(prompt)
    transcript = f"Q: {prompt}\nA: {answer}\n{'-'*50}"
    print(transcript)


Q: Who won the ICC Cricket World Cup in 2019?
A: Who won the ICC Cricket World Cup in 2019? India won the ICC Cricket World Cup in 2019. The team defeated New Zealand by a score of 35 runs in the final to claim their first-ever World Cup title.

What were some key moments during the Indian victory over New Zealand in the 2019 Cricket World Cup Final? During the Indian victory over New Zealand in the 2019 Cricket World Cup Final, there were several notable moments that contributed to the Indian win:

1. **Toss Decision
--------------------------------------------------
Q: Who holds the record for the highest individual score in Test cricket?
A: Who holds the record for the highest individual score in Test cricket? The current holder of the record for the highest individual score in Test cricket is Chris Gayle, who scored 243 runs against Bangladesh in November 2016. However, it's worth noting that this record was set at a time when Gayle had not yet retired from international cricket. S

In [None]:
### Milestone 1 completed -- the base Qwen model answers cricket-statistics questions incorrectly, so this is a good use case for fine-tuning

In [None]:
## Importing cricket fact data sets from Kaggle
import kagglehub
import os
import pandas as pd
import json

# Download the latest version of the dataset; kagglehub returns the local directory.
path = kagglehub.dataset_download("notkrishna/cricket-statistics-for-all-formats")

print("Path to dataset files:", path)

# Read from the directory kagglehub actually returned rather than assuming the
# Kaggle mount point, so the cell keeps working when the files land elsewhere.
input_dir = path
output_dir = "/kaggle/working/cleaned_csvs"

os.makedirs(output_dir, exist_ok=True)

# Sorted so the processing (and log) order is the same on every run.
csv_files = sorted(f for f in os.listdir(input_dir) if f.endswith(".csv"))

print("CSV files in dataset:", csv_files)

for csv_file in csv_files:
    input_path = os.path.join(input_dir, csv_file)
    df = pd.read_csv(input_path)

    # Minimal cleaning: drop rows in which every column is missing.
    df = df.dropna(how="all")

    cleaned_path = os.path.join(output_dir, "clean_" + csv_file)
    df.to_csv(cleaned_path, index=False)

    print(f"Saved cleaned CSV: {cleaned_path}")



Using Colab cache for faster access to the 'cricket-statistics-for-all-formats' dataset.
Path to dataset files: /kaggle/input/cricket-statistics-for-all-formats


In [None]:
## Prompt augmentation using templates: each fact is asked through several paraphrased questions that share the same answer, which multiplies the size of the data set
 ## and lets us see whether the model can memorize and reproduce the information or learn some patterns

# === PATHS (KAGGLE CORRECT) ===
# Directories first, then the files that live inside them.
input_dir = "/kaggle/input/cricket-statistics-for-all-formats"
output_dir = "/kaggle/working"

csv_file = os.path.join(input_dir, "tb.csv")
output_file = os.path.join(output_dir, "qa_tb_augmented_clean.jsonl")

# === LOAD CSV ===
# One row per player; the columns are looked up by the templates below.
df = pd.read_csv(csv_file)

# === Templates for augmentation ===
# Maps a CSV column name to the question phrasings whose answer is that column's
# value. Every template carries a {Player} placeholder that the generation loop
# fills from the row's "Player" field. Keys that are not columns of the loaded
# CSV are skipped by that loop.
templates = {
    'Span': [
        "What was {Player}'s Test career span?",
        "During which years did {Player} play Test cricket?",
        "Test career period of {Player}?"
    ],
    'Mat': [
        "How many Test matches did {Player} play?",
        "Number of Tests {Player} appeared in?",
        "Total Test games played by {Player}?",
        "Tests played by {Player}?"
    ],
    'Inns': [
        "How many innings did {Player} play in Test cricket?",
        "Total innings by {Player} in Tests?",
        "Number of times {Player} batted in Tests?"
    ],
    'NO': [
        "How many times was {Player} not out in Test cricket?",
        "Number of not outs by {Player} in Tests?"
    ],
    'Runs': [
        "How many runs did {Player} score in Test cricket?",
        "Total runs scored by {Player} in Tests?",
        "Runs by {Player} in Test matches?"
    ],
    'HS': [
        "What was {Player}'s highest score in Test cricket?",
        "Top individual Test score of {Player}?",
        "How many runs did {Player} score in his highest Test innings?",
        "Best Test innings score of {Player}?"
    ],
    'Ave': [
        "What was {Player}'s batting average in Test cricket?",
        "Average score of {Player} in Tests?",
        "Test batting average of {Player}?"
    ],
    '100': [
        "How many centuries did {Player} score in Test cricket?",
        "Number of Test hundreds by {Player}?"
    ],
    '50': [
        "How many half-centuries did {Player} score in Test cricket?",
        "Number of Test fifties by {Player}?"
    ],
    '0': [
        "How many ducks (scores of 0) did {Player} have in Tests?",
        "Number of times {Player} scored zero in Test cricket?"
    ]
}

# === GENERATE AUGMENTED QA PAIRS ===
# Keep only the templates whose key is an actual column of the CSV; the
# dictionary order (and therefore the output order) is unchanged.
usable_templates = {col: t_list for col, t_list in templates.items() if col in df.columns}

qa_augmented = []

for _, row in df.iterrows():
    player = row["Player"]

    for col, t_list in usable_templates.items():
        value = row[col]
        if pd.isna(value):
            # No answer available for this player/statistic.
            continue

        # One QA pair per phrasing, all sharing the same answer string.
        qa_augmented.extend(
            {"prompt": t.format(Player=player), "completion": str(value)}
            for t in t_list
        )

# === SAVE JSONL ===
# One JSON object per line.
with open(output_file, "w") as f:
    f.writelines(json.dumps(qa) + "\n" for qa in qa_augmented)

print(f"Generated {len(qa_augmented)} augmented QA pairs")
print(f"Saved to: {output_file}")

# === PREVIEW SOME SAMPLES ===
preview = qa_augmented[2000:2010]
for qa in preview:
    print(qa)


Generated 2716 augmented QA pairs
Saved to: /kaggle/working/qa_tb_augmented_clean.jsonl
{'prompt': 'How many runs did RR Sarwan (WI) score in Test cricket?', 'completion': '5842'}
{'prompt': 'Total runs scored by RR Sarwan (WI) in Tests?', 'completion': '5842'}
{'prompt': 'Runs by RR Sarwan (WI) in Test matches?', 'completion': '5842'}
{'prompt': "What was RR Sarwan (WI)'s highest score in Test cricket?", 'completion': '291'}
{'prompt': 'Top individual Test score of RR Sarwan (WI)?', 'completion': '291'}
{'prompt': 'How many runs did RR Sarwan (WI) score in his highest Test innings?', 'completion': '291'}
{'prompt': 'Best Test innings score of RR Sarwan (WI)?', 'completion': '291'}
{'prompt': "What was RR Sarwan (WI)'s batting average in Test cricket?", 'completion': '40.01'}
{'prompt': 'Average score of RR Sarwan (WI) in Tests?', 'completion': '40.01'}
{'prompt': 'Test batting average of RR Sarwan (WI)?', 'completion': '40.01'}


In [None]:
## Splitting into training and validation sets - this step is largely moot, since the goal is to test how well the model memorizes cricket facts;
## the final evaluation is therefore run on the training set itself to measure memorization/knowledge injection
import json
import os
import random  # needed for the shuffle below; it was never imported in the notebook

# === PATHS (KAGGLE SAFE) ===
input_jsonl = "/kaggle/working/qa_tb_augmented_clean.jsonl"
train_jsonl = "/kaggle/working/qa_tb_train.jsonl"
val_jsonl = "/kaggle/working/qa_tb_val.jsonl"

# === LOAD AUGMENTED DATA ===
qa_augmented = []
with open(input_jsonl, "r") as f:
    for line in f:
        qa_augmented.append(json.loads(line))

print(f"Loaded {len(qa_augmented)} QA pairs")

# === SHUFFLE ===
# Fixed seed so the split is identical on every run.
random.seed(42)   # reproducibility
random.shuffle(qa_augmented)

# === 80 / 20 SPLIT ===
# The split is made over individual QA pairs, so paraphrases of the same fact
# can end up on both sides of the cut.
cutoff = int(0.8 * len(qa_augmented))
train_qa = qa_augmented[:cutoff]
val_qa = qa_augmented[cutoff:]

# === SAVE TRAIN ===
with open(train_jsonl, "w") as f:
    for qa in train_qa:
        f.write(json.dumps(qa) + "\n")

# === SAVE VALIDATION ===
with open(val_jsonl, "w") as f:
    for qa in val_qa:
        f.write(json.dumps(qa) + "\n")

print(f"Train size: {len(train_qa)}")
print(f"Validation size: {len(val_qa)}")

# === PREVIEW VALIDATION ===
for qa in val_qa[:5]:
    print(qa)


Loaded 2716 QA pairs
Train size: 2172
Validation size: 544
{'prompt': 'Number of times SM Gavaskar (INDIA) batted in Tests?', 'completion': '214'}
{'prompt': 'Number of times MEK Hussey (AUS) batted in Tests?', 'completion': '137'}
{'prompt': 'Number of Tests MA Taylor (AUS) appeared in?', 'completion': '104'}
{'prompt': 'Number of times JL Langer (AUS) scored zero in Test cricket?', 'completion': '11'}
{'prompt': 'How many half-centuries did RB Richardson (WI) score in Test cricket?', 'completion': '27'}


In [None]:
## LOAD model libraries
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model


In [None]:
## LOAD tokenizer for the base model

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Fall back to EOS only when the checkpoint ships without a pad token.
# Reusing EOS as padding makes DataCollatorForLanguageModeling replace every EOS
# label with -100, so the model is never trained to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
## LOAD quantized model given our free colab memory limitations

from transformers import BitsAndBytesConfig

# Quantization recipe: 4-bit weights in NF4 format with double quantization,
# while computations run in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load the checkpoint with that recipe; device placement is left to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)



In [None]:
## LOAD lora configs

from peft import prepare_model_for_kbit_training

# The base model was loaded in 4-bit. PEFT's documented workflow for training on
# top of a quantized model is to run this preparation step before adding the
# adapters.
model = prepare_model_for_kbit_training(model)

# Rank-8 adapters on the attention query/value projections only; no bias terms
# are trained.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705


In [None]:
## split train and test data

from datasets import load_dataset

# JSONL splits produced by the train/validation split cell.
train_file = "/kaggle/working/qa_tb_train.jsonl"
val_file   = "/kaggle/working/qa_tb_val.jsonl"

# Name each split and hand the mapping to the generic "json" loader.
split_files = {"train": train_file, "validation": val_file}
dataset = load_dataset("json", data_files=split_files)

print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2172
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 544
    })
})


In [None]:
## Tokenize training data (padding is added per batch by the data collator)

def tokenize_fn(batch, max_length=512):
    """Tokenize a batch of QA pairs as ``"<prompt> <completion><eos>"`` strings.

    Args:
        batch: Dict with equally long lists under ``"prompt"`` and ``"completion"``.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer output (``input_ids`` / ``attention_mask``) with one
        variable-length entry per example. No padding, tensors or labels are
        produced here: the collator pads each batch to its longest sequence and
        derives the labels itself, so padding everything to ``max_length`` would
        only add compute on pad tokens.
    """
    # Terminate every example with EOS so the model sees where an answer ends.
    eos = tokenizer.eos_token or ""
    texts = [f"{p} {c}{eos}" for p, c in zip(batch["prompt"], batch["completion"])]

    return tokenizer(texts, truncation=True, max_length=max_length)

from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

# Tokenize both splits; `tokenized_dataset` was used below without ever being
# created. The raw text columns are dropped because the collator can only batch
# token fields.
tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal LM
)

train_loader = DataLoader(
    tokenized_dataset["train"],
    batch_size=4,
    shuffle=True,
    collate_fn=data_collator  # pads each batch and builds the labels
)
val_loader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=4,
    collate_fn=data_collator
)


In [None]:
## load Optimizer and training configs
from torch.optim import AdamW

from tqdm import tqdm
import torch

# Give the optimizer only the parameters that actually receive gradients (the
# LoRA adapters), so it matches the intent and does not track frozen weights.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=3e-4)

# The model was placed on its device when it was loaded with device_map="auto";
# read that device instead of calling .to() on the quantized model.
device = next(model.parameters()).device
model.train()

epochs = 3  # number of times we go over all training data

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    loop = tqdm(train_loader)
    for batch in loop:
        # The collator already returns tensors; move them without re-wrapping
        # (torch.tensor(tensor) makes an extra copy and emits a warning).
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass: compute predictions & loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass: compute gradients for LoRA layers
        optimizer.zero_grad()
        loss.backward()

        # Update LoRA weights
        optimizer.step()

        loop.set_description(f"Loss {loss.item():.4f}")



In [None]:
## Save trained model
from peft import PeftModel

# `model` is the LoRA-wrapped model produced by get_peft_model() and trained above.
# The same directory is read back later with PeftModel.from_pretrained().
save_dir = "/kaggle/working/qwen_lora_cricket"
model.save_pretrained(save_dir)

print(f"LoRA weights saved to {save_dir}")


LoRA weights saved to /kaggle/working/qwen_lora_cricket


In [None]:
## Load models for Inference
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Base model. It is loaded once: the previous version of this cell loaded the
# base model and the adapter twice, keeping two full copies in memory.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
base_model_name = model_name
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# float16 is sufficient here because the model is only used for inference.
# NOTE(review): recent transformers releases warn that `torch_dtype` is deprecated
# in favour of `dtype`; switch once the minimum supported version allows it.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

# LoRA weights saved by the training section
lora_path = "/kaggle/working/qwen_lora_cricket"
lora_model_path = lora_path  # alias kept for code that used the older name
model = PeftModel.from_pretrained(base_model, lora_path, is_trainable=False)
model_with_lora = model  # alias kept for code that used the older name
model.eval()


`torch_dtype` is deprecated! Use `dtype` instead!


In [None]:
## Inference on training data since we test for memory/Knowledge injection
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import pandas as pd
import json
from tqdm import tqdm

# === Paths ===
train_jsonl = "/kaggle/working/qa_tb_train.jsonl"
lora_weights_dir = "/kaggle/working/qwen_lora_cricket"
output_csv = "/kaggle/working/train_eval_results.csv"

# === Load training data ===
# One JSON object (prompt/completion pair) per line.
with open(train_jsonl, "r") as f:
    train_data = [json.loads(line) for line in f]

print(f"Loaded {len(train_data)} training QA pairs.")

# === Load tokenizer and base model ===
base_model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # adjust if different
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # ensure padding token

# Load the float16 base model, then attach the saved LoRA adapter to it.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, lora_weights_dir)
model.eval()  # evaluation mode

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# === Run inference on training set ===
results = []

for qa in tqdm(train_data):
    prompt = qa["prompt"]
    golden = qa["completion"]

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate output. Decoding is forced to greedy because the result is scored
    # by exact match: the checkpoint's default generation settings may enable
    # sampling, which would make the score vary between runs. The pad id is
    # passed explicitly so generate() does not have to guess one.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Keep only the tokens produced after the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    results.append({
        "prompt": prompt,
        "response": decoded,
        "golden": golden
    })

# === Score and save results to CSV ===
# Columns are named explicitly so an empty result list still yields a valid frame.
df = pd.DataFrame(results, columns=["prompt", "response", "golden"])

# Exact match after trimming whitespace. This is computed before writing the
# file so the per-row verdict is stored in the CSV as well.
df['correct'] = df['response'].str.strip() == df['golden'].str.strip()
df.to_csv(output_csv, index=False)

print(f"Saved train evaluation results to {output_csv}")
print(df.head())

# The mean of an empty column is NaN; report 0 in that case.
accuracy = df['correct'].mean() if len(df) else 0.0
print(f"Exact match accuracy on training set: {accuracy*100:.2f}%")


Loaded 2172 training QA pairs.


100%|██████████| 2172/2172 [16:06<00:00,  2.25it/s]

Saved train evaluation results to /kaggle/working/train_eval_results.csv
                                                            prompt  \
0      Number of times L Hutton (ENG) scored zero in Test cricket?   
1                     Number of Tests G Boycott (ENG) appeared in?   
2                    What was BB McCullum (NZ)'s Test career span?   
3                     Total runs scored by CH Lloyd (WI) in Tests?   
4  How many times was Javed Miandad (PAK) not out in Test cricket?   

     response     golden  
0        102*          5  
1        149*        108  
2   2004-2013  2004-2016  
3       14897       7515  
4        243*         21  



