In [53]:
!pip install -q transformers torch datasets accelerate peft bitsandbytes

In [54]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [55]:
# Load and clean dataset
file_path = "/kaggle/input/gloss-textpairs/words and code - Sheet2.csv"
data = pd.read_csv(file_path)
data.rename(columns={'ZZZ': 'Text'}, inplace=True)
data_tmp = data[['Text', 'Gloss']].dropna()

In [56]:
# Convert columns to string format
data_tmp["Text"] = data_tmp["Text"].astype(str)
data_tmp["Gloss"] = data_tmp["Gloss"].astype(str)

In [57]:
# Load Phi-2 model with 4-bit LoRA
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_4bit=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Define prompt template for instruction fine-tuning
prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert the following English sentence into ASL gloss.

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    texts = []
    for text, gloss in zip(examples["Text"], examples["Gloss"]):
        formatted_text = prompt_template.format(text, gloss) + EOS_TOKEN
        texts.append(formatted_text)
    return {"text": texts}

# Convert to Hugging Face Dataset and format text
dataset = Dataset.from_pandas(data_tmp)
dataset = dataset.map(formatting_prompts_func, batched=True)

# Tokenization function with labels
def tokenize_function(examples):
    model_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = model_inputs["input_ids"].copy()  # Ensure labels exist
    return model_inputs

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Convert dataset to Pandas for splitting
df = tokenized_dataset.to_pandas()

df = df.loc[:, ~df.columns.duplicated()]  # Remove duplicate columns

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/5043 [00:00<?, ? examples/s]

Map:   0%|          | 0/5043 [00:00<?, ? examples/s]

In [58]:
# Train-test split (80% train, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert back to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Create dataset dictionary
dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})
print(f"Train size: {len(dataset_dict['train'])}, Validation size: {len(dataset_dict['validation'])}")

Train size: 4034, Validation size: 1009


In [59]:
# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    use_dora=True
)

# Prepare model for training with 4-bit LoRA
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 9,748,480 || all params: 2,789,432,320 || trainable%: 0.3495


In [60]:
# Training arguments with reduced time
training_args = TrainingArguments(
    output_dir="./phi2_lora_finetune",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Reduce training time
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",  # Changed to 'steps' to match evaluation_strategy
    save_steps=50,        # Save every 50 steps
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=5e-5,  # Slightly lower for stability
    weight_decay=0.01,
    num_train_epochs=1,  # Reduce epochs to speed up training
    save_total_limit=1,  # Save only the best checkpoint
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    bf16=True,  # More stable for 4-bit LoRA training
    report_to="none"
)



In [61]:
# Trainer setup with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Reduce early stopping patience
)

  trainer = Trainer(


In [62]:
#Train the model
# trainer.train()

In [65]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define the saved model path
save_path = "/kaggle/working/phi2_lora_finetune/checkpoint-250"


# Load the model in half-precision (float16) and send it to GPU if available
model = AutoModelForCausalLM.from_pretrained(
    save_path,
    torch_dtype=torch.float16,  # Reduce memory usage
    device_map="auto"  # Automatically place on GPU if available
)
tokenizer = AutoTokenizer.from_pretrained(save_path)

print("✅ Model(dora) loaded successfully with optimized memory usage!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model(dora) loaded successfully with optimized memory usage!


**Let's evaluate it**


In [66]:
import torch

# Ensure tokenizer is set up correctly
tokenizer.padding_side = "left"  # Fix padding warning
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set

def generate_gloss_batch(text_list, max_new_tokens=50, batch_size=4):
    gloss_outputs = []

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)  # Move model to GPU if available

    for i in range(0, len(text_list), batch_size):  # Process in mini-batches
        batch_texts = text_list[i:i+batch_size]

        # Format prompts
        prompts = [f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

        ### Instruction:
        Convert the following English sentence into ASL gloss.

        ### Input:
        {text}

        ### Response:
""" for text in batch_texts]

        # Tokenization with explicit padding and attention mask
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=256)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        # Generate output with max_new_tokens instead of max_length
        with torch.inference_mode():
            output_ids = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens)

        # Decode outputs
        generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        # Extract gloss from each response
        glosses = [gen.split("### Response:")[-1].strip() for gen in generated_texts]
        gloss_outputs.extend(glosses)

        # Free memory
        del inputs, input_ids, attention_mask, output_ids
        torch.cuda.empty_cache()  # Clear GPU memory

    return gloss_outputs

In [67]:

# Example test
text_samples = [  "The magician performed amazing tricks",
    "The dentist checked my teeth carefully",
    "He left school",
    "The dog barked loudly",
    "She took pictures today"

]


gloss_outputs = generate_gloss_batch(text_samples)
for i, gloss in enumerate(gloss_outputs):
    print(f"Original: {text_samples[i]}")
    print(f"Generated ASL Gloss: {gloss}\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original: The magician performed amazing tricks
Generated ASL Gloss: MAGICIAN TRICKS AMAZING PERFORM

Original: The dentist checked my teeth carefully
Generated ASL Gloss: TEETH DENTIST CHECK CAREFULLY

Original: He left school
Generated ASL Gloss: HIM SCHOOL LEFT

Original: The dog barked loudly
Generated ASL Gloss: DOG BARK LOUDLY

Original: She took pictures today
Generated ASL Gloss: HER TODAY PHOTO TAKEN

