In [None]:
# Environment setup (shell escapes). No versions are pinned, so each run installs
# whatever is current at that moment.
# Install the released Unsloth package first.
# NOTE(review): presumably this pulls in Unsloth's dependency stack before the
# git reinstall below — confirm that this first install is still needed.
!pip install unsloth
# Hugging Face `evaluate`; imported later for the accuracy and BLEU metrics.
!pip install evaluate
# Swap the released Unsloth build for the current GitHub main branch.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Hub client used by the login / upload cells at the end of the notebook.
!pip install huggingface_hub

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# PyTorch and Transformers
import torch
from transformers import AutoTokenizer, TrainingArguments

# Dataset handling
from datasets import Dataset

# Parameter-efficient fine-tuning (LoRA)
from peft import get_peft_model

# Warnings
import warnings
warnings.filterwarnings("ignore")

# Visualization
%matplotlib inline

# Google Colab utilities
from google.colab import drive

# Evaluation metrics
from evaluate import load

# W&B for logging
import wandb
from trl import SFTTrainer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

# Step 1: Mount Google Drive and Define Data Path
# Both data_dir and output_dir below live under this mount point.
drive.mount('/content/drive')

# Define parameters
# Every cell below reads these module-level names; change them here only.
model_name = "unsloth/Llama-3.2-1B-bnb-4bit"  # base checkpoint loaded through FastLanguageModel
# Passed to the model loader and SFTTrainer as the sequence limit, and also reused
# as the chunk size (in characters) by the sequence-splitting cell.
max_seq_length = 4096
learning_rate = 3e-4
num_epochs = 25
batch_size = 16  # forwarded as per_device_train_batch_size
# The next two values are read as globals inside train_model (not passed as arguments).
gradient_accumulation_steps = 8
warmup_steps = 10
# Training outputs are written below output_dir; the dataset JSON is read from data_dir.
output_dir = "/content/drive/My Drive/School/UMBC/DATA690LLM/Results/Final"
data_dir = "/content/drive/My Drive/School/UMBC/DATA690LLM/spec_code_file/"  # trailing slash is relied on when the JSON path is built
# Also read as a global inside train_model.
lr_scheduler_type = "cosine"  # Choose between "cosine" or "linear"

In [None]:
# Read the specification/code pairs from Drive into a DataFrame
json_path = data_dir + "data_pairs_gptgen.json"
data = pd.read_json(json_path)

# Lift the column-width cap so long cells print in full, then preview ten rows
pd.options.display.max_colwidth = None
print(data.head(n=10))

In [None]:
# Character length of every `Context` (input) entry, shown as a histogram
data['Context_Length'] = data['Context'].map(len)

fig, ax = plt.subplots(figsize=(10, 3))
sns.histplot(data=data, x='Context_Length', bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Context Lengths')
ax.set_xlabel('Length of Context')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Character length of every `Response` (output) entry, shown as a histogram
data['Response_Length'] = data['Response'].map(len)

fig, ax = plt.subplots(figsize=(10, 3))
sns.histplot(data=data, x='Response_Length', bins=50, kde=True, color='teal', ax=ax)
ax.set_title('Distribution of Response Lengths')
ax.set_xlabel('Length of Response')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Define a function to split long sequences
def split_long_sequences(sequence, max_length):
    """
    Split a sequence into consecutive chunks of at most ``max_length`` items.

    Args:
        sequence: Any sliceable object with a length (for example a ``str`` or
            a ``list``). Chunks are produced by slicing, so they have the same
            type as ``sequence``.
        max_length: Maximum number of items per chunk; must be positive.

    Returns:
        list: The chunks in order. Every chunk except possibly the last has
        exactly ``max_length`` items. An empty ``sequence`` yields ``[]``.

    Raises:
        ValueError: If ``max_length`` is zero or negative. A negative step would
            otherwise make ``range`` empty and silently return no chunks at all.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    return [sequence[start:start + max_length] for start in range(0, len(sequence), max_length)]

# Chunk both text columns; max_seq_length is used here as a character count
data['Context_Chunks'] = data['Context'].map(lambda text: split_long_sequences(text, max_seq_length))
data['Response_Chunks'] = data['Response'].map(lambda text: split_long_sequences(text, max_seq_length))

# Flatten to one record per (context chunk, response chunk) pair.
# NOTE(review): chunks are paired by position and zip stops at the shorter list,
# so surplus chunks on either side are dropped — confirm this is intended.
split_data = [
    {'Spec_Text': spec_part, 'Generated_Code': code_part}
    for spec_parts, code_parts in zip(data['Context_Chunks'], data['Response_Chunks'])
    for spec_part, code_part in zip(spec_parts, code_parts)
]

# Tabulate the records
split_data_df = pd.DataFrame(split_data)

# Wrap the table as a Hugging Face dataset
huggingface_dataset = Dataset.from_pandas(split_data_df)

In [None]:
# Define prompt template
# The first placeholder receives the specification text. The second receives the
# target SAS code when building training examples and is left empty at inference
# time so the model continues after the "### SAS Code:" header.
data_prompt = """As a healthcare data analyst using SAS, generate SAS code based on the provided specification text. Use the same structure and logic as in the training examples, ensuring the code is ready for execution.

### Specification:
{}

### SAS Code:
{}"""

# End-of-sequence marker appended to every training example so the model learns
# where a completion stops. Llama 3.x spells it "<|end_of_text|>"; the previous
# value "<|endoftext|>" is not a special token in that vocabulary and would be
# tokenized as ordinary text.
EOS_TOKEN = "<|end_of_text|>"  # Llama 3.x EOS token

# Render each (specification, code) pair through the prompt template
def formatting_prompt(examples):
    """Build the ``text`` column for a batch of examples.

    Args:
        examples: Batch mapping with parallel ``"Spec_Text"`` and
            ``"Generated_Code"`` sequences.

    Returns:
        dict: ``{"text": [...]}`` with one entry per pair, each being the
        module-level ``data_prompt`` filled with the pair and followed by
        ``EOS_TOKEN``.
    """
    rendered = [
        data_prompt.format(spec, code) + EOS_TOKEN
        for spec, code in zip(examples["Spec_Text"], examples["Generated_Code"])
    ]
    return {"text": rendered}

# Apply the formatter batch-wise to produce the training dataset
training_data = huggingface_dataset.map(formatting_prompt, batched=True)

In [None]:
# Authenticate with Weights & Biases
wandb.login()

# Open the tracking run and record the notebook's hyperparameters with it
wandb.init(
    name="final-fine-tune-run",
    project="final-fine-tuned-llm-sas",
    entity="cdrohan85-umbc",
    config=dict(
        model_name=model_name,
        max_seq_length=max_seq_length,
        learning_rate=learning_rate,
        num_epochs=num_epochs,
        batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        output_dir=output_dir,
    ),
)

In [None]:
# Define evaluation metrics
accuracy_metric = load("accuracy")
bleu_metric = load("bleu")

def compute_metrics(eval_preds):
    """Compute next-token accuracy and BLEU for a causal language model.

    Reads the module-level ``tokenizer``, ``accuracy_metric`` and ``bleu_metric``;
    a ``tokenizer`` must therefore exist at module level before this is called.

    Args:
        eval_preds: Pair ``(logits, labels)``. Assumes ``logits`` is
            ``(batch, seq_len, vocab)`` and ``labels`` is ``(batch, seq_len)``
            with ``-100`` marking ignored positions — TODO confirm against the
            trainer that supplies them.

    Returns:
        dict: ``{"accuracy": float, "bleu": float}``. Both are ``0.0`` when no
        position (accuracy) or no non-empty reference (BLEU) is available.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # In a causal LM the logits at position i score the token at position i + 1,
    # so align predictions[:, :-1] with labels[:, 1:] before comparing.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]
    valid_mask = labels != -100

    if not valid_mask.any():
        return {"accuracy": 0.0, "bleu": 0.0}

    # Token-level accuracy over all unmasked positions.
    accuracy = accuracy_metric.compute(
        predictions=predictions[valid_mask],
        references=labels[valid_mask],
    )

    # Decode one string per sequence. Masking the whole batch at once would
    # flatten it to 1-D and make every token its own "sentence" for BLEU.
    decoded_predictions = []
    decoded_labels = []
    for pred_row, label_row, mask_row in zip(predictions, labels, valid_mask):
        reference_text = tokenizer.decode(label_row[mask_row], skip_special_tokens=True)
        if not reference_text.strip():
            # Skip empty references so BLEU does not divide by a zero reference length.
            continue
        decoded_predictions.append(tokenizer.decode(pred_row[mask_row], skip_special_tokens=True))
        decoded_labels.append(reference_text)

    if decoded_labels:
        bleu_score = bleu_metric.compute(
            predictions=decoded_predictions,
            references=[[ref] for ref in decoded_labels],
        )["bleu"]
    else:
        bleu_score = 0.0

    return {"accuracy": accuracy["accuracy"], "bleu": bleu_score}

In [None]:
import os
from transformers import AutoConfig

def save_model_and_tokenizer(trained_model, tokenizer, output_dir):
    """Write the trained model, its tokenizer and a base-model config to disk.

    All files go into ``<output_dir>/checkpoint-25`` (created if missing), the
    same folder the Hub upload cells later read from.

    Args:
        trained_model: Object exposing ``save_pretrained``. When it is a PEFT
            wrapper (as built in ``train_model``), ``save_pretrained`` writes the
            adapter weights and ``adapter_config.json`` rather than a full copy
            of the base weights.
        tokenizer: Tokenizer exposing ``save_pretrained``.
        output_dir: Parent directory for the checkpoint folder.
    """
    # Target directory for every file written below.
    checkpoint_dir = os.path.join(output_dir, "checkpoint-25")
    os.makedirs(checkpoint_dir, exist_ok=True)

    # config.json of the *base* model, looked up via the module-level `model_name`.
    # NOTE(review): the folder then holds both config.json and adapter_config.json;
    # some loaders may reject that combination — confirm before loading it directly.
    config = AutoConfig.from_pretrained(model_name)
    config.save_pretrained(checkpoint_dir)

    # One save in safetensors format. `safe_serialization` is the keyword
    # `save_pretrained` understands; the former second call passed
    # `save_safetensors`, which is not a parameter of it and only repeated the write.
    trained_model.save_pretrained(checkpoint_dir, safe_serialization=True)

    # Tokenizer files (tokenizer.json, tokenizer_config.json, etc.)
    tokenizer.save_pretrained(checkpoint_dir)

    print(f"Model, tokenizer, and config files saved to {checkpoint_dir}")


In [None]:
def train_model(training_data, model_name, max_seq_length, output_dir, learning_rate, num_epochs, batch_size):
    """Fine-tune a 4-bit base model with LoRA adapters, save it, and return it.

    Loads ``model_name`` through Unsloth, attaches rank-16 LoRA adapters to the
    attention and MLP projections, trains with ``SFTTrainer`` on the ``text``
    column of ``training_data`` (packing enabled), then writes the result with
    ``save_model_and_tokenizer``.

    Besides its arguments, this function reads the module-level
    ``lr_scheduler_type``, ``gradient_accumulation_steps`` and ``warmup_steps``.

    Args:
        training_data: Dataset with a ``text`` column of formatted prompts.
        model_name: Checkpoint identifier passed to ``FastLanguageModel``.
        max_seq_length: Sequence limit for both the model loader and the trainer.
        output_dir: Trainer output directory; also the parent of the saved checkpoint.
        learning_rate: Optimizer learning rate.
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        tuple: ``(model, tokenizer)`` after training, so callers can run
        inference on the fine-tuned weights without reloading from disk.
        (Previously nothing was returned and the trained model was lost.)
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        lora_alpha=16,
        lora_dropout=0,
        target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
        use_rslora=True,
        use_gradient_checkpointing="unsloth",
        random_state=32,
        loftq_config=None,
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=training_data,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=True,
        args=TrainingArguments(
            learning_rate=learning_rate,
            lr_scheduler_type=lr_scheduler_type,  # module-level setting
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,  # module-level setting
            num_train_epochs=num_epochs,
            # Exactly one mixed-precision mode is enabled: bf16 when supported, else fp16.
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            warmup_steps=warmup_steps,  # module-level setting
            output_dir=output_dir,
            seed=0,
        ),
        # NOTE(review): no eval_dataset is supplied, so this hook is not expected
        # to run during training; it also relies on a module-level `tokenizer`.
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Persist the trained model and tokenizer under output_dir.
    save_model_and_tokenizer(model, tokenizer, output_dir)

    return model, tokenizer

In [None]:
# Launch fine-tuning with the hyperparameters from the configuration cell.
# The trailing semicolon keeps the notebook from echoing whatever the call returns.
train_model(
    training_data=training_data,
    output_dir=output_dir,
    model_name=model_name,
    max_seq_length=max_seq_length,
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
);

In [None]:
# Reload the base model and tokenizer, then attach the fine-tuned LoRA adapter
from peft import PeftModel  # needed only by this cell

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Loading `model_name` alone yields the untouched base checkpoint, so generation
# would ignore the fine-tuning. Attach the adapter that save_model_and_tokenizer
# wrote under <output_dir>/checkpoint-25.
adapter_dir = os.path.join(output_dir, "checkpoint-25")
model = PeftModel.from_pretrained(model, adapter_dir)

# Switch to inference mode. The call configures the model in place, so its
# return value is not needed.
FastLanguageModel.for_inference(model)

# Define the new specification text
text = """
As a healthcare data analyst, please generate SAS code to identify persons with a condition and calculate estimates on use and expenditures for persons with the condition mental disorders (CCS Code = 650-670) in the year 2015. Use the same raw data inputs as in the original diabetes analysis and follow similar steps for identifying individuals, flagging, and calculating estimates.
"""

# Fill the training prompt template, leaving the code slot empty for the model
formatted_prompt = data_prompt.format(text, "")

# Tokenize the prompt and move the tensors to the GPU (a CUDA device is required)
inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

# Generate the completion
outputs = model.generate(
    **inputs,
    max_new_tokens=1500,              # Adjust to control output length
    use_cache=True                    # Optimize for inference
)

# Decode, then drop the echoed prompt. The template ends with "### SAS Code:"
# (there is no "### Response:" header), so keep what follows its first occurrence.
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
answer = answer.split("### SAS Code:", 1)[-1].strip()

# Print the generated SAS code
print("Generated SAS Code:\n")
print(answer)


In [None]:
# Second inference example; reuses `model` and `tokenizer` from the previous cell.
text="As a healthcare data analyst, please read in the permanent sas dataset CDATA.H190 to generate error-free SAS code that calculates and displays the use and expenditures for persons diagnosed with diabetes in 2016. Include all necessary data preparation steps, such as filtering for diabetes-related records, summarizing expenditures, and displaying the final results."

# Configures the model for inference in place; the return value is not needed.
FastLanguageModel.for_inference(model)

# Fill the prompt template with the specification and an empty code slot
inputs = tokenizer(
    [data_prompt.format(text, "")],
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 5020, use_cache = True)

# Decode without special tokens, then keep only the text after the prompt's
# "### SAS Code:" header (the template has no "### Response:" header).
decoded = tokenizer.batch_decode(outputs, skip_special_tokens = True)
answer = decoded[0].split("### SAS Code:", 1)[-1].strip()
print("Answer of the question is:", answer)



In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login. The cells below create a repository and upload
# files, so this has to run first.
# NOTE(review): presumably a token with write access is required — confirm.
notebook_login()

In [None]:
# HfApi is used by later cells; HfFolder and Repository are not referenced in this cell.
from huggingface_hub import HfApi, HfFolder, Repository, create_repo, upload_folder

# Hub account, repository name, and the local checkpoint folder to publish
username = "cwndrohan"
repo_name = "final-fine-tuned-sas-model"  # Change if needed
model_dir = "/content/drive/My Drive/School/UMBC/DATA690LLM/Results/Final/checkpoint-25"

# Fully qualified repository id: <username>/<repo_name>
repo_id = username + "/" + repo_name

# Create the repository, tolerating one that already exists
create_repo(repo_id, exist_ok=True)

# Push every file in the checkpoint folder except lock files
upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path=model_dir,
    ignore_patterns=["*.lock"],
)


In [None]:
# Publish the model card

# Target model repository on the Hub
repo_id = "cwndrohan/final-fine-tuned-sas-model"

# Hub client (HfApi is imported in an earlier cell)
api = HfApi()

# Push the README.md stored on Drive as the repository's README.md
api.upload_file(
    repo_id=repo_id,
    path_in_repo="README.md",
    path_or_fileobj="/content/drive/My Drive/School/UMBC/DATA690LLM/README.md",
    commit_message="Adding model card",
)

print(f"Model card added to: https://huggingface.co/{repo_id}")

In [None]:
# Create the repository (safe to re-run)

from huggingface_hub import HfApi

# Define repository details
repo_name = "final-fine-tuned-sas-model"
username = "cwndrohan"
repo_id = f"{username}/{repo_name}"

# Create the repository. exist_ok=True makes the call a no-op when the repository
# is already there — an earlier cell creates the same repo_id, so without it this
# call fails on a top-to-bottom run.
api = HfApi()
api.create_repo(repo_id=repo_id, private=False, exist_ok=True)

print(f"Repository created: https://huggingface.co/{repo_id}")

In [None]:
# Upload the checkpoint folder to the Hugging Face Hub

# Hub account and repository name
username = "cwndrohan"                    # Hugging Face username
repo_name = "final-fine-tuned-sas-model"  # Replace with your preferred model name
repo_id = username + "/" + repo_name

# Local folder holding the saved model, tokenizer and config files
local_path = "/content/drive/My Drive/School/UMBC/DATA690LLM/Results/Final/checkpoint-25"

# Hub client (HfApi is imported in an earlier cell)
api = HfApi()

# Commit the folder contents to the repository
api.upload_folder(
    repo_id=repo_id,
    folder_path=local_path,
    commit_message="Uploading fine-tuned SAS code generator model",
)

print(f"Model uploaded to: https://huggingface.co/{repo_id}")