# **Dependencies**

In [None]:
!pip install transformers datasets peft accelerate bitsandbytes torch streamlit huggingface_hub scikit-learn pandas sentencepiece trl nlpaug nltk


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting trl
  Downloading trl-0.26.1-py3-none-any.whl.metadata (11 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m110.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.26.1-py3-none-any.whl (517 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.4/517.4 kB[0m [31m38.3 MB/s[0m eta [

# **Imports**

In [None]:
import os
import torch
import pandas as pd
import xml.etree.ElementTree as ET
import random
import streamlit as st

# Hugging Face and PEFT imports
from datasets import Dataset as HFDataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from trl import DPOTrainer, DPOConfig

# Data Augmentation imports
import nlpaug.augmenter.word as naw
import nltk
from sklearn.model_selection import train_test_split





# **Milestone 1**

In [None]:
# ============================================
# --------- 1. Configuration ------------------
# ============================================

MODEL_NAME = "google/flan-t5-base"
HF_DATASET = "starvector/text2svg-stack"

SFT_ADAPTERS_PATH = "./sft_final_model"
DPO_ADAPTERS_PATH = "./final_aligned_model"


# ============================================
# --------- 2. Utility Functions ---------------
# ============================================

def download_nltk_data():
    """Make sure the NLTK resources used for augmentation are available.

    Each resource path is looked up with ``nltk.data.find``; when the lookup
    raises ``LookupError`` the package named by the last path component is
    downloaded.
    """
    for resource_path in (
        "corpora/wordnet",
        "corpora/omw-1.4",
        "taggers/averaged_perceptron_tagger",
    ):
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # The download id is the final path segment, e.g. "wordnet".
            package_name = resource_path.split("/")[-1]
            nltk.download(package_name)


def process_svg_data(row, max_elements_for_primitive=10):
    """Validate and standardize the SVG stored in ``row["response"]``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``"response"``
            entry holding the SVG markup.
        max_elements_for_primitive: Largest number of direct children of the
            root element for which the SVG is still flagged as "primitive".

    Returns:
        ``(standardized_svg, is_primitive)`` where ``standardized_svg`` is the
        re-serialized markup and ``is_primitive`` is a bool, or
        ``(None, None)`` when the markup is not well-formed XML.
    """
    svg_code = row["response"]

    # ElementTree invents "ns0:"-style prefixes for namespaces it does not
    # know, turning "<svg xmlns=...>" into "<ns0:svg xmlns:ns0=...>". Register
    # the SVG namespace as the default one (and xlink under its usual prefix)
    # so the serialized output still starts with "<svg".
    ET.register_namespace("", "http://www.w3.org/2000/svg")
    ET.register_namespace("xlink", "http://www.w3.org/1999/xlink")

    try:
        root = ET.fromstring(svg_code)
    except ET.ParseError:
        return None, None

    # Give every stroked element an explicit stroke width.
    for elem in root.iter():
        if "stroke" in elem.attrib and "stroke-width" not in elem.attrib:
            elem.attrib["stroke-width"] = "1"

    standardized_svg = ET.tostring(root, encoding="unicode")

    # Only direct children of the root are counted, not nested descendants.
    num_elements = len(list(root))
    is_primitive = num_elements <= max_elements_for_primitive

    return standardized_svg, is_primitive


def augment_text(text, aug):
    """Return an augmented variant of ``text`` produced by ``aug``.

    ``aug.augment(text)`` may return either a list or a single value; for a
    list the first entry is used. If anything raises along the way, the
    original ``text`` is returned unchanged.
    """
    try:
        result = aug.augment(text)
        if isinstance(result, list):
            return result[0]
        return result
    except Exception:
        return text


# ============================================
# --------- 3. Data Loading --------------------
# ============================================

def load_and_preprocess_data(sample_size=10000, augment_fraction=0.1):
    """Load the text-to-SVG dataset, clean it, augment it and tokenize it.

    Steps, in order: load ``HF_DATASET``, optionally down-sample, build the
    instruction/response columns, drop SVGs that fail to parse, split into
    train/eval, add synonym-augmented copies of a fraction of the *training*
    rows, and tokenize both splits.

    Args:
        sample_size: Upper bound on the number of raw rows kept.
        augment_fraction: Fraction of the training rows that receive an
            additional synonym-augmented copy.

    Returns:
        ``(tokenized_train, tokenized_eval, tokenizer, train_df)`` where
        ``train_df`` is the (augmented) training DataFrame with the raw
        ``instruction`` and ``response`` text.
    """
    prompt_prefix = "Generate an SVG image for this description: "

    print(f"\nLoading dataset: {HF_DATASET}")
    raw_ds = load_dataset(HF_DATASET)
    df = pd.DataFrame(raw_ds["train"])

    print("Original size:", len(df))
    print("Original columns:", df.columns)

    if len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
        print("Sampled size:", len(df))

    # Renaming 'Svg' column to 'response' as per dataset structure
    df = df.rename(columns={"Svg": "response"})
    df["response"] = df["response"].astype(str).str.replace("\n", " ").str.replace("\t", " ")
    # .copy() so the column assignments below write to an independent frame
    # rather than to a boolean-filtered slice of the previous one.
    df = df[df["response"].str.len() < 2000].copy()

    # Use 'caption_blip2' as the prompt source
    df["instruction"] = prompt_prefix + df["caption_blip2"]
    df = df[["instruction", "response"]].dropna()

    # -------- Advanced Cleaning --------
    download_nltk_data()
    aug = naw.SynonymAug(aug_src="wordnet")

    processed = df.apply(lambda x: process_svg_data(x), axis=1, result_type="expand")
    df["response"] = processed[0]
    df["is_primitive"] = processed[1]

    before = len(df)
    df = df.dropna(subset=["response"])
    print("Invalid SVGs removed:", before - len(df))

    # -------- Train / Eval Split --------
    # Split BEFORE augmenting: an augmented row shares its target SVG with the
    # row it was derived from, so augmenting first would let paraphrases of
    # evaluation rows end up in the training set.
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

    # -------- Text Augmentation (training split only) --------
    aug_df = train_df.sample(frac=augment_fraction, random_state=42).copy()
    aug_df["instruction"] = aug_df["instruction"].apply(
        lambda x: augment_text(x.replace(prompt_prefix, ""), aug)
    )
    aug_df["instruction"] = prompt_prefix + aug_df["instruction"]

    train_df = pd.concat([train_df, aug_df]).reset_index(drop=True)
    eval_df = eval_df.reset_index(drop=True)
    print("After augmentation:", len(train_df) + len(eval_df))

    train_ds = HFDataset.from_pandas(train_df)
    eval_ds = HFDataset.from_pandas(eval_df)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize_fn(batch):
        """Tokenize a batch of instruction/response pairs for seq2seq training."""
        inputs = tokenizer(
            batch["instruction"],
            padding="max_length",
            truncation=True,
            max_length=128
        )
        labels = tokenizer(
            batch["response"],
            padding="max_length",
            truncation=True,
            max_length=256
        )
        # Replace padding ids in the labels with -100.
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]
        inputs["labels"] = labels["input_ids"]
        return inputs

    print("Tokenizing...")
    tokenized_train = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
    tokenized_eval = eval_ds.map(tokenize_fn, batched=True, remove_columns=eval_ds.column_names)

    return tokenized_train, tokenized_eval, tokenizer, train_df


# ============================================
# --------- 4. SFT Training -------------------
# ============================================

def run_sft_training(tokenized_train, tokenized_eval, tokenizer):
    """Fine-tune ``MODEL_NAME`` with LoRA adapters on a 4-bit quantized base.

    Args:
        tokenized_train: Tokenized training dataset passed to the trainer.
        tokenized_eval: Tokenized evaluation dataset passed to the trainer.
        tokenizer: Tokenizer matching ``MODEL_NAME``.

    Returns:
        ``(model, tokenizer)`` - the PEFT-wrapped model after training and the
        tokenizer that was passed in. The adapters are also saved to
        ``SFT_ADAPTERS_PATH``.
    """
    import inspect

    print("\n===== Starting SFT Training =====")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    training_args = Seq2SeqTrainingArguments(
        output_dir="./sft_output",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=1e-3,
        num_train_epochs=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=1,
        report_to="none",
        fp16=torch.cuda.is_available(),
        remove_unused_columns=False
    )

    # The trainer's ``tokenizer`` keyword was renamed to ``processing_class``.
    # Choose whichever name the installed transformers version accepts so the
    # call neither warns on newer releases nor fails on older ones.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        **{tokenizer_kwarg: tokenizer}
    )

    trainer.train()
    trainer.save_model(SFT_ADAPTERS_PATH)

    print("SFT finished & saved ✔️")
    return model, tokenizer


# ============================================
# --------- 5. DPO Alignment ------------------
# ============================================

def run_dpo_alignment(model, tokenizer, train_df):
    """Run one epoch of DPO on synthetic preference pairs built from ``train_df``.

    For every training row the full SVG is the "chosen" answer; the
    "rejected" answer is the first half of that SVG, or the literal string
    ``"Invalid SVG"`` when the SVG has 20 characters or fewer.

    Args:
        model: Model to align (the model returned by SFT training).
        tokenizer: Tokenizer matching ``model``.
        train_df: DataFrame with ``instruction`` and ``response`` columns
            holding raw text.

    Returns:
        The ``model`` object that was passed in, after training. The result
        is also saved to ``DPO_ADAPTERS_PATH``.
    """
    import inspect

    print("\n===== Starting DPO Alignment =====")

    dpo_dataset = HFDataset.from_pandas(
        train_df[["instruction", "response"]]
    )

    def make_pref(example):
        """Turn one instruction/response row into a prompt/chosen/rejected triple."""
        chosen = example["response"]
        rejected = chosen[: len(chosen)//2] if len(chosen) > 20 else "Invalid SVG"
        return {
            "prompt": example["instruction"],
            "chosen": chosen,
            "rejected": rejected
        }

    dpo_dataset = dpo_dataset.map(make_pref, remove_columns=dpo_dataset.column_names)

    dpo_args = DPOConfig(
        output_dir="./dpo_output",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        learning_rate=5e-5,
        num_train_epochs=1,
        beta=0.1,
        logging_steps=1,
        report_to="none"
    )

    # DPOTrainer's ``tokenizer`` keyword was renamed to ``processing_class``.
    # Choose whichever name the installed trl version accepts.
    trainer_params = inspect.signature(DPOTrainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    dpo_trainer = DPOTrainer(
        model=model,
        ref_model=None,
        args=dpo_args,
        train_dataset=dpo_dataset,
        **{tokenizer_kwarg: tokenizer}
    )

    dpo_trainer.train()
    dpo_trainer.save_model(DPO_ADAPTERS_PATH)

    print("DPO finished & saved ✔️")
    return model


# ============================================
# --------- 6. Main ---------------------------
# ============================================

if __name__ == "__main__":
    # Stage 1: load, clean, augment and tokenize the dataset.
    tokenized_train, tokenized_eval, tokenizer, train_df = load_and_preprocess_data()

    # Stage 2: supervised fine-tuning with LoRA adapters.
    model, tokenizer = run_sft_training(tokenized_train, tokenized_eval, tokenizer)

    # Stage 3: DPO alignment on top of the SFT model, using the raw text rows.
    model = run_dpo_alignment(model, tokenizer, train_df)

    print("\n✅ FULL PIPELINE COMPLETED SUCCESSFULLY")


Loading dataset: starvector/text2svg-stack
Original size: 2169710
Original columns: Index(['Filename', 'Svg', 'caption_blip2', 'caption_cogvlm', 'caption_llava'], dtype='object')
Sampled size: 1000
Invalid SVGs removed: 2


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downlo

After augmentation: 691
Tokenizing...


Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]


===== Starting SFT Training =====
trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


# **Milestone 2**

In [None]:
# --- 3. Milestone 2: Deployment Functions ---

@st.cache_resource
def load_optimized_model():
    """Load the 4-bit quantized base model and attach the best available adapters.

    Adapter preference: ``DPO_ADAPTERS_PATH`` if it exists, otherwise
    ``SFT_ADAPTERS_PATH`` if it exists, otherwise the bare base model. A
    Streamlit status message reports which case applied.

    Returns:
        ``(model, tokenizer)`` with the model switched to eval mode. On any
        exception the error is shown via ``st.error`` and ``st.stop()`` is
        called.
    """
    # NF4 4-bit quantization with double quantization and bfloat16 compute.
    quant_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        backbone = AutoModelForSeq2SeqLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quant_settings,
            device_map="auto",
        )

        # DPO adapters are the final training stage, so they take priority.
        if os.path.exists(DPO_ADAPTERS_PATH):
            model = PeftModel.from_pretrained(backbone, DPO_ADAPTERS_PATH)
            st.success(f"Model loaded and DPO adapters applied from: {DPO_ADAPTERS_PATH}")
        elif os.path.exists(SFT_ADAPTERS_PATH):
            model = PeftModel.from_pretrained(backbone, SFT_ADAPTERS_PATH)
            st.warning(f"DPO adapters not found. Loaded SFT adapters from: {SFT_ADAPTERS_PATH}")
        else:
            model = backbone
            st.error("No fine-tuned adapters found. Loading base model only. Run training first!")

        model.eval()
        return model, tokenizer
    except Exception as load_error:
        st.error(f"Error loading model: {load_error}")
        st.stop()

def generate_response(model, tokenizer, prompt: str,
                      temperature: float,
                      top_k: int,
                      top_p: float,
                      max_length: int):
    """Generate text for ``prompt`` using the text-to-SVG instruction prefix.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: User description of the desired SVG.
        temperature: Sampling temperature. A value of 0 (or below) selects
            greedy decoding instead of sampling.
        top_k: Top-k sampling cutoff (used only when sampling).
        top_p: Nucleus sampling cutoff (used only when sampling).
        max_length: Maximum number of new tokens to generate.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    # The prompt for the Text-to-SVG task
    full_prompt = "Generate an SVG image for this description: " + prompt

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_length,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )
    else:
        # The UI slider allows 0.0, which is not a valid sampling temperature;
        # treat it as a request for deterministic (greedy) decoding.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- 4. Main Streamlit UI ---

def _render_message(content, allow_svg):
    """Render one chat message; SVG markup is shown as an image plus its source."""
    if allow_svg and content.strip().startswith("<svg"):
        st.image(content, use_column_width=True)
        st.code(content, language="xml")
    else:
        st.markdown(content)


def main():
    """Streamlit chat UI for the text-to-SVG model.

    Draws the sidebar with the inference sliders and training notes, loads the
    model, replays the chat history held in ``st.session_state.messages``, and
    answers a newly submitted prompt with a generated response.
    """
    st.set_page_config(page_title="LLM Fine-Tuning Project (M1 & M2)", layout="wide")
    st.title("Fine-Tuned LLM Deployment: Text-to-SVG Generator")
    st.caption("Milestone 2: Model Optimization and Deployment (Flan-T5-Base + LoRA + 4-bit Quantization)")

    # ---- Sidebar: inference parameters and project info ----
    with st.sidebar:
        st.header("Inference Parameters")
        temperature = st.slider(
            "Temperature",
            min_value=0.0, max_value=1.0, value=0.7, step=0.05,
            help="Controls randomness.",
        )
        top_k = st.slider(
            "Top-K",
            min_value=0, max_value=100, value=50, step=5,
            help="Limits the vocabulary size.",
        )
        top_p = st.slider(
            "Top-P (Nucleus Sampling)",
            min_value=0.0, max_value=1.0, value=0.95, step=0.05,
            help="Limits the vocabulary to tokens whose cumulative probability exceeds P.",
        )
        max_length = st.slider(
            "Max New Tokens",
            min_value=10, max_value=512, value=256, step=10,
            help="Maximum number of tokens to generate.",
        )

        st.markdown("---")
        st.markdown(f"**Base Model:** `{MODEL_NAME}`")
        st.markdown(f"**SFT Path:** `{SFT_ADAPTERS_PATH}`")
        st.markdown(f"**DPO Path:** `{DPO_ADAPTERS_PATH}`")
        st.markdown("**Optimization:** 4-bit Quantization (NF4)")

        # Training section (Milestone 1): informational only.
        st.header("Milestone 1: Training")
        st.warning("Training is resource-intensive and should be run separately.")
        st.markdown("To run the training, execute the script from your terminal:")
        st.code("python full_project.py --train")

        if st.button("Run Training (M1)"):
            st.info("Attempting to run training. Check your terminal for output.")
            # Placeholder: the button only reports that training must be
            # started from the terminal; it does not launch anything.
            st.error("Training cannot be safely initiated from Streamlit. Please run 'python full_project.py --train' in your terminal.")

    # ---- Model ----
    model, tokenizer = load_optimized_model()

    # ---- Chat history ----
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay earlier messages on every rerun; only assistant messages may be
    # rendered as SVG.
    for past in st.session_state.messages:
        with st.chat_message(past["role"]):
            _render_message(past["content"], past["role"] == "assistant")

    # ---- New user input ----
    prompt = st.chat_input("Describe the SVG you want to generate...")
    if not prompt:
        return

    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Generating SVG code..."):
            response = generate_response(
                model, tokenizer, prompt, temperature, top_k, top_p, max_length
            )
            _render_message(response, True)

    # Persist the assistant reply so it is replayed on the next rerun.
    st.session_state.messages.append({"role": "assistant", "content": response})

# --- 5. Entry Point ---

if __name__ == "__main__":
    import sys

    # "--train" on the command line runs Milestone 1 (training); otherwise the
    # Streamlit app (Milestone 2) is started.
    if "--train" in sys.argv:
        try:
            # load_and_preprocess_data returns four values; the last one is
            # the raw-text training DataFrame.
            tokenized_train, tokenized_eval, tokenizer, train_df = load_and_preprocess_data()
            model, tokenizer = run_sft_training(tokenized_train, tokenized_eval, tokenizer)
            # DPO builds its preference pairs from the "instruction" and
            # "response" text columns, so it needs the DataFrame rather than
            # the tokenized dataset.
            run_dpo_alignment(model, tokenizer, train_df)
            print("\n✅ Training and Alignment Complete. You can now run the Streamlit app.")
        except Exception as e:
            print(f"\n❌ An error occurred during training: {e}")
            print("Please ensure you have a suitable environment (e.g., GPU) and all dependencies are installed.")
    else:
        # This block runs Milestone 2 (Deployment) via Streamlit
        main()

2025-12-17 00:58:28.259 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

2025-12-17 00:59:11.623 Session state does not function when running a script without `streamlit run`
