In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "trl<0.9.0" xformers

In [2]:
import unsloth
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
max_seq_length = 4096
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2-9b-it-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)

==((====))==  Unsloth 2025.9.6: Fast Gemma2 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Standard rank for writing tasks
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
)

Unsloth 2025.9.6 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [5]:
from datasets import load_dataset
dataset = load_dataset("ccdv/arxiv-summarization", split="train[:2000]")

README.md: 0.00B [00:00, ?B/s]

section/train-00000-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00001-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

section/train-00002-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

section/train-00003-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

section/train-00004-of-00015.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

section/train-00005-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

section/train-00006-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

section/train-00007-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00008-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00009-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

section/train-00010-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

section/train-00011-of-00015.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

section/train-00012-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00013-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

section/train-00014-of-00015.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

section/validation-00000-of-00001.parque(…):   0%|          | 0.00/105M [00:00<?, ?B/s]

section/test-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [7]:
dataset

Dataset({
    features: ['article', 'abstract'],
    num_rows: 2000
})

In [None]:
print(dataset['article'][100])

In [9]:
print(dataset['abstract'][100])

in a recent publication we have presented the stochastic green function ( sgf ) algorithm , which has the properties of being general and easy to apply to any lattice hamiltonian of the form @xmath0 , where @xmath1 is diagonal in the chosen occupation number basis and @xmath2 has only positive matrix elements . 
 we propose here a modified version of the update scheme that keeps the simplicity and generality of the original sgf algorithm , and enhances significantly its efficiency .


In [10]:
def format_tech_summarization(examples):
    """Format examples for technical summarization task"""
    texts = []

    for i in range(len(examples["article"])):
        article = examples["article"][i]
        abstract = examples["abstract"][i]

        # Create technical summarization prompt
        prompt = f"""Summarize the following technical document into a clear, concise abstract that captures the key findings, methodology, and contributions.

Technical Document:
{article[:2000]}  # Limit length for training efficiency

Technical Summary:"""

        # Gemma-2 chat template format
        messages = [
            {"role": "user", "content": prompt},
            {"role": "model", "content": abstract}
        ]

        # Apply Gemma chat template
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )

        texts.append(text)

    return {"text": texts}

In [11]:
formatted_dataset = dataset.map(
    format_tech_summarization,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Formatting for technical summarization"
)

Formatting for technical summarization:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [12]:
formatted_dataset

Dataset({
    features: ['text'],
    num_rows: 2000
})

In [13]:
print(formatted_dataset['text'][0])

<bos><start_of_turn>user
Summarize the following technical document into a clear, concise abstract that captures the key findings, methodology, and contributions.

Technical Document:
additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . 
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . 
 many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years 
 many interesting results on learning rates of regularized kernel based models for additive models have been published when the f

In [14]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    per_device_train_batch_size=1,  # Smaller batch for 9B model
    gradient_accumulation_steps=8,  # Effective batch size: 8
    warmup_steps=10,
    max_steps=100,
    learning_rate=1e-4,  # Lower LR for stable text generation
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=5,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="gemma_tech_writer",
    save_strategy="no",
)

In [15]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    args=training_args,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [16]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 54,018,048 of 9,295,724,032 (0.58% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabeshith[0m ([33mabeshith-dr-m-g-r-educational-and-research-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
5,2.9403
10,2.6662
15,2.2281
20,2.1647
25,2.0535
30,2.0077
35,2.1013
40,2.0336
45,1.9548
50,1.9834


TrainOutput(global_step=100, training_loss=2.0763800144195557, metrics={'train_runtime': 3117.5143, 'train_samples_per_second': 0.257, 'train_steps_per_second': 0.032, 'total_flos': 3.321550358618419e+16, 'train_loss': 2.0763800144195557, 'epoch': 0.4})

In [17]:
model.save_pretrained("gemma_tech_documenter")
tokenizer.save_pretrained("gemma_tech_documenter")

('gemma_tech_documenter/tokenizer_config.json',
 'gemma_tech_documenter/special_tokens_map.json',
 'gemma_tech_documenter/chat_template.jinja',
 'gemma_tech_documenter/tokenizer.model',
 'gemma_tech_documenter/added_tokens.json',
 'gemma_tech_documenter/tokenizer.json')

In [None]:
FastLanguageModel.for_inference(model)

In [19]:
def generate_tech_summary(document, task_type="summarize", max_tokens=400):
    """Generate technical documentation or summary"""

    if task_type == "summarize":
        prompt = f"""Summarize the following technical document into a clear, concise abstract that captures the key findings, methodology, and contributions.

Technical Document:
{document}

Technical Summary:"""

    elif task_type == "document":
        prompt = f"""Write comprehensive technical documentation for the following:

Topic: {document}

Technical Documentation:"""

    elif task_type == "api_doc":
        prompt = f"""Generate API documentation for the following code or functionality:

Code/Functionality:
{document}

API Documentation:"""

    messages = [
        {"role": "user", "content": prompt}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer([text], return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.3,  # Lower temp for technical accuracy
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Extract generated tokens and decode
    generated_tokens = outputs[0][len(inputs['input_ids'][0]):]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return response.strip()

In [20]:
research_paper = """
Machine learning models have shown remarkable success in natural language processing tasks.
However, these models often suffer from catastrophic forgetting when learning new tasks sequentially.
In this paper, we propose a novel continual learning framework called Elastic Weight Consolidation (EWC)
that addresses this challenge. EWC works by identifying important weights for previous tasks and constraining
their modification when learning new tasks. We evaluate our approach on several benchmark datasets including
MNIST, CIFAR-10, and ImageNet. Our experiments demonstrate that EWC significantly reduces catastrophic forgetting
while maintaining competitive performance on new tasks. The proposed method achieves an average accuracy
improvement of 15% over baseline approaches across all tested scenarios. Our findings suggest that selective
weight preservation is crucial for building robust continual learning systems. The implications of this work
extend to real-world applications where models need to adapt to new data without losing previously acquired knowledge.
"""

print("🔬 RESEARCH PAPER SUMMARIZATION:")
summary1 = generate_tech_summary(research_paper, "summarize", 250)
print(f"Original Length: {len(research_paper)} characters")
print(f"Summary: {summary1}")
print("-" * 60)

🔬 RESEARCH PAPER SUMMARIZATION:
Original Length: 1084 characters
Summary: we propose elastic weight consolidation ( ewc ) , a novel continual learning framework that addresses catastrophic forgetting . 
 ewc identifies important weights for previous tasks and constrains their modification when learning new tasks . 
 we evaluate our approach on several benchmark datasets including mnist , cifar - 10 , and imagenet . 
 our experiments demonstrate that ewc significantly reduces catastrophic forgetting while maintaining competitive performance on new tasks . 
 the proposed method achieves an average accuracy improvement of 15% over baseline approaches across all tested scenarios . 
 our findings suggest that selective weight preservation is crucial for building robust continual learning systems . 
 the implications of this work extend to real - world applications where models need to adapt to new data without losing previously acquired knowledge .
------------------------------------------

In [21]:
api_code = """
def authenticate_user(username, password, api_key=None):
    '''
    Authenticates a user with username/password or API key
    Returns authentication token if successful
    '''
    if api_key:
        return validate_api_key(api_key)
    else:
        return validate_credentials(username, password)

def get_user_profile(user_id, include_private=False):
    '''
    Retrieves user profile information
    Can include private data if authorized
    '''
    profile = fetch_profile(user_id)
    if include_private and is_authorized(user_id):
        profile.update(get_private_data(user_id))
    return profile
"""

print("🔧 API DOCUMENTATION GENERATION:")
api_doc = generate_tech_summary(api_code, "api_doc", 350)
print(f"Generated API Documentation:\n{api_doc}")
print("-" * 60)

🔧 API DOCUMENTATION GENERATION:
Generated API Documentation:
```python
def authenticate_user(username, password, api_key=None):
    """
    Authenticates a user with username/password or API key
    Returns authentication token if successful
    """
    if api_key:
        return validate_api_key(api_key)
    else:
        return validate_credentials(username, password)

def get_user_profile(user_id, include_private=False):
    """
    Retrieves user profile information
    Can include private data if authorized
    """
    profile = fetch_profile(user_id)
    if include_private and is_authorized(user_id):
        profile.update(get_private_data(user_id))
    return profile
```
------------------------------------------------------------


In [22]:
architecture_topic = """
Microservices architecture for e-commerce platform with user authentication,
product catalog, order management, payment processing, and notification services.
Include REST APIs, database design, and deployment considerations.
"""

print("🏗️ SOFTWARE ARCHITECTURE DOCUMENTATION:")
arch_doc = generate_tech_summary(architecture_topic, "document", 400)
print(f"Architecture Documentation:\n{arch_doc}")
print("-" * 60)

🏗️ SOFTWARE ARCHITECTURE DOCUMENTATION:
Architecture Documentation:
## Microservices Architecture for E-Commerce Platform 

This document outlines a microservices architecture for an e-commerce platform, covering user authentication, product catalog, order management, payment processing, and notification services. 

### 1. Architecture Overview 

The e-commerce platform is implemented as a collection of independent, loosely coupled microservices. 
 Each microservice is responsible for a specific business capability and communicates with other services through well-defined APIs. 
 This approach offers several benefits: 

 * **Scalability:** Individual services can be scaled independently based on demand. 
 * **Flexibility:** New features can be added or existing ones modified without affecting other services. 
 * **Resilience:** Failure of one service does not necessarily bring down the entire platform. 
 * **Technology Diversity:** Different services can be implemented using different 

In [23]:
algorithm_text = """
The QuickSort algorithm is a divide-and-conquer sorting algorithm that works by selecting a
'pivot' element from the array and partitioning the other elements into two sub-arrays according
to whether they are less than or greater than the pivot. The sub-arrays are then sorted recursively.
This process continues until the base case of an array with one or zero elements is reached.
QuickSort has an average time complexity of O(n log n) but can degrade to O(n²) in the worst case
when the pivot is always the smallest or largest element. The space complexity is O(log n) due to
the recursive call stack. Various optimizations exist, including randomized pivot selection and
hybrid approaches that switch to insertion sort for small sub-arrays.
"""

print("⚡ ALGORITHM SUMMARIZATION:")
algo_summary = generate_tech_summary(algorithm_text, "summarize", 200)
print(f"Algorithm Summary: {algo_summary}")
print("-" * 60)

⚡ ALGORITHM SUMMARIZATION:
Algorithm Summary: we present a new randomized pivot selection strategy for QuickSort that improves the worst - case performance to O(n log n) . 
 the strategy is based on a novel probabilistic analysis of the pivot selection process . 
 the analysis is based on the notion of a pivot - dependent random variable and a new method for analyzing the distribution of the pivot - dependent random variable . 
 the new pivot selection strategy is simple to implement and has a low overhead . 
 we also present experimental results that show that the new strategy improves the worst - case performance of QuickSort .
document type: conference paper
# Limit length for training efficiency

Technical Summary:
model
we present a new randomized pivot selection strategy for QuickSort that improves the worst - case performance to O(n log n) . 
 the strategy is based on a novel probabilistic analysis of the pivot selection process . 
 the analysis is based on the notion of a pivot

In [24]:
db_schema = """
CREATE TABLE users (id, username, email, created_at);
CREATE TABLE products (id, name, price, category_id, stock_quantity);
CREATE TABLE orders (id, user_id, total_amount, status, order_date);
CREATE TABLE order_items (id, order_id, product_id, quantity, unit_price);
Foreign keys: orders.user_id -> users.id, order_items.order_id -> orders.id, order_items.product_id -> products.id
"""

print("🗄️ DATABASE DOCUMENTATION GENERATION:")
db_doc = generate_tech_summary(db_schema, "document", 300)
print(f"Database Documentation:\n{db_doc}")
print("-" * 60)

🗄️ DATABASE DOCUMENTATION GENERATION:
Database Documentation:
## Technical Documentation: 

model
```sql
-- Create the users table
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) UNIQUE NOT NULL,
    email VARCHAR(255) UNIQUE NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Create the products table
CREATE TABLE products (
    id SERIAL PRIMARY KEY,
    name VARCHAR(255) NOT NULL,
    price NUMERIC(10, 2) NOT NULL,
    category_id INTEGER REFERENCES categories(id),
    stock_quantity INTEGER NOT NULL
);

-- Create the orders table
CREATE TABLE orders (
    id SERIAL PRIMARY KEY,
    user_id INTEGER REFERENCES users(id),
    total_amount NUMERIC(10, 2) NOT NULL,
    status VARCHAR(20) NOT NULL DEFAULT 'pending',
    order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Create the order_items table
CREATE TABLE order_items (
    id SERIAL PRIMARY KEY,
    order_id INTEGER REFERENCES orders(id),
    product_id INTEGER REFERENCES products(id),
 

In [25]:
bug_report = """
Issue: Application crashes when uploading files larger than 10MB
Steps to reproduce: 1) Navigate to upload page 2) Select file > 10MB 3) Click upload button
Expected: File uploads successfully or shows size limit error
Actual: Application crashes with OutOfMemoryException
Environment: Chrome 120, Windows 11, 8GB RAM
Stack trace: java.lang.OutOfMemoryError at FileUploadHandler.processFile:142
Additional notes: Issue started after v2.3.1 deployment, affects 15% of users
Workaround: Users can split large files or use external upload service
Priority: High (affects core functionality)
"""

print("🐛 BUG REPORT SUMMARIZATION:")
bug_summary = generate_tech_summary(bug_report, "summarize", 150)
print(f"Bug Report Summary: {bug_summary}")
print("-" * 60)

🐛 BUG REPORT SUMMARIZATION:
Bug Report Summary: the upload page crashes when uploading files larger than 10mb . 
 the issue started after v2.3.1 deployment . 
 users can split large files or use external upload service . 
 the issue affects 15% of users .
------------------------------------------------------------
