In [5]:
# Install required packages with updated versions
!pip install transformers==4.51.0 datasets==2.20.0 matplotlib==3.9.0 tqdm==4.66.5 accelerate==0.34.2 --upgrade

Collecting transformers==4.51.0
  Downloading transformers-4.51.0-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.51.0)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.51.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m256.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m288.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.0
    Unins

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import transformers

# Confirm transformers version
print(f"Transformers Version: {transformers.__version__}")

# Configuration
teacher_model_name = "Qwen/Qwen3-1.7B"
student_model_name = "openai-community/gpt2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 128  # Can increase to 256 if memory allows (A100 SXM has 80GB VRAM)
num_epochs = 2
max_length = 128
max_new_tokens = 50
inference_subset_size = 20

# Check if BF16 is supported (A100 SXM supports BF16 natively)
bf16_supported = torch.cuda.is_bf16_supported()
print(f"BF16 Supported: {bf16_supported}")

print(f"Date and Time: 03:54 PM IST, Thursday, June 12, 2025")
print(f"Device: {device}")
print(f"Teacher Model: {teacher_model_name}")
print(f"Student Model: {student_model_name}")
print(f"Batch Size: {batch_size}, Epochs: {num_epochs}, Max Length: {max_length}")
print(f"Max New Tokens for Generation: {max_new_tokens}")
print(f"Inference Subset Size: {inference_subset_size}")

Transformers Version: 4.51.0
BF16 Supported: True
Date and Time: 03:54 PM IST, Thursday, June 12, 2025
Device: cuda
Teacher Model: Qwen/Qwen3-1.7B
Student Model: openai-community/gpt2
Batch Size: 128, Epochs: 2, Max Length: 128
Max New Tokens for Generation: 50
Inference Subset Size: 20


In [12]:
# Load dataset
dataset = load_dataset("Ximing/ROCStories")

# Preprocess dataset for generation
def preprocess_function(examples):
    prompts = examples['prompt']
    continuations = examples['continuation']
    constraint_words = examples['constraint_words']
    
    tokenizer = AutoTokenizer.from_pretrained(student_model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    
    inputs = [f"{prompt} {continuation}" for prompt, continuation in zip(prompts, continuations)]
    tokenized = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
        return_attention_mask=True
    )
    
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "prompts": prompts,
        "continuations": continuations,
        "constraint_words": constraint_words
    }

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True)
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["validation"]

print(f"Training Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(eval_dataset)}")

Training Dataset Size: 16207
Validation Dataset Size: 1817


In [14]:
# Load models
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model_name).to(device)
student_model = AutoModelForCausalLM.from_pretrained(student_model_name).to(device)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
student_tokenizer = AutoTokenizer.from_pretrained(student_model_name)
student_tokenizer.pad_token = student_tokenizer.eos_token
student_tokenizer.padding_side = "left"
teacher_tokenizer.padding_side = "left"

teacher_model.eval()
student_model.train()

# Precompute teacher outputs for the training dataset
print("Precomputing teacher outputs for training dataset...")
teacher_outputs_list = []
teacher_model.eval()
with torch.no_grad():
    for i in tqdm(range(0, len(train_dataset), batch_size), desc="Precomputing Teacher Outputs"):
        batch = train_dataset[i:i+batch_size]
        prompts = batch["prompts"]
        teacher_inputs = teacher_tokenizer(
            prompts,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
            return_attention_mask=True
        ).to(device)
        
        teacher_outputs = teacher_model.generate(
            input_ids=teacher_inputs["input_ids"],
            attention_mask=teacher_inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            # Removed early_stopping since num_beams=1 (greedy decoding)
        )
        
        teacher_continuations = [teacher_tokenizer.decode(output, skip_special_tokens=True) for output in teacher_outputs]
        teacher_outputs_list.extend(teacher_continuations)

# Verify the length of teacher_outputs_list matches the dataset
print(f"Length of teacher_outputs_list: {len(teacher_outputs_list)}")
print(f"Length of train_dataset: {len(train_dataset)}")
if len(teacher_outputs_list) != len(train_dataset):
    raise ValueError("Mismatch between teacher_outputs_list and train_dataset lengths!")

# Add teacher outputs to the dataset
def add_teacher_outputs(examples, idx):
    # idx is a list of indices for the examples in this batch; use idx[0] as the starting index
    start_idx = idx[0]
    end_idx = idx[-1] + 1  # idx[-1] is the last index in the batch; add 1 to include it in the slice
    examples["teacher_outputs"] = teacher_outputs_list[start_idx:end_idx]
    return examples

# Apply the mapping with indices to ensure correct alignment
train_dataset = train_dataset.map(add_teacher_outputs, with_indices=True, batched=True)

# Debug: Check if teacher_outputs is added
print("Sample from train_dataset after mapping:")
sample = train_dataset[0]
print(f"Keys in sample: {list(sample.keys())}")
if "teacher_outputs" in sample:
    print(f"teacher_outputs sample: {sample['teacher_outputs']}")
else:
    raise ValueError("teacher_outputs not found in dataset after mapping!")

print("Models and tokenizers loaded successfully. Teacher outputs precomputed.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Precomputing teacher outputs for training dataset...


Precomputing Teacher Outputs: 100%|██████████| 127/127 [13:18<00:00,  6.29s/it]

Length of teacher_outputs_list: 16207
Length of train_dataset: 16207





Map:   0%|          | 0/16207 [00:00<?, ? examples/s]

Sample from train_dataset after mapping:
Keys in sample: ['story_id', 'prompt', 'continuation', 'constraint_words', 'input_ids', 'attention_mask', 'prompts', 'continuations', 'teacher_outputs']
teacher_outputs sample: On my way to work I stopped to get some coffee. I ordered a coffee, and the coffee was 30% less than the original price. The total cost was 35. What is the original price of the coffee?

Let me think. So, the problem says that I ordered a coffee
Models and tokenizers loaded successfully. Teacher outputs precomputed.


In [17]:
# Custom Trainer for distillation
class DistillationTrainer(Trainer):
    def __init__(self, teacher_tokenizer, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_tokenizer = teacher_tokenizer
        self.train_losses = []

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        input_ids = inputs.pop("input_ids").to(device)
        attention_mask = inputs.pop("attention_mask").to(device)
        teacher_continuations = inputs.pop("teacher_outputs")
        
        # Convert teacher outputs to student tokenizer's vocabulary
        student_tokenizer.padding_side = "left"
        student_labels = student_tokenizer(
            teacher_continuations,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
            return_attention_mask=True
        )["input_ids"].to(device)
        
        # Student forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=student_labels)
        loss = outputs.loss
        
        return (loss, outputs) if return_outputs else loss

    def training_step(self, model, inputs, num_items_in_batch=None):
        loss = super().training_step(model, inputs, num_items_in_batch)
        self.train_losses.append(loss.item())
        return loss

print("Custom trainer defined.")

Custom trainer defined.


In [21]:
from transformers import default_data_collator
from torch.utils.data import DataLoader

# Define a custom data collator that handles teacher_outputs and excludes other string fields
def custom_data_collator(features):
    # Extract string fields that we don't want to tensorize
    teacher_outputs = [f.pop("teacher_outputs") for f in features]
    prompts = [f.pop("prompts") for f in features]  # Not needed for training
    continuations = [f.pop("continuations") for f in features]  # Not needed for training
    constraint_words = [f.pop("constraint_words") for f in features]  # Not needed for training
    
    # Use the default collator for the remaining fields (input_ids, attention_mask)
    batch = default_data_collator(features)
    
    # Add teacher_outputs back to the batch as a list of strings
    batch["teacher_outputs"] = teacher_outputs
    
    return batch

# Training arguments
training_args = TrainingArguments(
    output_dir="/workspace/distillation_results",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="/workspace/logs",
    logging_steps=50,
    eval_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    gradient_accumulation_steps=1,
    fp16=False,  # Disable FP16 since we're using BF16
    bf16=bf16_supported,  # Use BF16 on A100 SXM
    gradient_checkpointing=True,
    no_cuda=False,
    ddp_backend=None,
    remove_unused_columns=False,  # Keep all columns, including teacher_outputs
)

# Initialize trainer with the custom data collator
trainer = DistillationTrainer(
    model=student_model,
    teacher_tokenizer=teacher_tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=custom_data_collator,  # Use the updated custom collator
)

# Train the student model, starting from scratch
trainer.train(resume_from_checkpoint=False)

# Aggregate training loss per epoch
steps_per_epoch = len(train_dataset) // (batch_size * training_args.gradient_accumulation_steps)
train_losses_per_epoch = [
    np.mean(trainer.train_losses[i * steps_per_epoch: (i + 1) * steps_per_epoch])
    for i in range(num_epochs)
]

# Compress checkpoints to save space
import tarfile
import glob
checkpoint_dirs = glob.glob("/workspace/distillation_results/checkpoint-*")
for checkpoint_dir in checkpoint_dirs:
    tar_path = f"{checkpoint_dir}.tar.gz"
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(checkpoint_dir, arcname=os.path.basename(checkpoint_dir))
    print(f"Compressed checkpoint: {tar_path}")

print("\nTraining completed.")
print("Training Loss per Epoch:", [f"{loss:.4f}" for loss in train_losses_per_epoch])

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,6.8664
100,3.2611
150,2.939
200,2.787
250,2.6997


Compressed checkpoint: /workspace/distillation_results/checkpoint-254.tar.gz
Compressed checkpoint: /workspace/distillation_results/checkpoint-127.tar.gz

Training completed.
Training Loss per Epoch: ['4.6367', '2.7681']


In [23]:
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import tarfile

In [26]:
# Inference: Generate continuations and evaluate constraint word inclusion
subset_eval_dataset = eval_dataset.select(range(min(inference_subset_size, len(eval_dataset))))
teacher_success = 0
student_success = 0

teacher_model.eval()
student_model.eval()

for i, example in enumerate(tqdm(subset_eval_dataset, desc="Inference")):
    prompt = example["prompts"]
    constraint_words = example["constraint_words"]  # Already a list, no eval() needed
    true_continuation = example["continuations"]
    
    # Debug: Print prompt, true continuation, and constraint words for the first few examples
    if i < 3:  # Print for the first 3 examples
        print(f"\nExample {i + 1}:")
        print(f"Prompt: {prompt}")
        print(f"True Continuation: {true_continuation}")
        print(f"Constraint Words: {constraint_words}")
    
    teacher_tokenizer.padding_side = "left"
    teacher_inputs = teacher_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length, return_attention_mask=True).to(device)
    
    student_tokenizer.padding_side = "left"
    student_inputs = student_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length, return_attention_mask=True).to(device)
    
    with torch.no_grad():
        teacher_outputs = teacher_model.generate(
            input_ids=teacher_inputs["input_ids"],
            attention_mask=teacher_inputs["attention_mask"],
            max_new_tokens=100,  # Increase to give more room for constraint words
            num_beams=5,
            no_repeat_ngram_size=1,  # Reduce to allow more flexibility (was 2)
            early_stopping=True
        )
        teacher_continuation = teacher_tokenizer.decode(teacher_outputs[0], skip_special_tokens=True)
    
    with torch.no_grad():
        student_outputs = student_model.generate(
            input_ids=student_inputs["input_ids"],
            attention_mask=student_inputs["attention_mask"],
            max_new_tokens=100,  # Increase to give more room for constraint words
            num_beams=5,
            no_repeat_ngram_size=1,  # Reduce to allow more flexibility (was 2)
            early_stopping=True
        )
        student_continuation = student_tokenizer.decode(student_outputs[0], skip_special_tokens=True)
    
    # Debug: Print generated continuations for the first few examples
    if i < 3:
        print(f"Teacher Continuation: {teacher_continuation}")
        print(f"Student Continuation: {student_continuation}")
    
    teacher_continuation_lower = teacher_continuation.lower()
    student_continuation_lower = student_continuation.lower()
    constraint_words_lower = [word.lower() for word in constraint_words]
    
    # Relaxed evaluation: Count as success if at least one constraint word is present
    teacher_any_present = any(word in teacher_continuation_lower for word in constraint_words_lower)
    student_any_present = any(word in student_continuation_lower for word in constraint_words_lower)
    
    if teacher_any_present:
        teacher_success += 1
    if student_any_present:
        student_success += 1

# Compute success rates
teacher_success_rate = teacher_success / inference_subset_size
student_success_rate = student_success / inference_subset_size

print("\nInference Results (At Least One Constraint Word Present):")
print(f"Teacher Constraint Word Inclusion Success Rate: {teacher_success_rate:.4f}")
print(f"Student Constraint Word Inclusion Success Rate: {student_success_rate:.4f}")

Inference:   0%|          | 0/20 [00:00<?, ?it/s]


Example 1:
Prompt: Ryan was called by his friend to skip work one day.
True Continuation: He missed his train to work and instead went to the park. Ryan and his friend played with birds at the park all day. At the end of the day, they left the park and saw Ryan's boss. Ryan got fired.
Constraint Words: ['train', 'park', 'Ryan', 'friend', 'birds', 'day', 'end', 'boss']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:   5%|▌         | 1/20 [00:03<01:00,  3.19s/it]

Teacher Continuation: Ryan was called by his friend to skip work one day. He left the house at 8:30 AM, and it took him a certain amount of time before he got back home after having some food for lunch.
The question is how long did Ryan spend in total on this trip?

To solve problems like these where we are given information about an event that involves multiple steps (like traveling), let's break them down into their individual components.

We have two key points here:

- **Starting point**: The journey began when Ryans had already eaten breakfast
Student Continuation: Ryan was called by his friend to skip work one day. He didn't take a break and went home, but the next morning he woke up at 3:00 PM with no sleep for about 10-15 minutes."

Example 2:
Prompt: Neil had been journeying through Asia.
True Continuation: Now he had worked his way south into Australia. Neil was so excited to see Australian culture. He was thrilled at the prospect of exotic animals and people! His favorite mo

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  10%|█         | 2/20 [00:06<00:57,  3.21s/it]

Teacher Continuation: Neil had been journeying through Asia. He traveled for 3 days, then returned to the United States (US) and flew on a commercial aircraft where he was served by an airline that provided meals in plastic containers with their lids off? | Question: Why did Neil's meal container have its lid open?

Choices:
- To make sure it is not opened during flight
 - The food would be more appetizing if they were left unsealed.
The correct answer:

To find out why Neils'mealcontainerhaditslidopen
Student Continuation: Neil had been journeying through Asia. He was in the middle of a long trip, and he decided to go back home for his family's Christmas Eve dinner with some friends who were staying at their parents' house."
"I'm sure I'll be very excited about it!"

Example 3:
Prompt: My class went to the Everglades for our field trip.
True Continuation: We did some sightseeing in several of the forests. We also got the opportunity to travel in water. The bus ride home was long and b

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  15%|█▌        | 3/20 [00:09<00:52,  3.07s/it]

Teacher Continuation: My class went to the Everglades for our field trip. We saw a lot of wildlife, but I'm not sure if it's alligators or crocodiles? How can we tell them apart?

What are some things that people do when they're in trouble and need help from others?
Do you have any advice about how someone should handle an argument between two friends where one is being very aggressive with their words towards another person?"

Please answer these questions.
Answer:
The first question: To identify whether what was seen were aga... (I'll stop
Student Continuation: My class went to the Everglades for our field trip. We had a lot of fun, and I'm very excited about it."


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  20%|██        | 4/20 [00:12<00:49,  3.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  25%|██▌       | 5/20 [00:15<00:46,  3.08s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  30%|███       | 6/20 [00:19<00:45,  3.27s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  35%|███▌      | 7/20 [00:22<00:41,  3.22s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  40%|████      | 8/20 [00:25<00:38,  3.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  45%|████▌     | 9/20 [00:28<00:35,  3.22s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  50%|█████     | 10/20 [00:31<00:31,  3.13s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Inference:  55%|█████▌  


Inference Results (At Least One Constraint Word Present):
Teacher Constraint Word Inclusion Success Rate: 0.8500
Student Constraint Word Inclusion Success Rate: 0.9000





In [27]:
# Plot 1: Training Loss
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses_per_epoch, label="Training Loss", color="blue")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss per Epoch")
plt.legend()
plt.grid()
plt.savefig("/workspace/loss_plot.png")
plt.close()

# Plot 2: Teacher vs Student Constraint Word Inclusion Success Rate
plt.figure(figsize=(6, 4))
plt.bar(["Teacher (Qwen3-1.7B)", "Student (GPT-2)"], [teacher_success_rate, student_success_rate], color=["orange", "purple"])
plt.ylabel("Success Rate")
plt.title("Teacher vs Student: Constraint Word Inclusion Success Rate")
plt.ylim(0, 1)
plt.grid(axis="y")
plt.savefig("/workspace/constraint_success_plot.png")
plt.close()

print("\nPlots Generated:")
print("- /workspace/loss_plot.png: Training Loss per Epoch")
print("- /workspace/constraint_success_plot.png: Teacher vs Student Constraint Word Inclusion Success Rate")


Plots Generated:
- /workspace/loss_plot.png: Training Loss per Epoch
- /workspace/constraint_success_plot.png: Teacher vs Student Constraint Word Inclusion Success Rate


In [28]:
# Save the student model to persistent storage
student_model.save_pretrained("/workspace/distilled_student_model")
student_tokenizer.save_pretrained("/workspace/distilled_student_model")

# Compress the final model
with tarfile.open("/workspace/distilled_student_model.tar.gz", "w:gz") as tar:
    tar.add("/workspace/distilled_student_model", arcname="distilled_student_model")
print("Compressed final model: /workspace/distilled_student_model.tar.gz")

print("\nStudent model saved at: /workspace/distilled_student_model")

Compressed final model: /workspace/distilled_student_model.tar.gz

Student model saved at: /workspace/distilled_student_model


In [None]:
# Check the installed version of huggingface_hub
import huggingface_hub
print(f"Installed huggingface_hub version: {huggingface_hub.__version__}")

from huggingface_hub import login, HfApi, create_repo, get_repo_discussions



# Step 2: Define the repository ID
repo_id = "here4code/distilled-gpt2-story-generation-Qwen3-1.7B"
api = HfApi()

# Step 3: Check if the repository already exists
try:
    get_repo_discussions(repo_id=repo_id, repo_type="model")
    repo_exists = True
    print(f"Repository {repo_id} already exists. Proceeding to upload to the existing repository...")
except Exception as e:
    repo_exists = False
    print(f"Repository {repo_id} does not exist or cannot be accessed: {e}")
    print("Attempting to create the repository...")

# Step 4: Create the repository if it doesn't exist
if not repo_exists:
    try:
        create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=False)
        print(f"Repository {repo_id} created successfully.")
    except Exception as e:
        print(f"Failed to create repository: {e}")
        print("Possible issues:")
        print("- Token lacks write permissions. Generate a new token with write access at https://huggingface.co/settings/tokens.")
        print("- Namespace mismatch: Ensure the token is associated with the 'here4code' user.")
        print("- If the repository already exists, you may need to delete it or change the repo_id.")
        raise

# Step 5: Upload the model folder
try:
    api.upload_folder(
        folder_path="/workspace/distilled_student_model",
        repo_id=repo_id,
        repo_type="model",
        commit_message="Upload distilled GPT-2 story generation model"
    )
    print("Model folder uploaded successfully.")
except Exception as e:
    print(f"Error during upload: {e}")
    print("Ensure the token has write permissions and the repository is accessible.")
    raise

# Step 6: Create and upload a basic model card (README.md)
model_card_content = """
# Distilled GPT-2 Story Generation Model (June 2025)

This is a distilled version of GPT-2, fine-tuned using knowledge distillation from a teacher model (Qwen3-1.7B) on the ROCStories dataset. The model is designed for story generation with constraint words.

## Model Details
- **Base Model**: GPT-2
- **Teacher Model**: Qwen3-1.7B
- **Dataset**: ROCStories (Ximing/ROCStories)
- **Training Objective**: Knowledge distillation to match teacher outputs.
- **Training Date**: June 12, 2025
- **Evaluation**: Constraint word inclusion success rate.

## Usage
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "{repo_id}"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Once upon a time, there was a happy dog"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Training Details
- **Epochs**: {num_epochs}
- **Batch Size**: {batch_size}
- **Training Loss per Epoch**: {train_losses_per_epoch}

## Evaluation Results
- **Teacher Constraint Word Inclusion Success Rate**: {teacher_success_rate:.4f}
- **Student Constraint Word Inclusion Success Rate**: {student_success_rate:.4f}

## License
This model is released under the MIT License.
"""

# Write the model card to a README.md file and upload it
with open("/workspace/distilled_student_model/README.md", "w") as f:
    f.write(model_card_content.format(
        repo_id=repo_id,
        num_epochs=num_epochs,
        batch_size=batch_size,
        train_losses_per_epoch=train_losses_per_epoch,
        teacher_success_rate=teacher_success_rate,
        student_success_rate=student_success_rate
    ))

api.upload_file(
    path_or_fileobj="/workspace/distilled_student_model/README.md",
    path_in_repo="README.md",
    repo_id=repo_id,
    repo_type="model",
    commit_message="Add model card"
)

print(f"\nModel successfully pushed to: https://huggingface.co/{repo_id}")
print(f"Visit the link to view your model and enhance the model card if needed.")

Installed huggingface_hub version: 0.33.0
Repository here4code/distilled-gpt2-story-generation-Qwen3-1.7B already exists. Proceeding to upload to the existing repository...


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model folder uploaded successfully.


- empty or missing yaml metadata in repo card



Model successfully pushed to: https://huggingface.co/here4code/distilled-gpt2-story-generation-Qwen3-1.7B
Visit the link to view your model and enhance the model card if needed.
