In [None]:
# ======================================================================
# Cell 1: Clone Repository & Prepare Environment
# ====================================================================== 
#
# Clones the entire GitHub repository into the Colab runtime environment.
# This makes the LoRA adapter, dataset, requirements.txt, etc., all available.
#
print("Cloning the repository...")
!git clone https://github.com/BK7195/delta-compression-poc.git

# Change the current directory to the cloned repository.
# Without this, subsequent code cannot find 'adapter/' or 'data/'.
%cd delta-compression-poc
print("Moved into the repository directory.")

In [None]:
# ======================================================================
# Cell 2: Install Dependencies
# ======================================================================
#
# Reads the requirements.txt file and installs the necessary libraries.
#
print("Installing dependencies...")
!pip install -r requirements.txt
print("Dependencies installed successfully.")

In [None]:
# ======================================================================
# Cell 3: Run Inference
# ======================================================================
#
# The following is the content of inference.py.
# It loads the model, generates 70 prompts, and reconstructs the original text from each.
#
print("Starting the inference process...")
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Configuration: base checkpoint, location of the LoRA adapter inside the
# cloned repository, and the dtype used when loading the base weights.
model_name = "cyberagent/open-calm-1b"
adapter_path = "./adapter"
torch_dtype = torch.float16

# Load the base model. The dtype comes from `torch_dtype` above so that the
# setting lives in exactly one place.
print(f"Loading base model: {model_name}")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch_dtype,
)

# Load the tokenizer; fall back to the EOS token when no pad token is defined.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the adapter onto the base model
print(f"Loading adapter from: {adapter_path}")
model = PeftModel.from_pretrained(base_model, adapter_path)

# Explicitly switch the wrapped model to evaluation mode so that any dropout
# configured in the adapter or base model is disabled during generation.
model.eval()

print("Model with fp16 adapter loaded successfully.")
print(f"Model device: {model.device}")
print(f"Tokenizer pad token: {tokenizer.pad_token}")

# Build one prompt per (prefix, item) pair: prefixes 10, 20, ..., 70 combined
# with item numbers 1..10, i.e. 7 * 10 = 70 prompts, ordered prefix-major.
prefix_values = range(10, 80, 10)
item_numbers = range(1, 11)
prompts_for_inference = [
    f"固有ID_{prefix_value}K{item_number}の情報を展開せよ"
    for prefix_value in prefix_values
    for item_number in item_numbers
]

prompt_count = len(prompts_for_inference)
print(f"Generated {prompt_count} prompts for inference.")

# Decode every prompt in turn without sampling (do_sample=False) and print
# the full decoded sequence returned by `generate`.
# The generation settings are identical for all prompts, so build them once.
generation_kwargs = dict(
    max_new_tokens=1200,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

for prompt_number, prompt_text in enumerate(prompts_for_inference, start=1):
    print(f"--- Inference for Prompt {prompt_number}: {prompt_text} ---")

    # Tokenize the prompt and move the tensors to the model's device.
    encoded_prompt = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    # No gradients are needed for generation.
    with torch.no_grad():
        generated_ids = model.generate(**encoded_prompt, **generation_kwargs)

    # Decode the first (only) returned sequence, dropping special tokens.
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print("-" * 20)

print("Inference complete for all prompts.")