In [1]:
from google.colab import files
import os

# Name of the archive this notebook expects in the current working directory.
zip_file_name = "supplementary1B.zip"

print(f"Please upload the '{zip_file_name}' file, which contains all necessary components.")

if os.path.exists(zip_file_name):
    # The archive is already present, so the upload widget is not opened.
    print(f"\n'{zip_file_name}' already exists in the environment.")
else:
    # Ask for an upload, then check that the expected file name is among the results.
    uploaded = files.upload()
    upload_status = (
        f"\n'{zip_file_name}' has been uploaded successfully."
        if zip_file_name in uploaded
        else f"\nUploaded file not named '{zip_file_name}'. Please ensure the name is correct."
    )
    print(upload_status)

Please upload the 'supplementary1B.zip' file, which contains all necessary components.


Saving supplementary1B.zip to supplementary1B.zip

'supplementary1B.zip' has been uploaded successfully.


In [2]:
import zipfile
import os

zip_file_name = "supplementary1B.zip"

if os.path.exists(zip_file_name):
    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall('./')
    print(f"'{zip_file_name}' has been successfully unzipped.")
    # List the contents to confirm the file structure
    print("\nFile structure:")
    !ls -R
else:
    print(f"\nError: '{zip_file_name}' not found. Please upload it in the cell above before proceeding.")

'supplementary1B.zip' has been successfully unzipped.

File structure:
.:
sample_data  supplementary1B  supplementary1B.zip

./sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md

./supplementary1B:
data  README.md  requirements.txt  run_inference.ipynb

./supplementary1B/data:
10K10.txt  20K10.txt  30K10.txt  40K10.txt  50K10.txt  60K10.txt  70K10.txt
10K1.txt   20K1.txt   30K1.txt	 40K1.txt   50K1.txt   60K1.txt   70K1.txt
10K2.txt   20K2.txt   30K2.txt	 40K2.txt   50K2.txt   60K2.txt   70K2.txt
10K3.txt   20K3.txt   30K3.txt	 40K3.txt   50K3.txt   60K3.txt   70K3.txt
10K4.txt   20K4.txt   30K4.txt	 40K4.txt   50K4.txt   60K4.txt   70K4.txt
10K5.txt   20K5.txt   30K5.txt	 40K5.txt   50K5.txt   60K5.txt   70K5.txt
10K6.txt   20K6.txt   30K6.txt	 40K6.txt   50K6.txt   60K6.txt   70K6.txt
10K7.txt   20K7.txt   30K7.txt	 40K7.txt   50K7.txt   60K7.txt   70K7.txt
10K8.txt   20K8.txt   30K8.txt	 40K8.t

In [3]:
print("Installing dependencies...")
!pip install -r /content/supplementary1B/requirements.txt
print("Dependencies installed successfully.")

Installing dependencies...
Collecting transformers==4.53.1 (from -r /content/supplementary1B/requirements.txt (line 3))
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate==1.8.1 (from -r /content/supplementary1B/requirements.txt (line 7))
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->-r /content/supplementary1B/requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->-r /content/supplementary1B/requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->-r /content/supplementary1B/require

In [4]:
from google.colab import userdata
import os

# Read the Hugging Face access token from Colab Secrets and expose it through
# environment variables. Failures are reported with a message; nothing is raised.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')

    if not HF_TOKEN:
        # An empty/None secret would otherwise either raise an opaque TypeError from
        # os.environ or be reported as "successfully loaded" although unusable.
        print("HF_TOKEN is empty. Please set a valid token in Colab Secrets.")
    else:
        # `HF_TOKEN` is the variable name documented by current huggingface_hub
        # releases; the older `HUGGING_FACE_HUB_TOKEN` name is still set so that
        # code reading the previous name keeps working.
        os.environ['HF_TOKEN'] = HF_TOKEN
        os.environ['HUGGING_FACE_HUB_TOKEN'] = HF_TOKEN
        print("Hugging Face token has been successfully loaded.")
except userdata.SecretNotFoundError:
    print("HF_TOKEN not found. Please add it to Colab Secrets.")
except Exception as e:
    print(f"An error occurred: {e}")

Hugging Face token has been successfully loaded.


In [5]:
# Install the libraries used by the training / inference code below.
# NOTE(review): no versions are pinned here. The recorded output of this notebook shows
# this step downloading transformers 4.54.1 although the earlier requirements.txt install
# collected transformers==4.53.1, so the pinned environment is not what the run used.
%pip install transformers peft trl accelerate bitsandbytes torch

#------------------------------------------------------------------------------------------

import hashlib
import os

def calculate_sha256(filepath):
    """Return the hex-encoded SHA-256 digest of the file at ``filepath``.

    Errors are reported through the return value instead of being raised:
    ``"File not found"`` when the path does not exist, and
    ``"An error occurred: <message>"`` for any other exception.
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            # Hash the file in 4096-byte pieces so it is never read in one go.
            while True:
                chunk = stream.read(4096)
                if not chunk:
                    break
                digest.update(chunk)
    except FileNotFoundError:
        return "File not found"
    except Exception as err:
        return f"An error occurred: {err}"
    return digest.hexdigest()

# Data files are named <group>K<index>.txt with group = 10, 20, ..., 70
# and index = 1, ..., 10.
start_num = 10
end_num = 70
step = 10
sub_num_count = 10

separator = "-" * 50
print("Calculating SHA-256 hash for each file.")
print(separator)

# Build every expected path first, then report one line per path.
candidate_paths = [
    f"/content/supplementary1B/data/{group}K{index}.txt"
    for group in range(start_num, end_num + 1, step)
    for index in range(1, sub_num_count + 1)
]

for file_path in candidate_paths:
    # Paths that do not exist are reported inline instead of being hashed.
    digest_text = calculate_sha256(file_path) if os.path.exists(file_path) else "File not found"
    print(f"File: {file_path}, Hash: {digest_text}")

print(separator)
print("Processing complete.")


#------------------------------------------------------------------------------------------

from datasets import Dataset
from pathlib import Path

# Directory holding the extracted <prefix>K<i>.txt files.
base_dir = Path("/content/supplementary1B/data")

# One prompt/completion record per data file: the prompt names the file ID and the
# completion is the file's full UTF-8 text.
data = []
for prefix in range(10, 80, 10):
    for i in range(1, 11):
        record = {
            'prompt': f"固有ID_{prefix}K{i}の情報を展開せよ",
            'completion': (base_dir / f"{prefix}K{i}.txt").read_text(encoding='utf-8'),
        }
        data.append(record)

dataset = Dataset.from_list(data)
print(dataset)


#------------------------------------------------------------------------------------------



from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hub identifier of the base checkpoint to fine-tune.
model_name = "cyberagent/open-calm-1b"

# Load the weights in float16 with automatic device placement, then the tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

for status_line in (
    "Model and tokenizer loaded successfully.",
    f"Model device: {model.device}",
    f"Tokenizer pad token: {tokenizer.pad_token}",
):
    print(status_line)


#------------------------------------------------------------------------------------------

from peft import LoraConfig

# LoRA settings collected in one mapping: rank 1, alpha 32, no dropout, no bias
# training, applied to the modules named "query_key_value".
lora_settings = {
    "task_type": "CAUSAL_LM",
    "target_modules": ["query_key_value"],
    "r": 1,
    "lora_alpha": 32,
    "lora_dropout": 0.00,
    "bias": "none",
}
lora_config = LoraConfig(**lora_settings)

print("LoRA configuration created successfully.")
print(lora_config)


#------------------------------------------------------------------------------------------



from trl import SFTTrainer
from transformers import TrainingArguments

# Training hyper-parameters gathered in one mapping and passed on unchanged.
training_options = {
    "output_dir": "./sft_results",
    "report_to": "none",
    "num_train_epochs": 2000,
    "per_device_train_batch_size": 4,
    "learning_rate": 2e-4,
    "lr_scheduler_type": "cosine",
    "max_grad_norm": 0.3,
    "fp16": True,
    "logging_steps": 100,
}
training_arguments = TrainingArguments(**training_options)

# The trainer receives the base model together with the LoRA config.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    peft_config=lora_config,
    train_dataset=dataset,
)

print("Starting training...")
trainer.train()
print("Training finished.")


#------------------------------------------------------------------------------------------

import torch
import os

# Destination directory for the trained adapter weights.
adapter_output_dir = "./sft_results/adapter"
os.makedirs(adapter_output_dir, exist_ok=True)

# Cast the trained model to float16, then save it in safetensors format.
lora_model = trainer.model
lora_model.to(torch.float16)
lora_model.save_pretrained(adapter_output_dir, safe_serialization=True)

print(f"LoRA adapter (fp16) saved to {adapter_output_dir}")

# Sum the sizes of all regular (non-symlink) files below the adapter directory.
total_size = sum(
    os.path.getsize(os.path.join(folder, name))
    for folder, _subdirs, names in os.walk(adapter_output_dir)
    for name in names
    if not os.path.islink(os.path.join(folder, name))
)

total_size_kb = total_size / 1024

print(f"Adapter size: {total_size_kb:.2f} KB")


#------------------------------------------------------------------------------------------

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import contextlib

# Reload the base checkpoint and attach the saved LoRA adapter for inference.
model_name = "cyberagent/open-calm-1b"

adapter_path = "/content/sft_results/adapter"
# Single source of truth for the precision the base model is loaded in.
torch_dtype = torch.float16

# Load the base model (previously the dtype literal was repeated here and the
# `torch_dtype` setting above was never used).
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch_dtype,
)

# Load the tokenizer; fall back to the EOS token when no padding token is defined.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the adapter onto the base model
model = PeftModel.from_pretrained(base_model, adapter_path)

print("Model with fp16 adapter loaded successfully.")
print(f"Model device: {model.device}")
print(f"Tokenizer pad token: {tokenizer.pad_token}")


# === STEP 2: Run Inference and Automatically Save to File ===
# Generates a completion for every prompt, echoing each result to the screen and
# writing it to GENERATED_FILE in the header / text / separator layout that the
# verification step parses.

# Set the output filename in the /content/ directory
GENERATED_FILE = '/content/generated_output.txt'

print("\n" + "="*80)
print(f"Starting inference. Output will be displayed on the screen AND written to '{GENERATED_FILE}'.")
print("="*80)

# Same prompt template and ID ranges as used for the training records.
prompts_for_inference = [
    f"固有ID_{prefix}K{i}の情報を展開せよ"
    for prefix in range(10, 80, 10)
    for i in range(1, 11)
]

try:

    with open(GENERATED_FILE, 'w', encoding='utf-8') as f:

        for prompt_number, prompt in enumerate(prompts_for_inference, start=1):
            header = f"--- Inference for Prompt {prompt_number}: {prompt} ---"
            print(header)
            f.write(header + '\n')

            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=1200,
                    do_sample=False,  # greedy decoding
                    pad_token_id=tokenizer.eos_token_id # Use eos_token_id for padding
                )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            print(generated_text)
            print("-" * 20)

            f.write(generated_text + '\n')
            f.write("-" * 20 + '\n')

        completion_message = "Inference complete for all prompts."
        print(completion_message)
        f.write(completion_message + '\n')

except Exception as e:
    # Keep the notebook running, but do not claim success: the output file may be
    # missing or hold only the prompts processed before the failure.
    print(f"\n❌ An error occurred: {e}")
    print("\n" + "="*80)
    print(f"⚠️ Inference did not finish. '{GENERATED_FILE}' may be missing or incomplete.")
    print("="*80)

else:
    # Reached only when every prompt was processed and written.
    print("\n" + "="*80)
    print(f"✅ Process complete. Inference results have been saved to '{GENERATED_FILE}'.")
    print("You can find the file in the file browser on the left.")
    print("You can now run the verification code using this file.")
    print("="*80)


#------------------------------------------------------------------------------------------

import os
import re
import difflib

# --- Configuration ---
# ORIGINALS_DIR:  folder containing the original .txt files
# GENERATED_FILE: file containing all model outputs
ORIGINALS_DIR, GENERATED_FILE = (
    '/content/supplementary1B/data',
    '/content/generated_output.txt',
)
# --- End of Configuration ---


# Counters and ID lists filled in by the comparison step.
total_files = matched_files = mismatched_files = 0
# Two separate list objects (a chained assignment would make them share one list).
not_found_in_originals, not_found_in_generated = [], []


# 1. Read the generated output file and parse it into a dictionary
#    mapping each file ID (e.g. "10K1") to the text generated for it.
parsed_data = {}
try:
    with open(GENERATED_FILE, 'r', encoding='utf-8') as f:
        generated_content_full = f.read()
except FileNotFoundError:
    print(f"❌ ERROR: Generated file not found. Please check the path: '{GENERATED_FILE}'")
    # Abort this step by re-raising. The previous `exit()` call asks a notebook
    # kernel to shut down instead of simply stopping the verification.
    raise

# Use regex to extract the ID and its corresponding content for all entries.
# Each entry consists of a header line, the echoed prompt, the generated text and a
# separator line of 20 dashes. The echoed prompt is matched outside the capture
# group, so group 2 already holds only the text that follows the prompt.
pattern = re.compile(
    r"--- Inference for Prompt \d+: 固有ID_(\w+K\d+)の情報を展開せよ ---\s*\n"
    r"固有ID_\1の情報を展開せよ\s*?"
    r"(.*?)\n\s*?"
    r"--------------------",
    re.DOTALL  # DOTALL flag allows '.' to match newlines
)

for file_id, content in pattern.findall(generated_content_full):
    # Surrounding whitespace is dropped; the originals are stripped the same way
    # before comparison.
    parsed_data[file_id] = content.strip()

print(f"✅ Successfully read and parsed {len(parsed_data)} entries from '{GENERATED_FILE}'.")


# 2. Loop from 10K1 to 70K10 and compare each original file with the parsed output
print("\n--- Starting Verification ---")
for prefix in range(10, 71, 10):
    for i in range(1, 11):
        total_files += 1
        file_id = f"{prefix}K{i}"
        original_filepath = os.path.join(ORIGINALS_DIR, f"{file_id}.txt")

        # Check if the original file exists
        if not os.path.exists(original_filepath):
            print(f"❓ SKIPPED: Original file '{file_id}.txt' not found in '{ORIGINALS_DIR}'.")
            not_found_in_originals.append(file_id)
            continue

        # Read the content of the original file (surrounding whitespace ignored)
        with open(original_filepath, 'r', encoding='utf-8') as f:
            original_text = f.read().strip()

        # Check if the ID exists in the parsed generated data
        if file_id in parsed_data:
            generated_text = parsed_data[file_id]

            # Compare the contents
            if original_text == generated_text:
                matched_files += 1
            else:
                mismatched_files += 1
                print(f"❌ MISMATCH: {file_id}")

                # Both texts are stripped, so their last line has no trailing newline.
                # Splitting without line endings and using lineterm="" keeps every diff
                # line newline-free, so each one can be printed on its own row instead
                # of running into the next line.
                diff = difflib.unified_diff(
                    original_text.splitlines(),
                    generated_text.splitlines(),
                    fromfile=f'Original_{file_id}',
                    tofile=f'Generated_{file_id}',
                    lineterm="",
                )
                print("--- Diff (up to 5 lines) ---")
                # First 7 diff entries: the two file-header lines plus five lines of
                # the first hunk (including its @@ header).
                for line in list(diff)[:7]:
                    print(line)
                print("------------------------------")

        else:
            print(f"❓ NOT FOUND: Content for '{file_id}' was not found in the generated output file.")
            not_found_in_generated.append(file_id)


# 3. Display the final summary
# The report is assembled line by line; the "missing" lines appear only when the
# corresponding list is non-empty.
summary_lines = [
    "\n--- ✔️ Verification Complete ✔️ ---",
    f"Total files checked: {total_files}",
    f"✅ Matched: {matched_files} files",
    f"❌ Mismatched: {mismatched_files} files",
]
if not_found_in_originals:
    summary_lines.append(
        f"❓ Originals missing: {len(not_found_in_originals)} files ({', '.join(not_found_in_originals)})"
    )
if not_found_in_generated:
    summary_lines.append(
        f"❓ Generated data missing: {len(not_found_in_generated)} files ({', '.join(not_found_in_generated)})"
    )
summary_lines.append("-----------------------------------")

for summary_line in summary_lines:
    print(summary_line)



Collecting trl
  Downloading trl-0.20.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading trl-0.20.0-py3-none-any.whl (504 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.54.1-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m110.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transforme

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/323 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.
Model device: cuda:0
Tokenizer pad token: <|padding|>
LoRA configuration created successfully.
LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=1, target_modules={'query_key_value'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)


Adding EOS to train dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
100,2.797
200,2.5601
300,2.3313
400,2.1276
500,1.9342
600,1.7758
700,1.6497
800,1.5367
900,1.4466
1000,1.3569


Training finished.
LoRA adapter (fp16) saved to ./sft_results/adapter
Adapter size: 396.54 KB
Model with fp16 adapter loaded successfully.
Model device: cuda:0
Tokenizer pad token: <|padding|>

Starting inference. Output will be displayed on the screen AND written to '/content/generated_output.txt'.
--- Inference for Prompt 1: 固有ID_10K1の情報を展開せよ ---
固有ID_10K1の情報を展開せよ2045年6月13日、東京。梅雨入りを間近に控えた空は、薄墨を流したような雲に覆われ、湿度を含んだ空気が肌にまとわりつく。俺、高橋健司は65歳になった。1980年に生まれた俺たちの世代が、20世紀のSF映画で夢想した「未来」は、ある意味では遥かに穏やかに、そしてある意味では想像を絶するほど根源的な形で、日常に溶け込んでいた。

今朝も、一日の始まりは長寿化処置を受けたトイプードルのモカとの散歩からだ。スマートホームシステムが俺の覚醒を検知して自動でカーテンを開け、コーヒーを淹れ始める。その香りを背中で感じながら、俺は玄関でモカに特製のハーネスを装着する。湿ったアスファルトの匂いと、遠くで聞こえる物流ドローンの静かな飛行音が混じり合う。すれ違う人々は皆、ARグラスをかけているせいか、視線はどこか虚空を見つめているように見える。27歳になったモカは、見た目こそ少し白髪が増えたものの、体内に埋め込まれた補助人工心臓と、血管内を巡回して組織の老化を修復するナノマシンのおかげで、まだまだ元気に俺の少し前を歩く。その小さな背中を見ながら、俺はいつも世界の不可逆的な変容を実感するのだ。かつて愛犬の死は、多くの家族にとって最初の「喪失」の経験だった。しかし今、モカは俺たち夫婦と同じ時間を、あるいはそれ以上の時間を生きる可能性すらある。数年前、モカに初めてのナノマシン治療を施すか美咲と話し合った夜のことを思い出す。「この子がいない生活なんて考えられない。でも、自