<a href="https://colab.research.google.com/github/AbdalrahmanBashir/PicoCTF/blob/main/E2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers accelerate bitsandbytes peft datasets pyarrow fastparquet -q

In [2]:
!pip uninstall numpy -y
!pip install numpy==1.26.0 --no-build-isolation --no-deps --force-reinstall

Found existing installation: numpy 1.26.0
Uninstalling numpy-1.26.0:
  Successfully uninstalled numpy-1.26.0
Collecting numpy==1.26.0
  Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.0


In [3]:
from google.colab import drive
import pandas as pd, pyarrow, datasets, numpy , torch

In [4]:
print(f"datasets version: {datasets.__version__}")
print(f"numpy version: {numpy.__version__}")
print(f"torch version: {torch.__version__}")
print(f"pyarrow version: {pyarrow.__version__}")

datasets version: 2.14.4
numpy version: 1.26.0
torch version: 2.6.0+cu124
pyarrow version: 18.1.0


In [5]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
DRIVE_PARQUET_FILE  = "/content/drive/MyDrive/realistic_macro_micro_dataset.parquet"
LOCAL_PARQUET_FILE = "/content/realistic_macro_micro_dataset.parquet"

!cp "{DRIVE_PARQUET_FILE}" "{LOCAL_PARQUET_FILE}"

print("File copied to local runtime.")

File copied to local runtime.


In [7]:
import psutil
ram = psutil.virtual_memory().total / (1024**3)
print(f"Total RAM available: {ram:.1f} GB")

Total RAM available: 51.0 GB


In [8]:
import pyarrow.parquet as pq
metadata = pq.ParquetFile(LOCAL_PARQUET_FILE).metadata
print(metadata)

<pyarrow._parquet.FileMetaData object at 0x7f8ff64bcd10>
  created_by: parquet-cpp-arrow version 19.0.1
  num_columns: 6
  num_rows: 1681517
  num_row_groups: 2
  format_version: 2.6
  serialized_size: 5248


In [9]:
import pandas as pd
# Open the file
# pqfile = pq.ParquetFile(LOCAL_PARQUET_FILE)

# Read each row-group into a small Dataset
# parts = []
# for i in range(pqfile.num_row_groups):
#     df_chunk = pqfile.read_row_group(i).to_pandas()
#     parts.append(Dataset.from_pandas(df_chunk))

# Concatenate lazily
# full_ds = concatenate_datasets(parts, axis=0)

# Refactor to use pandas
full_df = pd.read_parquet(LOCAL_PARQUET_FILE)
print("Loaded rows:", len(full_df))

Loaded rows: 1681517


In [10]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# 80/20 stratified on 'prompt'
train_df, eval_df = train_test_split(
    full_df,
    test_size=0.2,
    stratify=full_df["prompt"] # Stratify requires passing the column itself
)

# Convert pandas DataFrames back to datasets.Dataset
train_ds = Dataset.from_pandas(train_df)
eval_ds = Dataset.from_pandas(eval_df)

print("Train rows:", len(train_ds))
print("Eval rows:", len(eval_ds))

Train rows: 1345213
Eval rows: 336304


In [12]:
from transformers import AutoTokenizer
from google.colab import userdata
from datasets import Dataset
import multiprocessing

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
HF_TOKEN = userdata.get('HF_TOKEN')

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
    use_fast=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess(batch):
    inputs = [
        f"PROMPT: {p}\nOBS: IMU={i} GPS={g} POSE={ps}"
        for p, i, g, ps in zip(batch["prompt"], batch["imu"], batch["gps"], batch["pose"])
    ]
    targets = [str(fut) for fut in batch["future"]]

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    labels = tokenizer(
        targets,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Get number of CPU cores
num_proc = multiprocessing.cpu_count()

# Tokenize using parallel preprocessing (CPU optimized)
train_tok = train_ds.map(
    preprocess,
    batched=True,
    batch_size=2000,
    remove_columns=train_ds.column_names,
    num_proc=num_proc,  # Use all available CPU cores
    desc="Tokenizing train set"
)

eval_tok = eval_ds.map(
    preprocess,
    batched=True,
    batch_size=2000,
    remove_columns=eval_ds.column_names,
    num_proc=num_proc,
    desc="Tokenizing eval set"
)

# Set torch tensor format
train_tok.set_format("torch")
eval_tok.set_format("torch")

Tokenizing train set (num_proc=8):   0%|          | 0/1345213 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Tokenizing eval set (num_proc=8):   0%|          | 0/336304 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [13]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import (
  prepare_model_for_kbit_training,
  LoraConfig,
  get_peft_model,
  TaskType
)

# Check if CUDA is available
if not torch.cuda.is_available():
    print("CUDA is not available. Please change your Colab runtime to GPU.")

# 4-bit quantization config
bnb_cfg = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
  MODEL_ID,
  quantization_config=bnb_cfg,
  device_map={"": 0},
  trust_remote_code=True
)


# prepare & attach LoRA
model = prepare_model_for_kbit_training(model)
lora_cfg = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,
  r=8,
  lora_alpha=32,
  lora_dropout=0.05,
  target_modules=["qkv_proj","o_proj"] # Updated target modules for Phi-3
)
model = get_peft_model(model, lora_cfg)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [23]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

DATA_DIR = "/content/drive/MyDrive"

training_args = TrainingArguments(
  output_dir=f"{DATA_DIR}/lora_finetuned_Phi3",
  per_device_train_batch_size=16,          # Increased from 8 → 16
  gradient_accumulation_steps=2,           # Increased for larger effective batch size (32)
  max_steps=1000,                          # Train for 1000 steps (quicker feedback loop)
  learning_rate=3e-4,
  bf16=False,                              # Set True if your GPU supports it (A100, etc.)
  fp16=True,                               # Use FP16 on T4/Colab GPU
  logging_steps=200,                       # Fewer logs, faster training
  save_steps=9999999,                      # Disable save checkpoints for now
  eval_steps=9999999,                      # Disable eval checkpoints for now
  save_total_limit=1,
  push_to_hub=False,
  report_to="none",
  gradient_checkpointing=False,             # Helps memory but slightly slows training
  dataloader_num_workers=4,
  gradient_checkpointing_kwargs={'use_reentrant': False},
)

# Standard data collator for autoregressive LM (e.g., Phi)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # No masked LM for causal models
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_tok,
  eval_dataset=eval_tok,
  tokenizer=tokenizer,
  data_collator=data_collator
)

trainer.train()
trainer.save_model(f"{DATA_DIR}/lora_finetuned_Phi3")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
!nvidia-smi