In [None]:
# ===============================
# Setup: Install Packages
# ===============================
!pip install -q \
  "transformers>=4.41,<5" \
  "datasets==2.19.1" \
  "peft==0.10.0" \
  "accelerate>=0.34.2" \
  "bitsandbytes>=0.43.3" \
  "evaluate>=0.4.2" \
  "rouge_score>=0.1.2" \
  "scikit-learn" \
  "openpyxl" \
  "pandas"

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.3.1 which is incompatible.[0m

In [None]:
# Install a Triton release matched to the Torch version found in the runtime.
import subprocess
import sys

import torch

# Torch "major.minor" as a string, e.g. "2.4" for torch 2.4.1+cu121.
mm = ".".join(torch.__version__.split(".")[:2])

# Triton release to install for each known Torch minor version.
triton_by_torch = {"2.5":"3.2.0","2.4":"3.0.0","2.3":"2.3.1","2.2":"2.2.0"}

# No silent fallback: for a Torch release that is not in the table we leave the
# Triton already present in the environment untouched, instead of forcing a
# fixed version that may not match what that Torch build expects.
target = triton_by_torch.get(mm)

if target is None:
    print(f"Torch {torch.__version__} → no pinned Triton for Torch {mm}; keeping the installed Triton")
else:
    print(f"Torch {torch.__version__} → Installing Triton {target}")
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"triton=={target}"])

Torch 2.8.0+cu126 → Installing Triton 3.2.0


0

In [None]:
# ===============================
# Import packages & login
# ===============================
# Mount Google Drive first so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os
import random
from getpass import getpass

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer, set_seed
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from huggingface_hub import login

# --------------- Hugging Face token ---------------
# The token is never written into the notebook. A real HF_TOKEN already present
# in the environment is reused; otherwise it is requested interactively with
# hidden input. The placeholder value is treated the same as "not set".
hf_token = os.environ.get("HF_TOKEN", "").strip()
if not hf_token or hf_token == "YOUR_TOKEN_HERE":
    hf_token = getpass("Hugging Face token: ").strip()
if not hf_token:
    raise ValueError("A Hugging Face token is required to log in.")

# Keep HF_TOKEN exported so code that reads the variable keeps working.
os.environ["HF_TOKEN"] = hf_token
login(hf_token)

# --------------- Reproducibility ---------------
# Fixed seed passed to transformers' set_seed helper.
set_seed(42)

Mounted at /content/drive


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# =========================================================
# Split cleaned_data_FA_Humans.xlsx into Train | Val | Test
# =========================================================
import os
import random

import pandas as pd

# ---------- Locations on the mounted Drive ----------
BASE_PATH = r"/content/drive/My Drive/associations-ANLP"

XLSX_PATH = os.path.join(BASE_PATH, r"data/swow_words_associations_dataset/cleaned_data_FA_Humans.xlsx")

TRAIN_XLSX_PATH = os.path.join(BASE_PATH, r"data/swow_words_associations_dataset/train.xlsx")
VAL_XLSX_PATH   = os.path.join(BASE_PATH, r"data/swow_words_associations_dataset/val.xlsx")
TEST_XLSX_PATH  = os.path.join(BASE_PATH, r"data/swow_words_associations_dataset/test.xlsx")

# All three outputs live in the same folder; create it before writing.
os.makedirs(os.path.dirname(TRAIN_XLSX_PATH), exist_ok=True)

# ---------- Load Excel ----------
df = pd.read_excel(XLSX_PATH)

SEED = 42

# ---------- Cue-level random split: roughly 80 % / 10 % / 10 % ----------
# Splitting is done on cues, not rows, so all rows of a cue end up in one file.
all_cues = df["cue"].unique().tolist()

# "sea" is pinned to Train when it occurs: it is taken out of the pool that
# gets shuffled and is added back to the Train set below.
must_have_train = {"sea"}.intersection(all_cues)
unique_cues = [cue for cue in all_cues if cue not in must_have_train]

# Seeded, private RNG so the shuffle does not depend on global random state.
rng = random.Random(SEED)
rng.shuffle(unique_cues)

# Split sizes are computed over all cues, including the pinned one.
n_total = len(must_have_train) + len(unique_cues)
n_train = int(n_total * 0.8)
n_val   = int(n_total * 0.1)
val_end = n_train + n_val

# Train takes the first n_train shuffled cues plus the pinned cue,
# Val the next n_val, and Test whatever remains.
train_cues = must_have_train.union(unique_cues[:n_train])
val_cues   = set(unique_cues[n_train:val_end])
test_cues  = set(unique_cues[val_end:])

# Select the rows belonging to each cue set.
train_df, val_df, test_df = (
    df[df["cue"].isin(cue_set)] for cue_set in (train_cues, val_cues, test_cues)
)

# ---------- Save to Excel ----------
for split_df, split_path in (
    (train_df, TRAIN_XLSX_PATH),
    (val_df, VAL_XLSX_PATH),
    (test_df, TEST_XLSX_PATH),
):
    split_df.to_excel(split_path, index=False)

# ---------- Summary ----------
print(f"Cues   → Train:{len(train_cues)}, Val:{len(val_cues)}, Test:{len(test_cues)}")
print(f"Rows   → Train:{len(train_df)}, Val:{len(val_df)}, Test:{len(test_df)}")
# must_have_train is non-empty exactly when "sea" occurs in the cue column.
if must_have_train:
    print("Cue 'sea' ensured in Train split.")
else:
    print("Cue 'sea' not found in dataset.")
print("All Excel files saved.")

Cues   → Train:7194, Val:899, Test:899
Rows   → Train:575520, Val:71920, Test:71920
✅  Cue 'sea' ensured in Train split.
✅  All Excel files saved.


In [None]:
# =========================================================
# Count cues/rows in Train & Val (Excel)
# =========================================================
import pandas as pd

# Read the two split files back from disk.
df_train, df_val = (pd.read_excel(path) for path in (TRAIN_XLSX_PATH, VAL_XLSX_PATH))

# Distinct, non-missing cues per split.
train_cues = set(pd.unique(df_train["cue"].dropna()))
val_cues   = set(pd.unique(df_val["cue"].dropna()))

# Report cue and row counts for each split.
n_train_rows = len(df_train)
n_val_rows = len(df_val)
print(f"Train Cues: {len(train_cues)}, Rows: {n_train_rows}")
print(f"Val Cues:   {len(val_cues)},   Rows: {n_val_rows}")

Train Cues: 7194, Rows: 575520
Val Cues:   899,   Rows: 71920


In [None]:
# =========================================================
# Verify no cue overlap between Train & Val (Excel)
# =========================================================

# Cues that appear in both the Train and the Val cue sets.
overlap = train_cues.intersection(val_cues)

if not overlap:
    print("All cues are unique across the two files.")
else:
    # List every shared cue in sorted order, one per line.
    print(f"Found {len(overlap)} cues in both files:")
    for cue in sorted(overlap):
        print("   ", cue)

✅ All cues are unique across the two files.


In [None]:
# Disconnect the runtime
# Calls unassign() from Colab's runtime module.
# NOTE(review): presumably this discards the session's in-memory state, so any
# cell placed after this one would need a fresh runtime; confirm before relying
# on "Run all" past this point.
from google.colab import runtime
runtime.unassign()