# Environment Setup

In [None]:
# Initial setup for local GPU environment
import subprocess
import sys

def install_package(package):
    """Install a package with pip into the interpreter running this kernel.

    Runs ``<sys.executable> -m pip install <package>`` so the package is
    installed for the same Python that executes the notebook.

    Args:
        package: Requirement string handed to ``pip install`` unchanged
            (for example ``"numpy"`` or ``"transformers[torch]"``).

    Returns:
        bool: True when pip exits with status 0, False when it exits with a
        non-zero status. A failure is reported on stdout, not raised, so a
        caller can keep going with the remaining packages.
    """
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to install {package}: {e}")
        return False
    # Only reached when check_call did not raise, i.e. pip succeeded.
    print(f"✓ {package} installed successfully")
    return True

# Local import: this cell is the only place that needs importlib.
import importlib

# List of required packages (pip requirement strings)
required_packages = [
    "torch",
    "transformers[torch]",
    "datasets",
    "scikit-learn",
    "numpy",
    "tqdm"
]

# pip requirement -> (importable module name, label for status lines,
#                     whether the status line reports module.__version__).
# The pip name and the import name differ for some entries
# (e.g. "scikit-learn" is imported as "sklearn").
package_probes = {
    "torch": ("torch", "torch", True),
    "transformers[torch]": ("transformers", "transformers", True),
    "datasets": ("datasets", "datasets", False),
    "scikit-learn": ("sklearn", "scikit-learn", True),
    "numpy": ("numpy", "numpy", True),
    "tqdm": ("tqdm", "tqdm", False),
}

print("Installing required packages for local GPU environment...")
print("=" * 50)

# Requirements that could not be imported even after an install attempt.
failed_packages = []

for package in required_packages:
    module_name, label, show_version = package_probes[package]
    try:
        # Try to import first; bind the module under its own name so later
        # cells can use it (e.g. `torch`), just like a plain `import` would.
        module = importlib.import_module(module_name)
        globals()[module_name] = module
        if show_version:
            print(f"✓ {label} already installed (version: {module.__version__})")
        else:
            print(f"✓ {label} already installed")
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)
        # Verify the install by importing again instead of trusting pip's
        # exit status; this also binds the module name for later cells.
        importlib.invalidate_caches()
        try:
            globals()[module_name] = importlib.import_module(module_name)
        except ImportError as exc:
            failed_packages.append(package)
            print(f"✗ {package} is still not importable after installation: {exc}")

print("=" * 50)
if failed_packages:
    # Do not claim success when a dependency is missing.
    print(f"Setup incomplete! Could not import: {', '.join(failed_packages)}")
else:
    print("Setup complete! Ready to proceed with GPU-accelerated training.")

Installing required packages for local GPU environment...
✓ torch already installed (version: 2.7.1+cu128)


  from .autonotebook import tqdm as notebook_tqdm


✓ transformers already installed (version: 4.54.1)
✓ datasets already installed
✓ scikit-learn already installed (version: 1.7.1)
✓ numpy already installed (version: 2.2.6)
✓ tqdm already installed
Setup complete! Ready to proceed with GPU-accelerated training.


In [2]:
# Import here so this cell runs on a fresh kernel; previously it relied on
# `torch` having been bound as a side effect of the setup cell.
import torch

# Whether PyTorch can use a CUDA device, and the CUDA version this PyTorch
# build reports.
print(torch.cuda.is_available())
print(torch.version.cuda)

True
12.8


In [3]:
import os

import torch

# Environment variables searched for a Hugging Face access token, in priority
# order. 'HUGGINGFACE_HUB_TOKEN' is the name this notebook has always used;
# 'HF_TOKEN' is the name documented by huggingface_hub.
HF_TOKEN_ENV_VARS = ('HUGGINGFACE_HUB_TOKEN', 'HF_TOKEN')


def resolve_hf_token(environ=None):
    """Return the first non-empty Hugging Face token found in ``environ``.

    Args:
        environ: Mapping of environment variable names to values. Defaults to
            ``os.environ`` when omitted.

    Returns:
        The token string from the first variable in ``HF_TOKEN_ENV_VARS`` that
        holds a non-empty value, or ``None`` when none of them is set.
    """
    if environ is None:
        environ = os.environ
    for var_name in HF_TOKEN_ENV_VARS:
        candidate = environ.get(var_name)
        if candidate:
            return candidate
    return None


# SECURITY: never hardcode the token in the notebook. A token that was ever
# committed here must be treated as leaked and revoked in the Hugging Face
# account settings. Export the token in the shell before starting Jupyter.
hf_token = resolve_hf_token()
if hf_token:
    # Mirror the value under both names so code reading either one sees it.
    os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token
    if not os.environ.get('HF_TOKEN'):
        os.environ['HF_TOKEN'] = hf_token
else:
    print("Warning: no Hugging Face token found. Set HF_TOKEN (or HUGGINGFACE_HUB_TOKEN) "
          "in your environment before starting Jupyter if you need authenticated Hub access.")

# Check for GPU availability and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    # total_memory is in bytes; 1024**3 converts it to GiB for display.
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
else:
    print("CUDA is not available. Using CPU.")
    print("Warning: BERT embedding extraction will be much slower on CPU!")

Using device: cuda
GPU: NVIDIA GeForce GTX 1070
CUDA Version: 12.8
PyTorch Version: 2.7.1+cu128
Available GPU memory: 8.0 GB


# Data Setup

In [7]:
from datasets import load_dataset

# Identifier of the dataset to load (presumably a Hugging Face Hub repo id).
VUA20_REPO_ID = "CreativeLang/vua20_metaphor"

# The export cell further down indexes this object by 'train' and 'test'.
dataset = load_dataset(VUA20_REPO_ID)

In [8]:
import os

# All VUA exports live under this folder; it is created on first run.
dataset_dir = 'vua_dataset'
os.makedirs(dataset_dir, exist_ok=True)

# One JSON export per split, both placed inside dataset_dir.
train_file, test_file = (
    os.path.join(dataset_dir, f'vua20_metaphor_{split}.json')
    for split in ('train', 'test')
)

print("🗂️  Checking VUA dataset files in organized structure...")
print(f"📁 Target directory: {dataset_dir}/")

# The export runs unless BOTH files are already on disk.
exports_missing = not (os.path.exists(train_file) and os.path.exists(test_file))

if exports_missing:
    print(f"💾 Saving dataset files to {dataset_dir}/...")
    dataset['train'].to_json(train_file)
    dataset['test'].to_json(test_file)
    print(f"✅ Saved {os.path.basename(train_file)} to {dataset_dir}/")
    print(f"✅ Saved {os.path.basename(test_file)} to {dataset_dir}/")
else:
    print(f"✅ Dataset files already exist in {dataset_dir}/:")
    print(f"   - {os.path.basename(train_file)}")
    print(f"   - {os.path.basename(test_file)}")

    # Report on-disk sizes in MB (bytes / 1024 / 1024).
    bytes_per_mb = 1024 * 1024
    train_size = os.path.getsize(train_file) / bytes_per_mb
    test_size = os.path.getsize(test_file) / bytes_per_mb
    print(f"📊 File sizes: Train={train_size:.1f}MB, Test={test_size:.1f}MB")
    print("⏩ Skipping dataset save operation - using existing files.")

🗂️  Checking VUA dataset files in organized structure...
📁 Target directory: vua_dataset/
💾 Saving dataset files to vua_dataset/...


Creating json from Arrow format: 100%|██████████| 161/161 [00:00<00:00, 353.85ba/s]
Creating json from Arrow format: 100%|██████████| 23/23 [00:00<00:00, 383.33ba/s]

✅ Saved vua20_metaphor_train.json to vua_dataset/
✅ Saved vua20_metaphor_test.json to vua_dataset/



