<a href="https://colab.research.google.com/github/EonTechie/semeval-context-tree-modular/blob/main/notebooks/00_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup: Clone Repository and Install Dependencies

This notebook sets up the environment for running experiments.

## Steps:
1. Clone repository from GitHub
2. Install dependencies
3. Mount Google Drive (optional, for large data storage)
4. Verify installation


In [None]:
# Clone repository (with retry and ZIP fallback)
import shutil
import os
import subprocess
import time
import requests
import zipfile
from pathlib import Path

repo_dir = '/content/semeval-context-tree-modular'
repo_url = 'https://github.com/EonTechie/semeval-context-tree-modular.git'
zip_url = 'https://github.com/EonTechie/semeval-context-tree-modular/archive/refs/heads/main.zip'

# Remove if exists
if os.path.exists(repo_dir):
    shutil.rmtree(repo_dir)
    print("‚úÖ Removed existing directory")

# Try git clone with retry
max_retries = 2
clone_success = False

for attempt in range(max_retries):
    try:
        print(f"üîÑ Attempting to clone repository (attempt {attempt + 1}/{max_retries})...")
        result = subprocess.run(
            ['git', 'clone', repo_url],
            cwd='/content',
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0:
            print("‚úÖ Repository cloned successfully!")
            clone_success = True
            break
        else:
            print(f"‚ùå Clone failed: {result.stderr.strip()}")
            if attempt < max_retries - 1:
                print(f"‚è≥ Waiting 3 seconds before retry...")
                time.sleep(3)
    except subprocess.TimeoutExpired:
        print(f"‚ùå Clone timed out (attempt {attempt + 1})")
        if attempt < max_retries - 1:
            time.sleep(3)
    except Exception as e:
        print(f"‚ùå Error: {e}")
        if attempt < max_retries - 1:
            time.sleep(3)

# Fallback: Download as ZIP using requests
if not clone_success:
    print("\nüì• Git clone failed. Downloading repository as ZIP...")
    zip_path = '/tmp/repo.zip'
    
    try:
        # Download ZIP file
        print(f"   Downloading from: {zip_url}")
        response = requests.get(zip_url, stream=True, timeout=60)
        response.raise_for_status()
        
        # Save to file
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        print(f"   ‚úÖ Downloaded ({os.path.getsize(zip_path) / 1024 / 1024:.2f} MB)")
        
        # Extract ZIP
        print("   üì¶ Extracting...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall('/content')
        
        # Rename to expected directory name
        extracted_dir = '/content/semeval-context-tree-modular-main'
        if os.path.exists(extracted_dir):
            os.rename(extracted_dir, repo_dir)
            print("   ‚úÖ Extracted and renamed")
        
        # Clean up
        os.remove(zip_path)
        print("   ‚úÖ Cleaned up temporary files")
        
    except Exception as e:
        print(f"   ‚ùå ZIP download/extraction failed: {e}")
        raise

# Change directory
if os.path.exists(repo_dir):
    %cd semeval-context-tree-modular
    print(f"\n‚úÖ Setup complete! Current directory: {os.getcwd()}")
else:
    raise FileNotFoundError(f"Repository directory not found: {repo_dir}")


Cloning into 'semeval-context-tree-modular'...
fatal: could not read Username for 'https://github.com': No such device or address
[Errno 2] No such file or directory: 'semeval-context-tree-modular'
/content


In [7]:
# Install dependencies
!pip install -r requirements.txt


[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

In [8]:
# Mount Google Drive (optional - for large data storage)
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Setup paths
import sys
from pathlib import Path

BASE_PATH = Path('/content/semeval-context-tree-modular')
DATA_PATH = Path('/content/drive/MyDrive/semeval_data')  # For large files

# Add to Python path
sys.path.insert(0, str(BASE_PATH))

# Create data directory if it doesn't exist
DATA_PATH.mkdir(parents=True, exist_ok=True)

print(f"‚úÖ Base path: {BASE_PATH}")
print(f"‚úÖ Data path: {DATA_PATH}")
print(f"‚úÖ Python path updated")


‚úÖ Base path: /content/semeval-context-tree-modular
‚úÖ Data path: /content/drive/MyDrive/semeval_data
‚úÖ Python path updated


In [6]:
# Verify imports
from src.data.loader import load_dataset
from src.data.splitter import split_dataset
from src.features.extraction import extract_batch_features_v2, featurize_hf_dataset_in_batches_v2
from src.storage.manager import StorageManager

print("‚úÖ All imports successful!")


ModuleNotFoundError: No module named 'src'

In [None]:
# Initialize storage manager
storage = StorageManager(
    base_path=str(BASE_PATH),
    data_path=str(DATA_PATH),
    github_path=str(BASE_PATH)
)

print("‚úÖ Storage manager initialized!")
print(f"   Code/Metadata: {BASE_PATH}")
print(f"   Large Data: {DATA_PATH}")
