# Fork Gemma-2B Tamil Base Model

**Purpose:** Copy the base model to our HuggingFace account so we're not dependent on the original author's alpha release.

**Run this once, then use the forked model for all future training.**

In [None]:
# Install dependencies (quietly): huggingface_hub provides the download/upload
# helpers imported below, transformers is used for the tokenizer check in Step 3.
!pip install -q huggingface_hub transformers

In [None]:
from huggingface_hub import snapshot_download, HfApi, login
import os

# Configuration
# LOCAL_DIR    - directory the snapshot is downloaded into (and later removed)
# TARGET_MODEL - repo id of the fork that gets created and uploaded to
# SOURCE_MODEL - repo id of the upstream model being copied
LOCAL_DIR = "./gemma-2b-tamil-base"
TARGET_MODEL = "CryptoYogi/gemma-2b-tamil-base"
SOURCE_MODEL = "abhinand/gemma-2b-it-tamil-v0.1-alpha"

# Echo both repo ids so the direction of the copy is visible in the cell output
print(f"Source: {SOURCE_MODEL}", f"Target: {TARGET_MODEL}", sep="\n")

In [None]:
# Step 1: Download the original model
# Fetches every file of SOURCE_MODEL into LOCAL_DIR and keeps the returned
# path in `local_path`, which the upload step reads.
print("=" * 60)
print("STEP 1: Downloading original model (~5GB)...")
print("=" * 60)

local_path = snapshot_download(
    repo_id=SOURCE_MODEL,
    local_dir=LOCAL_DIR,  # Removed deprecated local_dir_use_symlinks parameter
)

print(f"\n✅ Downloaded to: {local_path}")

# Show what was downloaded
print("\nFiles downloaded:")
for name in sorted(os.listdir(local_path)):
    entry = os.path.join(local_path, name)
    # getsize() on a directory returns the size of the directory entry, not of
    # its contents, so sub-directories (e.g. a client-side metadata folder)
    # would show a meaningless number. List regular files only.
    if not os.path.isfile(entry):
        continue
    size_mb = os.path.getsize(entry) / 1e6
    print(f"  {name}: {size_mb:.1f} MB")

In [None]:
# Step 1: Download the original model
# NOTE(review): this cell duplicates the Step 1 cell directly above it and can
# be deleted. It is kept in sync with that cell here: the deprecated
# `local_dir_use_symlinks` argument is dropped so both cells make the same call.
# Re-running the download is presumably cheap because files already present in
# LOCAL_DIR are not fetched again — confirm against the installed hub version.
print("=" * 60)
print("STEP 1: Downloading original model (~5GB)...")
print("=" * 60)

local_path = snapshot_download(
    repo_id=SOURCE_MODEL,
    local_dir=LOCAL_DIR,
)

print(f"\n✅ Downloaded to: {local_path}")

# Show what was downloaded
print("\nFiles downloaded:")
for name in sorted(os.listdir(local_path)):
    entry = os.path.join(local_path, name)
    # getsize() on a directory reports the directory entry itself, not its
    # contents, so only regular files are listed.
    if not os.path.isfile(entry):
        continue
    size_mb = os.path.getsize(entry) / 1e6
    print(f"  {name}: {size_mb:.1f} MB")

In [None]:
# Step 2: Create repo and upload to your account
# Creates TARGET_MODEL (if needed) and pushes the contents of `local_path`,
# which is set by the Step 1 download cell.
# NOTE(review): both API calls need write access to the target account. No
# login call is visible in this notebook, so this presumably relies on a token
# that is already configured (e.g. HF_TOKEN or an earlier `login()`) — confirm.
print("=" * 60)
print("STEP 2: Uploading to your HuggingFace account...")
print("=" * 60)

api = HfApi()

# Create the repository; exist_ok=True keeps a re-run of this cell from failing
# when the repo is already there.
api.create_repo(
    repo_id=TARGET_MODEL,
    exist_ok=True,
    private=False,  # Make it public so Kaggle can access
)
print(f"Repository created: {TARGET_MODEL}")

# Upload all files
print("\nUploading files (this may take a few minutes)...")
api.upload_folder(
    folder_path=local_path,
    repo_id=TARGET_MODEL,
    # Built from SOURCE_MODEL so the message cannot drift from the config cell
    commit_message=f"Fork of {SOURCE_MODEL} for VAZHI project",
)

print("\n✅ Model uploaded successfully!")
print(f"\nView at: https://huggingface.co/{TARGET_MODEL}")

In [None]:
# Step 3: Verify the upload (remote file listing + tokenizer load)
print("=" * 60)
print("STEP 3: Verifying upload...")
print("=" * 60)

from transformers import AutoTokenizer, AutoModelForCausalLM

# Loading the tokenizer alone says nothing about the weight files, and the
# next cell deletes the local copy. So first check that every file still
# present in LOCAL_DIR is also listed in the fork.
remote_files = set(HfApi().list_repo_files(TARGET_MODEL))
if os.path.isdir(LOCAL_DIR):
    local_files = set()
    for root, _dirs, names in os.walk(LOCAL_DIR):
        for name in names:
            rel = os.path.relpath(os.path.join(root, name), LOCAL_DIR)
            # Repo paths use forward slashes regardless of the local OS
            local_files.add(rel.replace(os.sep, "/"))
    # Presumably these folders hold only the hub client's local bookkeeping and
    # are not uploaded — confirm against the installed huggingface_hub version.
    local_files = {
        p for p in local_files
        if not p.startswith((".cache/", ".huggingface/"))
    }
    missing = sorted(local_files - remote_files)
    if missing:
        raise RuntimeError(
            f"Upload incomplete, missing from {TARGET_MODEL}: {missing}"
        )
    print(f"✅ All {len(local_files)} local files found in {TARGET_MODEL}")
else:
    # Local copy already cleaned up: nothing to compare against, just report
    print(f"Local copy not found; {TARGET_MODEL} contains {len(remote_files)} files")

# Load tokenizer from our fork
tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
print(f"✅ Tokenizer loaded from {TARGET_MODEL}")
print(f"   Vocab size: {tokenizer.vocab_size}")
print(f"   PAD token: {tokenizer.pad_token}")
print(f"   EOS token: {tokenizer.eos_token}")

print("\n" + "=" * 60)
print("✅ MODEL FORK COMPLETE!")
print("=" * 60)
print(f"\nYou can now use: {TARGET_MODEL}")
print("\nIn training notebook, set:")
print('  USE_FORKED = True')

In [None]:
# Cleanup - delete the downloaded snapshot directory to free disk space
import shutil

# Only attempt the removal (and report it) when the path is actually there
local_copy_exists = os.path.exists(LOCAL_DIR)
if local_copy_exists:
    shutil.rmtree(LOCAL_DIR)
    print(f"Cleaned up local copy: {LOCAL_DIR}")