# KG²RAG Knowledge Graph Extraction on Google Colab

This notebook extracts knowledge graphs from the HotpotQA dataset.

**Time** (per assigned part, see Step 7): ~12-20 hours with GPU, ~35-55 hours with CPU

**Prerequisites**:
1. Upload `KG2RAG-main` folder to Google Drive (including the split files under `data/hotpotqa/splits/`, which Step 7 requires)
2. Enable GPU: Runtime → Change runtime type → GPU (T4)

## Step 1: Install Dependencies

In [None]:
print("=" * 80)
print("Installing dependencies...")
print("=" * 80)

!pip install llama-index==0.12.20
!pip install llama-index-llms-ollama llama-index-embeddings-ollama
!pip install networkx>=3.0 pandas>=2.1.0 tqdm>=4.66.1 ujson==1.35

print("\n✓ Dependencies installed!")

## Step 2: Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so the project folder is reachable.
print("=" * 80)
print("Mounting Google Drive...")
print("=" * 80)

from google.colab import drive

# Show the authorization hint before mounting; printed after the mount call
# it would only appear once authorization had already been handled.
print("If prompted, click the link to authorize access.")
drive.mount('/content/drive')

print("\n✓ Drive mounted!")

## Step 3: Setup Project Path

In [None]:
print("=" * 80)
print("Setting up project...")
print("=" * 80)

import os
import sys

# Update this path if your project is in a different location
# Common locations:
#   '/content/drive/MyDrive/KG2RAG-main'
#   '/content/drive/MyDrive/685-Project/KG2RAG-main'
project_path = '/content/drive/MyDrive/685-Project/KG2RAG-main'

# If the above doesn't work, try:
# project_path = '/content/drive/MyDrive/KG2RAG-main'

# Check if project exists
if not os.path.exists(project_path):
    print(f"✗ Project not found at: {project_path}")
    print("\nAvailable folders in Drive:")
    !ls -la /content/drive/MyDrive/ | head -20
    print("\nPlease update project_path variable above with correct path")
    sys.exit(1)

os.chdir(project_path)
print(f"✓ Working directory: {os.getcwd()}")

# Verify dataset exists
data_path = 'data/hotpotqa/hotpot_dev_distractor_v1.json'
if os.path.exists(data_path):
    import json
    with open(data_path) as f:
        data = json.load(f)
    print(f"✓ Dataset found: {len(data)} questions")
else:
    print(f"✗ Dataset not found: {data_path}")

## Step 4: Install Ollama

In [None]:
print("=" * 80)
print("Installing Ollama...")
print("=" * 80)

!curl -fsSL https://ollama.com/install.sh | sh

print("\nStarting Ollama service in background...")
import subprocess
import os

# Start Ollama in background
subprocess.Popen(['ollama', 'serve'], 
                 stdout=subprocess.DEVNULL, 
                 stderr=subprocess.DEVNULL)

# Wait a moment for Ollama to start
import time
time.sleep(3)

# Verify Ollama is running
try:
    result = subprocess.run(['ollama', 'list'], 
                          capture_output=True, 
                          text=True, 
                          timeout=5)
    if result.returncode == 0:
        print("✓ Ollama is running!")
    else:
        print("⚠️  Ollama installed but may need a moment to start")
        print("   If you see errors, wait 10 seconds and run the next cell")
except Exception as e:
    print(f"⚠️  Checking Ollama status: {e}")
    print("   Ollama should be starting in background...")
    print("   Wait 10 seconds before running the next cell")

## Step 5: Download LLM Model

In [None]:
print("=" * 80)
print("Verifying Ollama is running...")
print("=" * 80)

import subprocess
import time

# Check if Ollama is running
max_retries = 5
for i in range(max_retries):
    try:
        result = subprocess.run(['ollama', 'list'], 
                              capture_output=True, 
                              text=True, 
                              timeout=5)
        if result.returncode == 0:
            print("✓ Ollama is running!")
            break
    except Exception as e:
        if i < max_retries - 1:
            print(f"Waiting for Ollama to start... ({i+1}/{max_retries})")
            time.sleep(2)
        else:
            print("⚠️  Ollama may not be running. Trying to start it...")
            subprocess.Popen(['ollama', 'serve'], 
                           stdout=subprocess.DEVNULL, 
                           stderr=subprocess.DEVNULL)
            time.sleep(5)

print("\n" + "=" * 80)
print("Downloading llama3:8b model...")
print("=" * 80)
print("This will take 5-10 minutes and download ~4.7 GB")
print("Please be patient...\n")

!ollama pull llama3:8b

print("\n✓ Model downloaded!")

## Step 6: Verify Setup

In [None]:
print("=" * 80)
print("Verifying setup...")
print("=" * 80)

import os
import json

# Check project structure
print("Project structure:")
!ls -la

# Check dataset
data_path = 'data/hotpotqa/hotpot_dev_distractor_v1.json'
if os.path.exists(data_path):
    with open(data_path) as f:
        data = json.load(f)
    print(f"\n✓ Dataset: {len(data)} questions")
else:
    print(f"\n✗ Dataset not found")

# Check Ollama
print("\nOllama models:")
!ollama list

# Check output directory
kg_dir = 'data/hotpotqa/kgs/extract_subkgs'
os.makedirs(kg_dir, exist_ok=True)
print(f"\n✓ Output directory ready: {kg_dir}")

print("\n" + "=" * 80)
print("✅ Setup complete! Ready to run extraction.")
print("=" * 80)

## Step 7: Run KG Extraction (Your Assigned Part)

**⚠️ IMPORTANT**:
- Set your PART_NUMBER below (1, 2, 3, 4, or 5)
- Make sure split files are uploaded to `data/hotpotqa/splits/`
- This will take ~12-20 hours with GPU, ~35-55 hours with CPU
- Results are saved incrementally (safe to disconnect)
- Keep this tab open (or use Colab Pro for background execution)

In [None]:
# ============================================================================
# SET YOUR PART NUMBER HERE (1, 2, 3, 4, or 5)
# ============================================================================
PART_NUMBER = 1  # ⬅️ CHANGE THIS TO YOUR ASSIGNED PART!
# ============================================================================

print("=" * 80)
print(f"Starting KG extraction for Part {PART_NUMBER}...")
print("=" * 80)
print("⚠️  IMPORTANT:")
print(f"   - Processing Part {PART_NUMBER} of the dataset")
print("   - This will take ~12-20 hours with GPU, ~35-55 hours with CPU")
print("   - Results are saved incrementally (safe to disconnect)")
print("   - You can monitor progress below")
print("   - Keep this tab open (or Colab Pro for background execution)")
print()

import os
os.chdir('code/preprocess')

# Verify split file exists
split_file = f'../../data/hotpotqa/splits/hotpot_dev_distractor_v1_part{PART_NUMBER}.json'
if not os.path.exists(split_file):
    print(f"✗ Error: Split file not found: {split_file}")
    print("   Make sure you've uploaded the split files to data/hotpotqa/splits/")
    print("   Expected files:")
    for i in range(1, 6):
        print(f"     - hotpot_dev_distractor_v1_part{i}.json")
else:
    print(f"✓ Found split file: {split_file}")
    print()
    
    # Update the extraction script with the part number
    with open('hotpot_extraction_part.py', 'r') as f:
        content = f.read()
    
    # Replace PART_NUMBER in the script
    import re
    content = re.sub(r'PART_NUMBER = \d+', f'PART_NUMBER = {PART_NUMBER}', content)
    
    with open('hotpot_extraction_part.py', 'w') as f:
        f.write(content)
    
    print("Starting extraction...\n")
    
    # Run the extraction script
    !python hotpot_extraction_part.py
    
    print("\n" + "=" * 80)
    print(f"✅ Part {PART_NUMBER} extraction complete!")
    print("=" * 80)

## Step 8: Check Results

In [None]:
# Summarize what the extraction produced: count the JSON files in the
# output directory, list up to ten of them with sizes, and peek at one.
rule = "=" * 80
print(rule)
print("Checking results...")
print(rule)

import json
import os

# Use the same path as Step 3 (update if different)
project_path = '/content/drive/MyDrive/685-Project/KG2RAG-main'
os.chdir(project_path)
kg_dir = 'data/hotpotqa/kgs/extract_subkgs'

if not os.path.exists(kg_dir):
    print(f"✗ Output directory not found: {kg_dir}")
else:
    kg_files = [name for name in os.listdir(kg_dir) if name.endswith('.json')]
    print(f"✓ Extracted KGs for {len(kg_files)} entities")

    if not kg_files:
        print("⚠️  No KG files found. Extraction may not have completed.")
    else:
        print("\nSample files (first 10):")
        for name in sorted(kg_files)[:10]:
            size = os.path.getsize(os.path.join(kg_dir, name))
            print(f"  - {name} ({size:,} bytes)")

        # Inspect the first file in directory-listing order (not sorted order)
        first_name = kg_files[0]
        with open(os.path.join(kg_dir, first_name)) as handle:
            sample_kg = json.load(handle)
        print("\nSample KG structure:")
        print(f"  Entity: {first_name.replace('.json', '')}")
        print(f"  Sequences: {list(sample_kg.keys())[:5]}...")

print("\n" + rule)

## Step 9: Prepare Download

In [None]:
print("=" * 80)
print("Preparing download...")
print("=" * 80)

import os

# Use the same path as Step 3 (update if different)
project_path = '/content/drive/MyDrive/685-Project/KG2RAG-main'
os.chdir(project_path)

# Create zip file
!zip -r /content/kgs_extracted.zip data/hotpotqa/kgs/

# Check size
import os
zip_size = os.path.getsize('/content/kgs_extracted.zip') / (1024 * 1024)  # MB
print(f"✓ Archive created: /content/kgs_extracted.zip ({zip_size:.1f} MB)")

print("\n" + "=" * 80)
print("Ready to download!")
print("=" * 80)

## Step 10: Download Results

In [None]:
# Trigger a browser download of the archive built in Step 9, then print
# the follow-up instructions for using the results locally.
rule = "=" * 80
print(rule)
print("Downloading results...")
print(rule)

from google.colab import files

# Download the zip file
files.download('/content/kgs_extracted.zip')

# Closing banner and next-step instructions, one entry per printed line
closing_lines = (
    "\n" + rule,
    "✅ DOWNLOAD COMPLETE!",
    rule,
    "\nNext steps:",
    "1. The file 'kgs_extracted.zip' should be downloading now",
    "2. Extract it to your local project:",
    "   cd '/Users/devaanand/Desktop/Coding Stuff/685-NLP/KG2RAG-main'",
    "   unzip kgs_extracted.zip",
    "3. Verify KGs are in place:",
    "   ls -lh data/hotpotqa/kgs/extract_subkgs/ | head -10",
    "4. Run full KG²RAG:",
    "   cd code",
    "   python kg_rag_distractor.py --kg_dir ../data/hotpotqa/kgs/extract_subkgs",
)
for line in closing_lines:
    print(line)