In [1]:
# migrate_to_chromadb.py

import os
import sqlite3

import chromadb
import pandas as pd
from chromadb.config import Settings

# Paths
# The defaults are the original machine-specific locations. Set the
# OSPO_DB_PATH / OSPO_CHROMA_DIR environment variables to run the migration
# against a different database or output directory without editing this cell.
DB_PATH = os.environ.get("OSPO_DB_PATH", r"D:\OSPO\researchers.db")
CHROMA_DIR = os.environ.get("OSPO_CHROMA_DIR", r"D:\OSPO\ChromaDB")

# Setup ChromaDB client (local disk persistence under CHROMA_DIR)
chroma_client = chromadb.PersistentClient(path=CHROMA_DIR)

# Connect to SQLite. This module-level connection is shared by the migrate_*
# functions below and is closed by the main block once they have run.
conn = sqlite3.connect(DB_PATH)

# 1. Migrate 'works' table (summaries of full papers)
def migrate_works():
    """Copy summarized rows of the SQLite ``works`` table into ChromaDB.

    Reads every row with ``summary_status = 'summarized'``, ``progress = 1``
    and a non-NULL ``summary`` through the module-level ``conn`` and writes
    it to the ``works_collection`` collection of the module-level
    ``chroma_client``:

    * id       -> ``"work_<id>"``
    * document -> the row's ``summary``
    * metadata -> ``{"file_name": ...}`` (``""`` when the column is NULL)

    Rows are written in batches of 1000 with ``upsert``, so running the
    migration again refreshes existing entries instead of leaving stale ones.

    Returns:
        None. Progress is reported on stdout.
    """
    print("Migrating 'works' table to ChromaDB...")

    # full_text is intentionally not selected: it is never written to
    # ChromaDB, so loading it would only cost memory. Rows without a summary
    # are filtered out here because a document must be a string.
    df = pd.read_sql_query("""
        SELECT id, file_name, summary
        FROM works
        WHERE summary_status = 'summarized' AND progress = 1
          AND summary IS NOT NULL
    """, conn)

    if df.empty:
        print("No data found in 'works'. Skipping.")
        return

    collection = chroma_client.get_or_create_collection(name="works_collection")

    # Batch upsert
    batch_size = 1000
    for start_idx in range(0, len(df), batch_size):
        batch = df.iloc[start_idx:start_idx + batch_size]

        # Iterate the columns directly instead of calling iterrows() three
        # times per batch.
        ids = [f"work_{work_id}" for work_id in batch["id"]]
        documents = list(batch["summary"])
        # A NULL file_name arrives as None; store "" instead, matching how
        # migrate_research_info treats missing text.
        metadatas = [{"file_name": file_name or ""} for file_name in batch["file_name"]]

        collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print(f"✅ Migrated batch {start_idx} → {start_idx+len(batch)}")

    print(f"✅ Migrated {len(df)} works into ChromaDB.")


def migrate_research_info():
    """Copy every row of the SQLite ``research_info`` table into ChromaDB.

    Reads all rows through the module-level ``conn`` and writes them to the
    ``research_info_collection`` collection of the module-level
    ``chroma_client``:

    * id       -> ``"research_<id>"``
    * document -> ``"Title: <work_title>. Authors: <authors>. Info: <info>."``
    * metadata -> ``researcher_name``, ``work_title`` and ``authors``

    Missing (falsy) text values are replaced by ``""`` in both the document
    and the metadata. Rows are written in batches of 1000 with ``upsert``, so
    running the migration again refreshes existing entries.

    Returns:
        None. Progress is reported on stdout.
    """
    print("Migrating 'research_info' table to ChromaDB...")

    df = pd.read_sql_query("""
        SELECT id, researcher_name, work_title, authors, info
        FROM research_info
    """, conn)

    if df.empty:
        print("No data found in 'research_info'. Skipping.")
        return

    collection = chroma_client.get_or_create_collection(name="research_info_collection")

    # Batch upsert
    batch_size = 1000
    for start_idx in range(0, len(df), batch_size):
        batch = df.iloc[start_idx:start_idx + batch_size]

        # Normalise each text column once (None/empty -> "") and reuse the
        # result for both the document text and the metadata, instead of
        # walking the batch with iterrows() three times.
        researcher_names = [value or "" for value in batch["researcher_name"]]
        work_titles = [value or "" for value in batch["work_title"]]
        author_lists = [value or "" for value in batch["authors"]]
        infos = [value or "" for value in batch["info"]]

        ids = [f"research_{research_id}" for research_id in batch["id"]]
        documents = [
            f"Title: {title}. Authors: {authors}. Info: {info}."
            for title, authors, info in zip(work_titles, author_lists, infos)
        ]
        metadatas = [
            {
                "researcher_name": name,
                "work_title": title,
                "authors": authors,
            }
            for name, title, authors in zip(researcher_names, work_titles, author_lists)
        ]

        collection.upsert(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print(f"✅ Migrated batch {start_idx} → {start_idx+len(batch)}")

    print(f"✅ Migrated {len(df)} research infos into ChromaDB.")


# Main
if __name__ == "__main__":
    # Run both migrations, then release the SQLite connection. The close sits
    # in `finally` so that a failure in either migration does not leave the
    # connection open in the still-running kernel.
    try:
        migrate_works()
        migrate_research_info()
    finally:
        conn.close()
    # Only reached when both migrations finished without raising.
    print("🎯 Migration to ChromaDB complete!")


Migrating 'works' table to ChromaDB...
✅ Migrated batch 0 → 1000
✅ Migrated batch 1000 → 2000
✅ Migrated batch 2000 → 3000
✅ Migrated batch 3000 → 4000
✅ Migrated batch 4000 → 5000
✅ Migrated batch 5000 → 6000
✅ Migrated batch 6000 → 7000
✅ Migrated batch 7000 → 8000
✅ Migrated batch 8000 → 9000
✅ Migrated batch 9000 → 10000
✅ Migrated batch 10000 → 11000
✅ Migrated batch 11000 → 11933
✅ Migrated 11933 works into ChromaDB.
Migrating 'research_info' table to ChromaDB...
✅ Migrated batch 0 → 1000
✅ Migrated batch 1000 → 2000
✅ Migrated batch 2000 → 3000
✅ Migrated batch 3000 → 4000
✅ Migrated batch 4000 → 5000
✅ Migrated batch 5000 → 6000
✅ Migrated batch 6000 → 7000
✅ Migrated batch 7000 → 8000
✅ Migrated batch 8000 → 9000
✅ Migrated batch 9000 → 10000
✅ Migrated batch 10000 → 11000
✅ Migrated batch 11000 → 12000
✅ Migrated batch 12000 → 13000
✅ Migrated batch 13000 → 14000
✅ Migrated batch 14000 → 15000
✅ Migrated batch 15000 → 16000
✅ Migrated batch 16000 → 17000
✅ Migrated batch 1700

In [None]:
import os

from huggingface_hub import snapshot_download

# Download the Llama 3.2 1B Instruct repository into a local directory.
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook, so it cannot leak when the notebook is
# shared or committed. When HF_TOKEN is unset, None is passed and
# huggingface_hub uses its own default credential lookup.
# `token` is the current keyword for this argument; `use_auth_token` is the
# legacy spelling.
snapshot_download(
    repo_id="meta-llama/Llama-3.2-1B-Instruct",
    token=os.environ.get("HF_TOKEN"),
    local_dir="D:\\OSPO\\llama321b\\"
)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 13 files: 100%|██████████| 13/13 [01:45<00:00,  8.11s/it]


'D:\\OSPO\\llama321b'