<a href="https://colab.research.google.com/github/Alokik-29/Project_2/blob/main/Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
TEXT-TO-SQL RAG PROJECT - SIMPLE WITH LANGCHAIN
Comparing Llama vs SQLCoder using LangChain Framework

SIMPLE CODE + INDUSTRY STANDARD TOOLS
"""

# ============================================================================
# CELL 1: INSTALL PACKAGES
# ============================================================================

print("Installing packages...")
!pip install -q langchain==0.1.20 langchain-community langchain-huggingface
!pip install -q transformers accelerate bitsandbytes sentence-transformers
!pip install -q chromadb sqlalchemy faiss-cpu

print("✅ Installation complete!")
print("⚠️ If you see import errors, restart runtime: Runtime → Restart runtime\n")


CELL 1: INSTALLING DEPENDENCIES
Installing/upgrading packages...
Collecting langchain
  Downloading langchain-1.0.2-py3-none-any.whl.metadata (4.7 kB)
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain-core<2.0.0,>=1.0.0 (from langchain)
  Downloading langchain_core-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting langgraph<1.1.0,>=1.0.0 (from langchain)
  Downloading langgraph-1.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.wh

In [None]:
# CELL 2: IMPORTS
# ============================================================================

import urllib.request
import sqlite3

# LangChain imports
from langchain_community.utilities import SQLDatabase
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chains import LLMChain
from langchain_huggingface import HuggingFacePipeline

# Transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

print("✅ Imports successful!\n")




CELL 2: IMPORTING LIBRARIES


In [None]:
# CELL 3: DOWNLOAD DATABASE & CREATE LANGCHAIN SQLDATABASE
# ============================================================================

print("Setting up database...")

# Download Chinook
url = "https://github.com/lerocha/chinook-database/raw/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"
DB_PATH = "chinook.db"


def _is_usable_sqlite(path):
    """Return True if `path` is an existing SQLite file with at least one table.

    The file is opened read-only through a URI so that a missing file is
    reported as an error instead of being silently created as an empty
    database.
    """
    try:
        conn = sqlite3.connect(f"file:{path}?mode=ro", uri=True)
    except sqlite3.Error:
        return False
    try:
        (table_count,) = conn.execute(
            "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table'"
        ).fetchone()
        return table_count > 0
    except sqlite3.Error:
        # Raised e.g. when the file exists but is not a SQLite database.
        return False
    finally:
        conn.close()


# Re-use a previously downloaded copy so "Restart & Run All" does not hit the
# network every time; download only when no valid local copy exists.
if _is_usable_sqlite(DB_PATH):
    print(f"   Using existing {DB_PATH} (download skipped)")
else:
    urllib.request.urlretrieve(url, DB_PATH)
    if not _is_usable_sqlite(DB_PATH):
        raise RuntimeError(
            f"Downloaded file {DB_PATH!r} is not a usable SQLite database; "
            "delete it and re-run this cell."
        )

# Create LangChain SQLDatabase (automatic schema extraction!)
db = SQLDatabase.from_uri(f"sqlite:///{DB_PATH}")

print("✅ Database ready!")
print(f"   Tables: {db.get_usable_table_names()}\n")


In [None]:
# CELL 4: CREATE VECTOR STORE WITH LANGCHAIN (RAG)
# ============================================================================

from langchain.schema import Document

print("Creating vector store...")

# Get schema info using LangChain
# NOTE(review): nothing in this cell reads `table_info`; it is kept only so the
# name stays defined for any other cell that may rely on it.
table_info = db.get_table_info()


def _describe_table(cursor, table):
    """Return the text that is embedded for one table.

    Args:
        cursor: an open sqlite3 cursor on the Chinook database.
        table: name of the table to describe.

    Returns:
        A two-line string: "Table: <name>" followed by
        "Columns: <col> (<type>), ...".
    """
    # Quote the identifier so names containing spaces or keywords still work.
    quoted = '"' + table.replace('"', '""') + '"'
    cursor.execute(f"PRAGMA table_info({quoted})")
    # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
    columns = ", ".join(f"{col[1]} ({col[2]})" for col in cursor.fetchall())
    return f"Table: {table}\nColumns: {columns}"


# Create documents (one per table). A single connection is shared by all
# tables and is closed even if one of the PRAGMA calls fails.
conn = sqlite3.connect("chinook.db")
try:
    cursor = conn.cursor()
    docs = [
        Document(
            page_content=_describe_table(cursor, table),
            metadata={"table": table},
        )
        for table in db.get_usable_table_names()
    ]
finally:
    conn.close()

# Create FAISS vector store with LangChain
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(docs, embeddings)

# Create retriever (returns the 3 closest table descriptions per query)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

print(f"✅ Vector store created with {len(docs)} documents!")

# Test retrieval. `invoke` is the Runnable entry point of the retriever and
# replaces the deprecated `get_relevant_documents`.
test_docs = retriever.invoke("customers and invoices")
print(f"   Test: Found {len(test_docs)} relevant tables\n")


In [None]:
# CELL 5: CREATE LANGCHAIN PROMPT TEMPLATE
# ============================================================================

print("Creating LangChain prompt template...")

# Instruction text shown before the few-shot examples.
prefix = (
    "You are a SQLite expert. Generate a syntactically correct SQLite query.\n"
    "\n"
    "Here are examples:"
)

# Tail of the prompt: retrieved schema, the user's question, and the "SQL:"
# cue the model is expected to complete.
suffix = (
    "\n"
    "Database Schema:\n"
    "{schema}\n"
    "\n"
    "Question: {question}\n"
    "SQL:"
)

# How a single worked example is rendered inside the prompt.
example_template = "\nQuestion: {question}\nSQL: {answer}\n"

example_prompt = PromptTemplate(
    template=example_template,
    input_variables=["question", "answer"],
)

# Few-shot examples: a plain listing, a join with aggregation, and a ranking.
examples = [
    dict(
        question="List all artists",
        answer="SELECT * FROM Artist LIMIT 10;",
    ),
    dict(
        question="Top 5 customers by spending",
        answer=(
            "SELECT c.FirstName, c.LastName, SUM(i.Total) as Total\n"
            "FROM Customer c\n"
            "JOIN Invoice i ON c.CustomerId = i.CustomerId\n"
            "GROUP BY c.CustomerId\n"
            "ORDER BY Total DESC\n"
            "LIMIT 5;"
        ),
    ),
    dict(
        question="Most popular genres",
        answer=(
            "SELECT g.Name, COUNT(t.TrackId) as Count\n"
            "FROM Genre g\n"
            "JOIN Track t ON g.GenreId = t.GenreId\n"
            "GROUP BY g.GenreId\n"
            "ORDER BY Count DESC;"
        ),
    ),
]

# Assemble prefix + rendered examples + suffix into one prompt (LangChain!)
few_shot_prompt = FewShotPromptTemplate(
    prefix=prefix,
    examples=examples,
    example_prompt=example_prompt,
    example_separator="\n",
    suffix=suffix,
    input_variables=["schema", "question"],
)

print("✅ LangChain prompt template ready!\n")


In [None]:
# CELL 6: LOAD MODELS (5-10 MINUTES)
# ============================================================================

print("Loading models (takes 5-10 minutes)...\n")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if device != "cuda":
    # NOTE(review): 4-bit bitsandbytes loading generally expects a CUDA GPU -
    # confirm for the installed bitsandbytes version.
    print("⚠️ No CUDA GPU detected - 4-bit quantized loading may fail.")

# Quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)


def _load_langchain_llm(model_id, **model_kwargs):
    """Load one causal LM in 4-bit and wrap it for LangChain.

    Args:
        model_id: Hugging Face model identifier.
        **model_kwargs: extra keyword arguments forwarded to
            `AutoModelForCausalLM.from_pretrained` (e.g. trust_remote_code).

    Returns:
        Tuple `(tokenizer, model, pipe, llm)` where `pipe` is the transformers
        text-generation pipeline and `llm` is the `HuggingFacePipeline`
        wrapper around it.

    Raises:
        Whatever the underlying transformers / LangChain calls raise; callers
        decide how to report the failure.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map="auto",
        **model_kwargs,
    )
    # NOTE(review): `temperature` presumably only matters when sampling is
    # enabled in the model's generation config - confirm.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=300,
        temperature=0.1,
    )
    return tokenizer, model, pipe, HuggingFacePipeline(pipeline=pipe)


# --- Load SQLCoder ---
print("\n📥 Loading SQLCoder...")
sqlcoder_llm = None
try:
    sqlcoder_tokenizer, sqlcoder_model, sqlcoder_pipe, sqlcoder_llm = _load_langchain_llm(
        "defog/sqlcoder-7b-2",
        trust_remote_code=True,
    )
    print("✅ SQLCoder loaded & wrapped in LangChain!")
except Exception as e:
    # Report why loading failed instead of hiding the cause.
    print(f"❌ SQLCoder failed: {type(e).__name__}: {str(e)[:200]}")
    sqlcoder_llm = None

# --- Load Llama/CodeLlama ---
print("\n📥 Loading Llama...")
llama_models = ["codellama/CodeLlama-7b-Instruct-hf", "mistralai/Mistral-7B-Instruct-v0.2"]
llama_llm = None
llama_name = None

# Try the candidates in order and keep the first one that loads.
for model_name in llama_models:
    print(f"   Trying {model_name}...")
    try:
        llama_tokenizer, llama_model, llama_pipe, llama_llm = _load_langchain_llm(model_name)
    except Exception as e:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the cell; the reason is printed before trying the next model.
        print(f"   ❌ {model_name} failed: {type(e).__name__}: {str(e)[:200]}")
        continue
    llama_name = model_name.split('/')[-1]
    print(f"✅ {llama_name} loaded & wrapped in LangChain!")
    break

# Summarise what is actually available to the following cells.
loaded_models = [
    label
    for label, llm in (("SQLCoder", sqlcoder_llm), (llama_name, llama_llm))
    if llm is not None
]
if loaded_models:
    print(f"\n✅ Models ready: {', '.join(loaded_models)}\n")
else:
    print("\n❌ No model could be loaded - see the errors above.\n")



In [None]:
# CELL 7: CREATE LANGCHAIN CHAINS
# ============================================================================

print("Creating LangChain chains...")

# One chain per loaded model, both sharing the few-shot prompt. A model that
# is not available leaves its chain set to None.
sqlcoder_chain = None
if sqlcoder_llm:
    sqlcoder_chain = LLMChain(prompt=few_shot_prompt, llm=sqlcoder_llm)

llama_chain = None
if llama_llm:
    llama_chain = LLMChain(prompt=few_shot_prompt, llm=llama_llm)

print("✅ LangChain chains created!\n")

In [None]:
# CELL 8: MAIN FUNCTION USING LANGCHAIN
# ============================================================================

def _extract_sql(response, question):
    """Return the SQL answering `question` from a raw model response.

    The response may be the completion alone or the full prompt followed by
    the completion, and the model may keep going with invented
    "Question: ... / SQL: ..." pairs after its answer.

    Args:
        response: text returned by the chain.
        question: the question that was put into the prompt.

    Returns:
        The first answer block, stripped of surrounding whitespace. May be an
        empty string if the model produced no answer.
    """
    # The prompt ends with this cue; when the prompt is echoed back, the
    # answer is whatever follows it. If the cue is absent, rpartition leaves
    # the whole response in the last element.
    cue = f"Question: {question}\nSQL:"
    answer = response.rpartition(cue)[2]
    # Drop any follow-up question/answer pairs the model invented.
    answer = answer.split("\nQuestion:")[0]
    # The model may repeat the "SQL:" label before the statement.
    if "SQL:" in answer:
        answer = answer.split("SQL:")[-1]
    return answer.strip().split("\n\n")[0].strip()


def _looks_read_only(sql):
    """Heuristic: True when the statement starts with SELECT or WITH."""
    return sql.lstrip().lower().startswith(("select", "with"))


def _run_model(label, chain, schema, question):
    """Generate SQL with one chain, print it, then execute it and print the outcome."""
    print(f"\n🤖 {label} (via LangChain):")
    try:
        # LLMChain returns a dict; the generated text is under "text".
        response = chain.invoke({"schema": schema, "question": question})["text"]
        sql = _extract_sql(response, question)
        print(f"   SQL: {sql}")
    except Exception as e:
        print(f"   ❌ Generation failed: {str(e)[:100]}")
        return

    # The SQL comes from a language model, so only run statements that look
    # like queries; anything else is reported and skipped.
    if not _looks_read_only(sql):
        print("   ❌ Error: only SELECT/WITH statements are executed")
        return

    # Execute using LangChain SQLDatabase
    try:
        result = db.run(sql)
        print(f"   ✅ Success! Result: {str(result)[:100]}")
    except Exception as e:
        print(f"   ❌ Error: {str(e)[:100]}")


def ask_question(question):
    """
    Answer a natural-language question with every available model.

    Uses LangChain components:
    - Retriever (RAG) to fetch the most relevant table descriptions
    - Chains (LLM) to generate SQL from schema + question
    - SQLDatabase (execution) to run the generated SQL

    Args:
        question: the natural-language question to translate into SQL.

    Returns:
        None. The generated SQL and the (truncated) result or error of each
        model are printed.
    """
    print(f"\n{'='*60}")
    print(f"📝 Question: {question}")
    print('='*60)

    # Step 1: Retrieve schema using LangChain retriever
    relevant_docs = retriever.invoke(question)
    schema = "\n".join(doc.page_content for doc in relevant_docs)

    # Step 2: Generate and run SQL with SQLCoder
    if sqlcoder_chain is not None:
        _run_model("SQLCoder", sqlcoder_chain, schema, question)

    # Step 3: Generate and run SQL with Llama
    if llama_chain is not None:
        _run_model(llama_name, llama_chain, schema, question)

print("✅ Main function ready (using LangChain)!\n")


In [None]:
# CELL 9: TEST QUERIES
# ============================================================================

banner = "=" * 60
print(banner, "TESTING MODELS", banner, sep="\n")

# Sample questions covering ranking, filtering, aggregation and joins.
test_questions = [
    "List the top 5 artists with most albums",
    "Show me all customers from Canada",
    "What is the total revenue?",
    "Find the longest track",
    "Which employee has the most customers?"
]

# Run every sample question through all available models.
for question in test_questions:
    ask_question(question)

# Closing summary: completion banner, the LangChain pieces used, next steps.
closing_lines = (
    "\n" + banner,
    "🎉 PROJECT COMPLETE!",
    banner,
    "\n✅ This project uses LangChain:",
    "   - FewShotPromptTemplate for prompts",
    "   - HuggingFacePipeline for model wrapping",
    "   - LLMChain for orchestration",
    "   - FAISS for vector store",
    "   - SQLDatabase for DB operations",
    "\nTry your own:",
    "ask_question('Show me all rock albums')",
    "ask_question('Which genre has most tracks?')",
)
print("\n".join(closing_lines))
