In [1]:
import os
from openai import OpenAI

os.environ["OPENAI_API_KEY"] = ""
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": "Give me a short definition of DNN."}
    ]
)

print(response.choices[0].message.content)

A Deep Neural Network (DNN) is a type of artificial neural network with multiple layers of neurons that processes and learns from complex data representations. These layers enable the network to capture intricate patterns and features in large datasets, making DNNs particularly effective for tasks such as image and speech recognition.


In [None]:
from openai import OpenAI

client = OpenAI()

# Simple evaluation dataset
eval_data = [
    {
        "input": "What is 2 + 2?",
        "expected_output": "4"
    },
    {
        "input": "What is the capital of France?",
        "expected_output": "Paris"
    }
]

def run_eval():
    for test_case in eval_data:
        response = client.responses.create(
            model="gpt-4.1-mini",
            input=test_case["input"]
        )

        model_output = response.output_text.strip()

        if model_output.lower() == test_case["expected_output"].lower():
            print(f"PASS: {test_case['input']}")
        else:
            print(f"FAIL: {test_case['input']}")
            print(f"Expected: {test_case['expected_output']}")
            print(f"Got: {model_output}")
            print()

run_eval()

FAIL: What is 2 + 2?
Expected: 4
Got: 2 + 2 = 4.

FAIL: What is the capital of France?
Expected: Paris
Got: The capital of France is Paris.



In [None]:
from openai import OpenAI

client = OpenAI()

# Simple evaluation dataset
eval_data = [
    {
        "input": "What is 2 + 2? Reply with only the number.",
        "expected_output": "4"
    },
    {
        "input": "What is the capital of France?",
        "expected_output": "London"  # intentionally incorrect to force one FAIL
    }
]

def run_eval():
    for test_case in eval_data:
        response = client.responses.create(
            model="gpt-4.1-mini",
            input=test_case["input"]
        )

        model_output = response.output_text.strip()

        # More tolerant comparison
        if test_case["expected_output"].lower() in model_output.lower():
            print(f"PASS: {test_case['input']}")
        else:
            print(f"FAIL: {test_case['input']}")
            print(f"Expected: {test_case['expected_output']}")
            print(f"Got: {model_output}")
            print()

run_eval()

PASS: What is 2 + 2? Reply with only the number.
FAIL: What is the capital of France?
Expected: London
Got: The capital of France is Paris.



In [None]:
pip install openai pydantic



In [None]:
import json
import uuid
from datetime import datetime
from typing import Dict

from pydantic import BaseModel, ValidationError
from openai import OpenAI

# -----------------------------
# CONFIG
# -----------------------------

client = OpenAI()

PRIMARY_MODEL = "gpt-4.1"
JUDGE_MODEL = "gpt-4o-mini"

# -----------------------------
# SCHEMA FOR JUDGE OUTPUT
# -----------------------------

class JudgeScore(BaseModel):
    relevance: int
    faithfulness: int
    completeness: int
    overall_score: float


# -----------------------------
# PRIMARY LLM (System Under Test)
# -----------------------------

def generate_model_output(user_input: str, context: str) -> str:
    response = client.responses.create(
        model=PRIMARY_MODEL,
        input=[
            {
                "role": "system",
                "content": "You are a helpful enterprise assistant."
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion:\n{user_input}"
            }
        ]
    )

    return response.output_text


# -----------------------------
# BUILD JUDGE PROMPT
# -----------------------------

def build_judge_prompt(user_input: str, model_output: str, context: str) -> str:
    return f"""
You are an expert evaluator.

Evaluate the model output using the following metrics:
1. Relevance (1-5)
2. Faithfulness to context (1-5)
3. Completeness (1-5)

Return ONLY valid JSON in this format:
{{
  "relevance": int,
  "faithfulness": int,
  "completeness": int,
  "overall_score": float
}}

User Input:
{user_input}

Retrieved Context:
{context}

Model Output:
{model_output}
"""


# -----------------------------
# JUDGE LLM CALL
# -----------------------------

def run_judge(prompt: str) -> JudgeScore:
    response = client.responses.create(
        model=JUDGE_MODEL,
        temperature=0,  # deterministic scoring
        input=prompt
    )

    raw_output = response.output_text

    try:
        parsed = json.loads(raw_output)
        return JudgeScore(**parsed)

    except (json.JSONDecodeError, ValidationError) as e:
        print("Judge output invalid. Raw output:")
        print(raw_output)
        raise e


# -----------------------------
# STORE RESULTS (Simple Example)
# -----------------------------

def store_evaluation(result: Dict):
    with open("evaluation_log.jsonl", "a") as f:
        f.write(json.dumps(result) + "\n")


# -----------------------------
# MAIN PIPELINE
# -----------------------------

def evaluate_query(user_input: str, context: str):
    query_id = str(uuid.uuid4())
    timestamp = datetime.utcnow().isoformat()

    # 1️⃣ Generate model output
    model_output = generate_model_output(user_input, context)

    # 2️⃣ Build judge prompt
    judge_prompt = build_judge_prompt(user_input, model_output, context)

    # 3️⃣ Run judge
    judge_scores = run_judge(judge_prompt)

    # 4️⃣ Store evaluation result
    result_record = {
        "query_id": query_id,
        "timestamp": timestamp,
        "user_input": user_input,
        "model_output": model_output,
        "scores": judge_scores.dict(),
        "primary_model": PRIMARY_MODEL,
        "judge_model": JUDGE_MODEL
    }

    store_evaluation(result_record)

    return result_record


# -----------------------------
# EXAMPLE RUN
# -----------------------------

if __name__ == "__main__":
    context = "Company policy allows 15 days of paid leave per year."
    user_query = "How many vacation days do employees get?"

    result = evaluate_query(user_query, context)

    print("Evaluation Result:")
    print(json.dumps(result, indent=2))

  timestamp = datetime.utcnow().isoformat()


Evaluation Result:
{
  "query_id": "ee03cf33-6433-4dc3-8d6b-f231e790914d",
  "timestamp": "2026-02-26T06:27:58.291494",
  "user_input": "How many vacation days do employees get?",
  "model_output": "Employees get 15 days of paid vacation leave per year, according to company policy.",
  "scores": {
    "relevance": 5,
    "faithfulness": 5,
    "completeness": 5,
    "overall_score": 5.0
  },
  "primary_model": "gpt-4.1",
  "judge_model": "gpt-4o-mini"
}


/tmp/ipython-input-699/4284841921.py:138: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  "scores": judge_scores.dict(),


### **AI E-Commerce Semantic Search with Orchestration + LLM Judge**

In [None]:
# Install required libraries
# sentence-transformers → embeddings + cross-encoder reranking
# faiss-cpu → vector similarity search
# openai → LLM + Judge model

!pip install sentence-transformers faiss-cpu openai

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [None]:
# FAISS for vector similarity search
import faiss

# NumPy for vector manipulation
import numpy as np

# JSON for structured outputs
import json

# SentenceTransformer for embeddings
# CrossEncoder for reranking
from sentence_transformers import SentenceTransformer, CrossEncoder

# OpenAI client for LLM generation and judging
from openai import OpenAI

In [None]:
# Simple product catalog (simulating a database)

products = [
    {
        "product_id": 1,
        "title": "Running Shoes",
        "description": "Comfortable lightweight running shoes for daily jogging.",
        "price": 4500
    },
    {
        "product_id": 2,
        "title": "Wireless Headphones",
        "description": "Noise cancelling headphones with 40-hour battery life.",
        "price": 3200
    },
    {
        "product_id": 3,
        "title": "Sports Watch",
        "description": "Waterproof sports watch with heart rate tracking.",
        "price": 2800
    }
]

In [None]:
# Simple word-based chunking function
# In production, you would use token-based chunking
# Here it’s simplified for demonstration

def chunk_text(text, chunk_size=50):
    words = text.split()
    return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

In [None]:
# Load embedding model
# all-MiniLM-L6-v2 produces 384-dimensional embeddings

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Create empty list to store embeddings
embeddings = []

# Convert each product description into a vector
for product in products:
    vector = embedding_model.encode(product["description"])
    embeddings.append(vector)

# Convert to NumPy array and ensure float32 (required by FAISS)
embeddings = np.array(embeddings).astype("float32")

In [None]:
# Determine vector dimension (384)
dimension = embeddings.shape[1]

# Use Inner Product index (after normalization → cosine similarity)
index = faiss.IndexFlatIP(dimension)

# Normalize vectors for cosine similarity
faiss.normalize_L2(embeddings)

# Add product embeddings to index
index.add(embeddings)

In [None]:
def retrieve(query, top_k=3):
    # Convert query into embedding vector
    query_vector = embedding_model.encode(query).astype("float32")

    # Normalize query for cosine similarity
    faiss.normalize_L2(query_vector.reshape(1, -1))

    # Perform ANN search
    D, I = index.search(query_vector.reshape(1, -1), top_k)

    # Return corresponding products
    return [products[i] for i in I[0]]

In [None]:
# Load cross-encoder model for re-ranking
# This model scores query-product pairs more precisely

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, candidates):
    # Create query-product pairs
    pairs = [(query, c["description"]) for c in candidates]

    # Predict relevance scores
    scores = reranker.predict(pairs)

    # Combine candidates with scores
    scored = list(zip(candidates, scores))

    # Sort descending by score
    scored.sort(key=lambda x: x[1], reverse=True)

    # Return sorted products
    return [item[0] for item in scored]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
# Initialize OpenAI client
client = OpenAI()

def generate_response(query, ranked_products):
    # Build context from retrieved products
    context = "\n".join([
        f"{p['title']} - {p['description']} - Price: {p['price']}"
        for p in ranked_products
    ])

    # Construct prompt
    prompt = f"""
You are an e-commerce assistant.
Only recommend from the provided products.

User Query:
{query}

Available Products:
{context}
"""

    # Call LLM
    response = client.responses.create(
        model="gpt-4o-mini",
        input=prompt
    )

    return response.output_text

In [None]:
def judge_response(query, ranked_products, answer):
    # Reconstruct product context
    context = "\n".join([
        f"{p['title']} - {p['description']} - Price: {p['price']}"
        for p in ranked_products
    ])

    # Judge prompt
    judge_prompt = f"""
Evaluate the answer based on:
1. Relevance (1-5)
2. Faithfulness to products (1-5)

Return JSON:
{{
  "relevance": int,
  "faithfulness": int
}}

Query:
{query}

Products:
{context}

Answer:
{answer}
"""

    # Deterministic judge call
    response = client.responses.create(
        model="gpt-4o-mini",
        temperature=0,
        input=judge_prompt
    )

    return response.output_text

In [None]:
class MiniEcommercePipeline:

    # Step 1: Retrieval
    def retrieve(self, query):
        print("Running Retrieval...")
        return retrieve(query)

    # Step 2: Re-ranking
    def rerank(self, query, candidates):
        print("Running Re-ranking...")
        return rerank(query, candidates)

    # Step 3: Generation
    def generate(self, query, ranked):
        print("Generating LLM Response...")
        return generate_response(query, ranked)

    # Step 4: Evaluation
    def evaluate(self, query, ranked, response):
        print("Running LLM Judge...")
        return judge_response(query, ranked, response)

    # Full pipeline orchestration
    def run(self, query):
        candidates = self.retrieve(query)
        ranked = self.rerank(query, candidates)
        response = self.generate(query, ranked)
        score = self.evaluate(query, ranked, response)
        return ranked, response, score

In [None]:
pipeline = MiniEcommercePipeline()

query = "comfortable running shoes under 5000"

ranked_products, llm_answer, evaluation = pipeline.run(query)

print("\nRanked Products:")
print(ranked_products)

print("\nLLM Answer:")
print(llm_answer)

print("\nEvaluation Score:")
print(evaluation)

Running Retrieval...
Running Re-ranking...
Generating LLM Response...
Running LLM Judge...

Ranked Products:
[{'product_id': 1, 'title': 'Running Shoes', 'description': 'Comfortable lightweight running shoes for daily jogging.', 'price': 4500}, {'product_id': 3, 'title': 'Sports Watch', 'description': 'Waterproof sports watch with heart rate tracking.', 'price': 2800}, {'product_id': 2, 'title': 'Wireless Headphones', 'description': 'Noise cancelling headphones with 40-hour battery life.', 'price': 3200}]

LLM Answer:
I recommend the **Running Shoes** priced at **4500**. They are comfortable and designed for daily jogging, making them a great choice for running enthusiasts.

Evaluation Score:
{
  "relevance": 5,
  "faithfulness": 5
}


### **LangChain_Framework**

In [2]:
!pip install -q langchain langchain-openai langchain-community chromadb tiktoken pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.0/52.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.5/21.5 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.3/331.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
!pip uninstall -y langchain langchain-openai langchain-community chromadb
!pip install -q langchain langchain-openai langchain-community chromadb pypdf

Found existing installation: langchain 1.2.10
Uninstalling langchain-1.2.10:
  Successfully uninstalled langchain-1.2.10
Found existing installation: langchain-openai 1.1.10
Uninstalling langchain-openai-1.1.10:
  Successfully uninstalled langchain-openai-1.1.10
Found existing installation: langchain-community 0.4.1
Uninstalling langchain-community-0.4.1:
  Successfully uninstalled langchain-community-0.4.1
Found existing installation: chromadb 1.5.1
Uninstalling chromadb-1.5.1:
  Successfully uninstalled chromadb-1.5.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
# Text Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Document Loader
from langchain_community.document_loaders import PyPDFLoader

# Vector Store
from langchain_community.vectorstores import Chroma

# Embeddings + LLM
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# LCEL
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [9]:
from google.colab import files
uploaded = files.upload()

Saving LangChain In Detail.pdf to LangChain In Detail.pdf


In [10]:
pdf_path = list(uploaded.keys())[0]

loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200
)

chunks = text_splitter.split_documents(documents)
print("Chunks created:", len(chunks))

Chunks created: 34


In [12]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

In [13]:
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [14]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [15]:
prompt = ChatPromptTemplate.from_template("""
You are an expert assistant.

Use ONLY the context below to answer the question.
If not found, say you don't know.

Context:
{context}

Question:
{question}

Answer:
""")

In [16]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [17]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
query = "Summarize the key ideas."

response = rag_chain.invoke(query)

print(response)

The key ideas include:

1. **Role-Based Prompts**: These prompts guide the language model to behave like a specific expert, enhancing the relevance and accuracy of responses.
2. **Chains**: They automate the entire pipeline of tasks, allowing for efficient processing of inputs through multiple steps without manual intervention.
3. **Indexes**: These connect LLM applications to external knowledge sources, improving the model's ability to provide informed responses.
4. **Prompt Engineering**: This is a crucial skill for developing LLM applications, as prompts significantly influence the output quality.
5. **Dynamic and Few-shot Prompts**: LangChain supports the creation of various prompt types to optimize interactions with the language model.
