# Week 3 + 5 LLM Deployment

This notebook loads a Llama-3.1-8B model and serves it through a Flask API, exposed publicly via an ngrok tunnel, to provide the Week 3 RAG functionality and the Week 5 backend integration.

## Instructions:
1. Run each cell in order (Ctrl+Enter for each cell)
2. When prompted, enter your ngrok auth token 
3. When prompted, enter your Hugging Face token
4. Copy the ngrok public URL that appears at the end
5. Use that URL in your Week 5 backend configuration

## Endpoints:
- `POST /chat` - RAG-based responses with document retrieval
- `POST /generate` - Simple text generation for Week 5 integration
- `GET /health` - Health check
- `GET /ping` - Liveness check (returns `pong`)

In [None]:
# Start ngrok tunnel and Flask app
# Single source of truth for the port used by both the tunnel and the Flask listener.
FLASK_PORT = 5000

print("Starting ngrok tunnel...")
ngrok_tunnel = ngrok.connect(FLASK_PORT)
print(f"🌐 Public URL: {ngrok_tunnel.public_url}")
print("📝 Copy this URL for your Week 5 backend configuration!")
print(f"\n🚀 Starting Flask app on port {FLASK_PORT}...")
print("Available endpoints:")
print("  - GET  /health - Health check")
print("  - POST /chat - RAG-based chat")
print("  - POST /generate - Simple text generation")

try:
    # debug stays off on purpose: debug mode turns on Werkzeug's auto-reloader
    # (which restarts the launching process and is not usable from inside a
    # notebook kernel) and would expose the interactive debugger through the
    # public ngrok URL.
    app.run(port=FLASK_PORT, debug=False, use_reloader=False)
finally:
    # app.run() blocks until the cell is interrupted; close the tunnel afterwards
    # so re-running this cell does not leave stale tunnels open.
    ngrok.disconnect(ngrok_tunnel.public_url)

In [None]:
# Test the model before starting the server
print("🧪 Testing model generation...")
test_prompt = "You are Alex, a helpful customer support specialist.\n\nCustomer: What is your return policy?\nAlex:"

# Tokenize with an attention mask and move the tensors to the device the model
# was placed on; generate() needs its inputs on the same device as the model.
inputs = tokenizer(test_prompt, return_tensors='pt').to(model.device)
prompt_token_count = inputs["input_ids"].shape[1]

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,  # budget for the reply only, independent of prompt length
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode only the newly generated tokens, then drop any follow-up "Customer:"
# turn the model may have invented after its answer.
test_response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
test_response = test_response.split("\nCustomer:")[0].strip()
print(f"✓ Test response: {test_response}")

if test_response:
    print("✅ Model is working correctly!")
else:
    # Sampling can occasionally yield an empty completion; do not claim success then.
    print("⚠️ Model returned an empty response - re-run this cell to sample again.")

In [None]:
# Add /generate endpoint for Week 5 integration
@app.route('/generate', methods=['POST'])
def generate_text():
    """Simple text completion endpoint without RAG retrieval for Week 5 backend.

    Expects a JSON object with:
        prompt (str, required): the customer's message.
        intent (str, optional): selects the system prompt; unknown or missing
            values fall back to ``'general'``.

    Returns:
        200 with ``response``, ``intent``, ``prompt`` and ``timestamp`` on success,
        400 with ``{'error': 'Prompt is required'}`` when the body is missing,
        not a JSON object, or has no usable prompt,
        500 with ``{'error': <message>}`` when generation fails.
    """
    try:
        # silent=True returns None instead of raising on a missing/invalid JSON
        # body, so malformed requests become a 400 below rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        prompt = data.get('prompt', '')
        intent = data.get('intent', 'general')

        if not isinstance(prompt, str) or not prompt.strip():
            return jsonify({'error': 'Prompt is required'}), 400
        if not isinstance(intent, str):
            # Non-string intents (e.g. a list) cannot be used as a lookup key.
            intent = 'general'

        # Enhanced prompt based on intent
        system_prompts = {
            'policy_question': "You are Alex, a helpful customer support specialist. Answer policy questions clearly and professionally.",
            'order_status': "You are Alex, a customer support specialist helping with order inquiries.",
            'product_search': "You are Alex, a customer support specialist helping customers find products.",
            'complaint': "You are Alex, a customer support specialist handling customer concerns with empathy.",
            'chitchat': "You are Alex, a friendly customer support specialist engaging in brief conversation.",
            'general': "You are Alex, a helpful customer support specialist."
        }

        system_prompt = system_prompts.get(intent, system_prompts['general'])
        full_prompt = f"{system_prompt}\n\nCustomer: {prompt}\nAlex:"

        # Tokenize with an attention mask and move the tensors to the model's device.
        encoded = tokenizer(full_prompt, return_tensors='pt').to(model.device)
        prompt_token_count = encoded['input_ids'].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **encoded,
                max_new_tokens=150,  # reply budget, independent of prompt length
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the generated continuation so an "Alex:" inside the
        # customer's own message cannot confuse the extraction, then cut off any
        # follow-up "Customer:" turn the model may have invented.
        response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
        response = response.split("\nCustomer:")[0].strip()

        return jsonify({
            'response': response,
            'intent': intent,
            'prompt': prompt,
            'timestamp': datetime.now().isoformat()
        })

    except Exception as e:
        print(f"Error in /generate: {str(e)}")
        return jsonify({'error': str(e)}), 500

print("✓ /generate endpoint added for Week 5 integration!")

In [None]:
from getpass import getpass

from pyngrok import ngrok, conf

# Read the token without echoing it: input() would print the secret into the
# cell output, where it gets saved with the notebook.
NGROK_AUTH_TOKEN = getpass("Enter your ngrok token: ").strip()
if not NGROK_AUTH_TOKEN:
    raise ValueError("An ngrok auth token is required to open the tunnel.")

# Set authtoken
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

In [None]:
# Create Flask app with RAG endpoints
app = Flask(__name__)


@app.route("/ping")
def ping():
    """Liveness probe: always answers with the plain text ``pong``."""
    reply = "pong"
    return reply


@app.route("/health")
def health():
    """Health check: report a static status plus the model names as JSON."""
    health_payload = {
        "status": "ok",
        "model": "Llama-3.1-8B",
        "embedding": "all-MiniLM-L6-v2",
    }
    return jsonify(health_payload)

def _build_chat_prompt(user_input, context_docs):
    """Assemble the full RAG prompt: system instructions, retrieved context, user turn."""
    base_prompt = PROMPTS["base_retrieval_prompt"]
    # Strip each document so stray indentation/newlines from the knowledge-base
    # literals do not end up in the prompt.
    retrieved_text = "\n\n".join(f"---\n{doc.strip()}" for doc in context_docs)
    guidelines = "\n".join("- " + g for g in base_prompt["context_guidelines"])

    formatted_system_prompt = (
        f"{base_prompt['role'].strip()}\n\n"
        f"Goal: {base_prompt['goal'].strip()}\n\n"
        f"Guidelines:\n{guidelines}\n\n"
        "Use the following context to answer the user's question:\n\n"
        f"{retrieved_text}\n\n"
        f"{base_prompt['response_format'].strip()}\n"
    )
    return f"{formatted_system_prompt}\n\nUser: {user_input}\nAlex:"


@app.route("/chat", methods=["POST"])
def chat():
    """RAG-based chat endpoint with document retrieval.

    Expects a JSON object with a non-empty string ``prompt``.

    Returns:
        200 with ``response``, ``retrieved_docs`` (number of context documents)
        and ``timestamp`` on success,
        400 with ``{'error': 'Prompt is required'}`` when the body is missing,
        not a JSON object, or has no usable prompt,
        500 with ``{'error': <message>}`` when retrieval or generation fails.
    """
    try:
        # silent=True returns None instead of raising on a missing/invalid JSON
        # body, so malformed requests become a 400 below rather than a 500.
        payload = request.get_json(silent=True)
        user_input = payload.get("prompt") if isinstance(payload, dict) else None
        if not isinstance(user_input, str) or not user_input.strip():
            return jsonify({'error': 'Prompt is required'}), 400

        context_docs = retrieve(user_input)
        full_prompt = _build_chat_prompt(user_input, context_docs)

        response = generate_response(full_prompt)

        # Extract just Alex's response. generate_response returns prompt +
        # completion, so drop the prompt prefix when it round-trips exactly;
        # otherwise fall back to splitting on the speaker tag.
        if response.startswith(full_prompt):
            response = response[len(full_prompt):]
        elif "Alex:" in response:
            response = response.split("Alex:")[-1]
        # Cut off any follow-up "User:" turn the model may have invented.
        response = response.split("\nUser:")[0].strip()

        return jsonify({
            "response": response,
            "retrieved_docs": len(context_docs),
            "timestamp": datetime.now().isoformat()
        })

    except Exception as e:
        # Log server-side as well, matching the /generate endpoint.
        print(f"Error in /chat: {str(e)}")
        return jsonify({'error': str(e)}), 500

print("✓ Flask app with RAG endpoints created!")

In [None]:
# Load Llama model and tokenizer
from getpass import getpass

# Single place to change the checkpoint used for both tokenizer and model.
MODEL_ID = "meta-llama/Llama-3.1-8B"

print("Loading Llama-3.1-8B model... This may take a few minutes.")

# Read the token without echoing it: input() would print the secret into the
# cell output, where it gets saved with the notebook.
hf_token = getpass("Enter your Hugging Face token: ").strip()
if not hf_token:
    raise ValueError("A Hugging Face token is required to download the model.")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=hf_token
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    token=hf_token
)

# Smallest reply budget generate_response will ever use, even for long prompts.
_MIN_NEW_TOKENS = 128


def generate_response(prompt_text, max_length=512, max_new_tokens=None):
    """Sample a completion for ``prompt_text`` and return prompt + completion as text.

    Args:
        prompt_text: The full prompt to feed to the model.
        max_length: Total token budget (prompt + reply). Honoured when the
            prompt leaves at least ``_MIN_NEW_TOKENS`` of room; otherwise the
            reply budget falls back to ``_MIN_NEW_TOKENS`` so long RAG prompts
            still get an answer instead of exhausting the budget.
        max_new_tokens: Optional explicit reply budget; overrides ``max_length``.

    Returns:
        The decoded sequence (prompt followed by the generated text) with
        special tokens removed.
    """
    # Follow the model's own device instead of hard-coding "cuda".
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    if max_new_tokens is None:
        max_new_tokens = max(max_length - prompt_token_count, _MIN_NEW_TOKENS)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("✓ Llama-3.1-8B model loaded successfully!")

In [None]:
# Setup RAG retrieval system
print("Setting up RAG retrieval system...")

# Load embedding model
model_embed = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all KB documents in one batched call instead of one call per document,
# and force contiguous float32, which is the layout FAISS indexes expect.
corpus_embeddings = np.ascontiguousarray(
    model_embed.encode([doc["content"] for doc in kb], convert_to_numpy=True),
    dtype="float32",
)

# Build FAISS index
d = corpus_embeddings.shape[1]  # dimension of embeddings
index = faiss.IndexFlatL2(d)    # exact (brute-force) L2 search
index.add(corpus_embeddings)

# Retrieval function
def retrieve(query, top_k=3):
    """Return the ``content`` of the knowledge-base documents closest to ``query``.

    Args:
        query: The user's question as text.
        top_k: Maximum number of documents to return; clamped to the size of
            the knowledge base.

    Returns:
        A list of document content strings, nearest first. Empty when the
        knowledge base is empty or ``top_k`` is not positive.
    """
    if not kb or top_k <= 0:
        return []

    # Asking for more neighbours than exist makes the index pad its result
    # with -1, which would silently resolve to kb[-1] (the last document).
    neighbour_count = min(top_k, len(kb))

    query_emb = model_embed.encode(query, convert_to_numpy=True)
    query_matrix = np.asarray([query_emb], dtype="float32")
    _distances, neighbour_ids = index.search(query_matrix, neighbour_count)

    # Keep only real hits; -1 marks "no neighbour found".
    return [kb[i]["content"] for i in neighbour_ids[0] if i >= 0]

print("✓ RAG retrieval system ready!")

In [None]:
# Setup ngrok authentication
from getpass import getpass

from pyngrok import ngrok, conf

# Read the token without echoing it: input() would print the secret into the
# cell output, where it gets saved with the notebook.
NGROK_AUTH_TOKEN = getpass("Enter your ngrok token: ").strip()
if not NGROK_AUTH_TOKEN:
    raise ValueError("An ngrok auth token is required to open the tunnel.")

# Set authtoken
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
print("✓ Ngrok authentication configured!")

In [None]:
# Define PROMPTS_YAML and knowledge base
# YAML source for the prompt template. It is parsed with yaml.safe_load into
# PROMPTS further down in this cell; the /chat endpoint reads the
# `base_retrieval_prompt` entry and its `role`, `goal`, `context_guidelines`
# (a list of strings) and `response_format` fields to build the system prompt.
PROMPTS_YAML = """
base_retrieval_prompt:
  role: "You are Alex, a helpful customer support specialist for Shoplite."
  goal: "Provide accurate, helpful answers to customer questions using the provided context."
  context_guidelines:
    - "Always stay in character as Alex, a customer support specialist"
    - "Use the provided context to answer questions accurately"
    - "If the context doesn't contain the answer, say you'll need to check with a specialist"
    - "Be friendly, professional, and empathetic"
  response_format: "Provide clear, concise answers. Always end with asking if there's anything else you can help with."
"""

# Knowledge base data
# A list of 15 Shoplite documents, each a dict with "id", "title" and "content".
# In the cells visible in this notebook only "content" is read: it is embedded
# for the FAISS index and returned verbatim by retrieve(). "id" and "title" are
# not referenced by the visible code.
# NOTE(review): several "content" strings begin or end with newlines/indentation
# from the triple-quoted literals; that whitespace is part of the text that gets
# embedded and inserted into prompts.
kb = [
    {
        "id": "doc1",
        "title": "Shoplite User Registration Process",
        "content": """To create a Shoplite account, users can choose between a buyer or seller
account. Buyers provide basic personal information such as name, email, and
password, and must verify their email within 24 hours. Sellers undergo
additional business verification, which includes providing a registered business
name, tax ID, and banking details. The verification process usually takes 2–3
business days. Security measures, such as strong password requirements and
optional two-factor authentication, ensure account safety. Users can manage
account settings, reset passwords, and update contact information through the
profile dashboard. Proper onboarding reduces support requests and increases user
retention.""",
    },
    {
        "id": "doc2",
        "title": "Shoplite Shopping Cart Features",
        "content": """The Shoplite shopping cart enables users to add multiple items from different
sellers, apply promotional codes, and save products for later. Cart contents are
persisted across sessions for logged-in users. Users can edit quantities, remove
items, and view a real-time subtotal including taxes and shipping. The platform
supports automatic notifications when saved items go on sale or stock changes.
For sellers, the system provides analytics on abandoned carts to optimize
inventory and promotions. The shopping cart workflow is designed with a
human-in-the-loop mindset to reduce errors in checkout and improve user
satisfaction.
     """,
    },
    {
        "id": "doc3",
        "title": "Payment Methods and Security",
        "content": """
     Shoplite supports major payment methods including credit/debit cards, PayPal,
and mobile wallets. Payments are processed through secure gateways using
encryption and tokenization to prevent fraud. PCI DSS compliance ensures that
sensitive payment data is handled securely. Users can save preferred payment
methods for convenience, and the platform monitors transactions for suspicious
activity. Refunds are processed automatically in the case of cancellations or
returns, and multi-factor authentication is recommended for account security.
Sellers receive payments according to the scheduled payout cycle, with
transparency on fees and commissions.
    """,
    },
    {
        "id": "doc4",
        "title": "Order Tracking and Delivery",
        "content": """
    
Once an order is placed, users can track its status through the Shoplite
dashboard. Orders show stages like processing, shipped, out for delivery, and
delivered. Real-time tracking is integrated with multiple logistics partners,
and users receive email or push notifications for key updates. Delivery times
vary by seller location and shipping method. For sellers, automated tracking
updates reduce support queries. The system also accounts for exceptions like
delays or lost packages, prompting either customer service intervention or
automatic refunds. Clear tracking workflows improve trust and transparency for
buyers.
    """,
    },
    {
        "id": "doc5",
        "title": "Return and Refund Policies",
        "content": """
    Shoplite allows returns within a 30-day window from delivery. Users must submit
a return authorization request, specifying the reason for return and items
affected. Returns require items to be in original condition, with proof of
purchase. Once approved, the refund is issued to the original payment method.
Restocking fees may apply for certain products, and sellers are notified of all
returns. The return workflow is integrated with the RAG-based help assistant to
guide users through the process, ensuring consistent and accurate guidance
without overloading human agents.
    """,
    },
    {
        "id": "doc6",
        "title": "Product Reviews and Ratings",
        "content": """
    Users can submit reviews and ratings for purchased products. Reviews must be
honest, relevant, and comply with Shoplite content guidelines. The platform
supports star ratings, written feedback, and optional media uploads. Moderation
algorithms detect inappropriate content, while helpful votes highlight useful
reviews. Sellers can respond to feedback to improve engagement. The
recommendation engine uses review data to personalize product suggestions. This
document also references AI touchpoints, as retrieval-based chat assistants can
summarize reviews to answer buyer queries efficiently.
    """,
    },
    {
        "id": "doc7",
        "title": "Seller Account Setup and Management",
        "content": """
    Sellers register via the dedicated seller portal, providing business details,
banking information, and compliance documents. After verification, sellers can
manage products, track inventory, view sales reports, and handle returns. Seller
dashboards include analytics for performance, revenue, and order fulfillment
rates. Security features protect sensitive financial information. Integration
with AI recommendation and RAG retrieval systems can help sellers optimize
listings and pricing. Proper onboarding reduces errors and ensures consistent
experiences across the platform.
    """,
    },
    {
        "id": "doc8",
        "title": "Inventory Management for Sellers",
        "content": """
    Shoplite's inventory management allows sellers to add new products individually
or via bulk upload. Stock levels, SKUs, and variations are tracked
automatically. Low-stock alerts help prevent overselling. Integration with
analytics provides insights into sales velocity and seasonal trends. Sellers can
update product descriptions, images, and pricing easily. AI-powered
recommendations suggest optimal stock levels and pricing adjustments. Inventory
workflows are designed to minimize manual errors and maintain data integrity.
    """,
    },
    {
        "id": "doc9",
        "title": "Commission and Fee Structure",
        "content": """
    Shoplite charges sellers a commission on each transaction, which varies by
product category. Additional fees may apply for premium placement, promotional
campaigns, or expedited shipping. Fees are transparently displayed in the seller
dashboard. The payout schedule is clearly defined, typically weekly or
bi-weekly. Understanding commissions and fees is critical for sellers to
optimize profitability. Automated calculations reduce disputes and ensure
accuracy. This document also highlights token-based AI cost modeling as part of
forecasting operational expenses for platform features.
    """,
    },
    {
        "id": "doc10",
        "title": "Customer Support Procedures",
        "content": """
    Shoplite provides multi-channel customer support including email, live chat, and
phone. Response time SLAs are monitored, and complex cases are escalated to
specialized agents. The support workflow is integrated with retrieval-based AI
assistants that provide instant answers for common queries while flagging
high-risk cases for human review. Documentation, FAQs, and guides are
continuously updated based on user feedback. Effective customer support
increases retention and trust.
    """,
    },
    {
        "id": "doc11",
        "title": "Mobile App Features",
        "content": """
    The Shoplite mobile app supports browsing, search, cart management, checkout,
and notifications. Push notifications inform users about order updates,
promotions, and personalized recommendations. Offline browsing allows users to
view saved items without connectivity. Mobile app performance is monitored for
latency and reliability. The app incorporates user feedback into iterative
improvements. AI-powered features like chat assistance and recommendation
summaries improve user experience on mobile.
    """,
    },
    {
        "id": "doc12",
        "title": "API Documentation for Developers",
        "content": """
    Shoplite offers a RESTful API for external developers. Endpoints include product
search, order management, and seller account operations. Authentication uses API
keys and OAuth 2.0. Rate limits protect system performance. Documentation
includes request/response formats, error codes, and example scripts. Developers
can integrate Shoplite features into third-party applications, while ensuring
security and compliance. RAG principles are used internally to provide accurate
contextual responses for API-related queries in developer support.
    """,
    },
    {
        "id": "doc13",
        "title": "Security and Privacy Policies",
        "content": """
    Shoplite is committed to user data protection. Personal data is collected
according to GDPR standards and stored securely. Sensitive information is
encrypted at rest and in transit. Users can control privacy settings and opt out
of marketing communications. Access logs and audit trails monitor suspicious
activity. AI assistants accessing user queries respect privacy and only use data
from authorized contexts. Transparency and compliance are central to the
platform's security culture.

    """,
    },
    {
        "id": "doc14",
        "title": "Promotional Codes and Discounts",
        "content": """
    Shoplite supports percentage-based, fixed-amount, and free-shipping promo codes.
Sellers can configure validity periods, usage limits, and eligible products.
Users can apply multiple codes if stackable, and discounts are reflected in the
cart in real-time. The platform logs redemption data to prevent abuse.
Retrieval-based AI assistants help users discover applicable promos based on
purchase history. Monitoring ensures fairness and operational reliability.
    """,
    },
    {
        "id": "doc15",
        "title": "AI Features in Shoplite",
        "content": """
    
Shoplite integrates AI in recommendation engines, chat assistants, and help
systems. The RAG-powered chat assistant retrieves relevant documentation to
answer user queries accurately. Recommendation engines personalize product
suggestions based on browsing and purchase history. AI touchpoints are designed
with guardrails and human-in-the-loop checks to ensure safety and reliability.
Continuous monitoring tracks latency, accuracy, and user satisfaction. These
features illustrate how AI-first thinking improves user engagement and platform
efficiency.
    """,
    },
]

# Parse PROMPTS
PROMPTS = yaml.safe_load(PROMPTS_YAML)

# Fail fast if the template lacks a field the /chat endpoint reads; otherwise
# the problem would only surface later as a 500 on the first chat request.
_required_prompt_keys = {"role", "goal", "context_guidelines", "response_format"}
_base_retrieval_prompt = (PROMPTS or {}).get("base_retrieval_prompt") or {}
_missing_prompt_keys = _required_prompt_keys - set(_base_retrieval_prompt)
if _missing_prompt_keys:
    raise KeyError(
        f"PROMPTS_YAML is missing base_retrieval_prompt keys: {sorted(_missing_prompt_keys)}"
    )

print("✓ Knowledge base and prompts loaded successfully!")

In [None]:
# Import required libraries
import torch
import numpy as np
import faiss
import yaml
from flask import Flask, request, jsonify
from pyngrok import ngrok
from sentence_transformers import SentenceTransformer
from datetime import datetime

# Hugging Face Transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Install required packages
!pip install --quiet transformers torch sentence-transformers faiss-cpu flask pyngrok pyyaml