In [1]:
pip install googlesearch-python

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports
import os
import socket
from typing import List, Dict
import subprocess
import time
import re

# LangChain and related libraries
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field
from langchain_ollama.chat_models import ChatOllama

# Search library
try:
    from googlesearch import search
except ImportError:
    print("Error: 'googlesearch-python' is not installed. Please run 'pip install googlesearch-python'")

os.environ["USER_AGENT"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
# =================================================================
# AGENT 4: SolBenchmarker CLASS (from Agent4-1.ipynb)
# =================================================================
class SolBenchmarker:
    def __init__(self, user: str, python_env: str = "rapids25.02"):
        if not user or user == "YOUR_ASURITE_ID":
            raise ValueError("A valid ASURITE username is required for SolBenchmarker.")
        self.user = user
        self.python_env = python_env

    def _generate_sbatch_script(self, script_dir: str, cpu_script_name: str, gpu_script_name: str) -> str:
        return f"""#!/bin/bash
#SBATCH -p general
#SBATCH -q public
#SBATCH -G 1
#SBATCH -A grp_hackathon2025
#SBATCH --reservation=hackathon2025
#SBATCH -t 0-00:10:00
#SBATCH -c 1
#SBATCH -o {script_dir}/slurm-%j.out
#SBATCH -e {script_dir}/slurm-%j.err

module load mamba/latest
source activate {self.python_env}

echo "--- STARTING CPU BENCHMARK ---"
/usr/bin/time -p python3 {script_dir}/{cpu_script_name} 2>&1
echo "--- FINISHED CPU BENCHMARK ---"

echo ""
echo "--- STARTING GPU BENCHMARK ---"
/usr/bin/time -p python3 {script_dir}/{gpu_script_name} 2>&1
echo "--- FINISHED GPU BENCHMARK ---"
"""

    def _parse_output(self, output_content: str) -> dict:
        try:
            real_times = re.findall(r"real\s+([\d.]+)", output_content)
            cpu_time = float(real_times[0]) if len(real_times) > 0 else None
            gpu_time = float(real_times[1]) if len(real_times) > 1 else None
            print("cpu_time_seconds", cpu_time, "gpu_time_seconds", gpu_time)
            return {"status": "success", "cpu_time_seconds": cpu_time, "gpu_time_seconds": gpu_time}
        except (IndexError, ValueError) as e:
            return {"status": "error", "message": f"Failed to parse benchmark times. Error: {e}", "raw_log": output_content}

    def run_benchmark(self, cpu_code: str, gpu_code: str) -> dict:
        benchmark_dir = os.path.join(os.getcwd(), "benchmark_files")
        os.makedirs(benchmark_dir, exist_ok=True)
        
        cpu_script_path = os.path.join(benchmark_dir, "cpu_benchmark.py")
        gpu_script_path = os.path.join(benchmark_dir, "gpu_benchmark.py")
        sbatch_path = os.path.join(benchmark_dir, "benchmark_job.sh")

        try:
            with open(cpu_script_path, "w") as f: f.write(cpu_code)
            with open(gpu_script_path, "w") as f: f.write(gpu_code)
            sbatch_script = self._generate_sbatch_script(benchmark_dir, "cpu_benchmark.py", "gpu_benchmark.py")
            with open(sbatch_path, "w") as f: f.write(sbatch_script)

            process = subprocess.run(f"sbatch {sbatch_path}", shell=True, capture_output=True, text=True)
            if process.returncode != 0: raise RuntimeError(f"sbatch submission failed: {process.stderr}")

            job_id_match = re.search(r"Submitted batch job (\d+)", process.stdout.strip())
            if not job_id_match: raise RuntimeError(f"Could not parse Job ID from sbatch output: {process.stdout}")
            job_id = job_id_match.group(1)
            print(f"--> [Agent 4] Submitted benchmark job to SLURM with ID: {job_id}")

            print("--> [Agent 4] Waiting for job to complete...")
            while True:
                queue_process = subprocess.run(f"squeue -u {self.user} -j {job_id}", shell=True, capture_output=True, text=True)
                if job_id not in queue_process.stdout: break
                time.sleep(10)

            print(f"--> [Agent 4] Job {job_id} completed.")
            output_file_path = os.path.join(benchmark_dir, f"slurm-{job_id}.out")
            
            if not os.path.exists(output_file_path):
                 err_file_path = os.path.join(benchmark_dir, f"slurm-{job_id}.err")
                 if os.path.exists(err_file_path):
                     with open(err_file_path, "r") as f: error_content = f.read()
                     return {"status": "error", "message": f"Job failed. See error log: {error_content}"}
                 return {"status": "error", "message": f"Output file not found."}

            with open(output_file_path, "r") as f: output_content = f.read()
            return self._parse_output(output_content)
        except Exception as e:
            return {"status": "error", "message": str(e)}

In [4]:
# =================================================================
# AI TUTOR AGENT (DEFINITIVE, DYNAMIC & FULLY ABSTRACT FINAL)
# =================================================================

class SearchQueryGenerator(BaseModel):
    queries: List[str] = Field(description="A list of targeted, keyword-focused search queries.")

# --- Part 1: Fully Abstracted Search Query Generation ---
def generate_search_queries(query: str, llm) -> List[str]:
    """
    Uses a dynamic prompt to generate language-and-library-specific search queries.
    """
    print("-> Using LLM with FULLY ABSTRACTED prompt to generate search queries...")
    
    # DYNAMIC PROMPT: The LLM now identifies the correct libraries and language on its own.
    prompt_template = PromptTemplate(
        template="""
        You are an expert at generating web search queries for a technical audience.
        Analyze the user's question to identify the core technical task and the programming language.
        Based on your knowledge, generate 5 concise, targeted search queries. Two queries should be for the standard, CPU-based library for that task/language. Three queries should be for potential GPU-accelerated libraries for that task/language, prioritizing NVIDIA-based solutions if they exist.

        User Question: "{question}"
        
        Generate a JSON list of 5 search query strings.
        """,
        input_variables=["question"],
    )
    
    query_generation_chain = prompt_template | llm.with_structured_output(SearchQueryGenerator)

    try:
        response_model = query_generation_chain.invoke({"question": query})
        print(f"-> Generated queries: {response_model.queries}")
        return response_model.queries
    except Exception as e:
        print(f"-> LLM failed to generate structured output: {e}")
        return []

# --- Part 2: The Main Search Function (Unchanged) ---
def dynamic_search_agentic(queries: List[str]) -> list[str]:
    print("-> Executing dynamic search...")
    all_urls = set()
    for q in queries:
        try:
            enhanced_query = f"{q} site:developer.nvidia.com OR site:medium.com/rapids-ai OR site:medium.com/cupy-team"
            search_results = list(search(enhanced_query, num_results=2))
            for url in search_results:
                all_urls.add(url)
        except Exception as e:
            print(f"An error occurred during search for query '{q}': {e}")
            continue
    final_urls = [url for url in list(all_urls) if url]
    print(f"-> Found {len(final_urls)} unique URLs: {final_urls}")
    return final_urls

def _extract_python_code(markdown_text: str) -> Dict[str, str]:
    """Helper function to parse CPU and GPU code blocks from the LLM's response."""
    # This helper function remains the same as before...
    code_pattern = r"```python\n(.*?)\n```"
    gpu_heading_pattern = r"### Recommended GPU Solution.*?\n"
    cpu_heading_pattern = r"### Standard CPU Solution.*?\n"
    gpu_section_match = re.search(gpu_heading_pattern, markdown_text, re.DOTALL | re.IGNORECASE)
    cpu_section_match = re.search(cpu_heading_pattern, markdown_text, re.DOTALL | re.IGNORECASE)
    gpu_code = ""
    cpu_code = ""
    if gpu_section_match:
        section_start = gpu_section_match.end()
        code_match = re.search(code_pattern, markdown_text[section_start:], re.DOTALL)
        if code_match: gpu_code = code_match.group(1).strip()
    if cpu_section_match:
        section_start = cpu_section_match.end()
        code_match = re.search(code_pattern, markdown_text[section_start:], re.DOTALL)
        if code_match: cpu_code = code_match.group(1).strip()
    return {"cpu_code": cpu_code, "gpu_code": gpu_code}

# --- Part 3: The Definitive Dynamic RAG Pipeline ---
def process_with_rag(query: str) -> str:
    print("--- Running FINAL DYNAMIC RAG Pipeline ---")

    host_node = socket.gethostname()
    llm = ChatOllama(model="qwen3:14b", base_url=f"http://kvinod@{host_node}:11434/")

    search_queries = generate_search_queries(query, llm)
    urls = []
    if search_queries:
        urls = dynamic_search_agentic(search_queries)

    context_text = ""
    if urls:
        print("-> Found documents. Loading and processing context...")
        docs = [WebBaseLoader(url).load() for url in urls]
        docs_list = [item for sublist in docs for item in sublist]
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap=100)
        doc_splits = text_splitter.split_documents(docs_list)
        embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        vectorstore = Chroma.from_documents(documents=doc_splits, embedding=embedding_model, collection_name="rag-chroma")
        retriever = vectorstore.as_retriever()
        retrieved_docs = retriever.invoke(query)
        context_text = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])
        vectorstore.delete_collection()

    # THE DEFINITIVE, FULLY ABSTRACTED "NVIDIA-FIRST EDUCATIONAL PRIORITY" PROMPT
    final_prompt_template = PromptTemplate(
        template="""
        You are a friendly and knowledgeable AI Tutor for a project focused on data science acceleration. Your primary mission is to educate users on leveraging NVIDIA-based GPU libraries.

        First, analyze the user's QUESTION to identify the core task and programming language. Based on your knowledge, determine if a common NVIDIA-based GPU-accelerated library (like cuPy, cuDF, Rapids) exists for that specific task and language.

        Then, follow the appropriate path below to structure your conversational and helpful answer:

        **PATH 1: An NVIDIA-based GPU-accelerated library EXISTS for this task.**
        1.  Identify the standard CPU library and the NVIDIA GPU library for the user's language and task.
        2.  Start with a friendly opening that explains you will show both the GPU-accelerated and standard methods.
        3.  Provide a heading for the GPU solution, dynamically inserting the library name (e.g., `### Recommended GPU Solution (with [GPU Library Name])`).
        4.  Write the code example for the GPU solution.
        5.  Add a "Performance Note" section. Explain the benefits of the GPU approach (e.g., for large datasets) and the trade-offs (e.g., performance on small data vs. large data, data transfer overhead).
        6.  Provide a heading for the CPU solution, dynamically inserting the library name (e.g., `### Standard CPU Solution (with [CPU Library Name])`).
        7.  Write the CPU-based code for comparison.
        8.  Do NOT add any disclaimer note at the end.

        **PATH 2: An NVIDIA-based GPU-accelerated library DOES NOT EXIST for this task.**
        1.  Identify the standard library for the user's specified language and task.
        2.  Start with a friendly opening explaining the standard approach.
        3.  Provide a heading for the standard solution, dynamically inserting the library name (e.g., `### Standard Solution (with [Library Name])`).
        4.  Write the code example using the identified standard library.
        5.  End your entire response with the exact sentence: "Note: The provided solution is the standard method for this task, as a direct NVIDIA-based GPU library for it is not common."

        Use the CONTEXT below to inform your answer if it is relevant, but your primary instruction is to follow the mission and logic paths described above.

        CONTEXT:
        {context}

        QUESTION:
        {question}

        YOUR FINAL ANSWER:
        """,
        input_variables=["context", "question"],
    )
    
    final_chain = final_prompt_template | llm
    llm_response_text = final_chain.invoke({"context": context_text, "question": query}).content
    asurite_id = "kvinod"

    print("--> [Agent 3] Generated conversational answer.")

    extracted_code = _extract_python_code(llm_response_text)
    cpu_code = extracted_code["cpu_code"]
    gpu_code = extracted_code["gpu_code"]
    
    if cpu_code and gpu_code:
        print("--> [Agent 3] Both CPU and GPU code found. Invoking Agent 4 for benchmarking.")
        try:
            benchmarker = SolBenchmarker(user=asurite_id)
            benchmark_results = benchmarker.run_benchmark(cpu_code, gpu_code)
            
            if benchmark_results.get("status") == "success":
                cpu_time = benchmark_results.get('cpu_time_seconds')
                gpu_time = benchmark_results.get('gpu_time_seconds')
                print(benchmark_results.get('raw_log', 'No raw log available.'))
                
                # --- THIS IS THE FIX ---
                # We now check if the time values are None before formatting them.
                cpu_time_str = f"{cpu_time:.4f} seconds" if cpu_time is not None else "N/A (script may have failed)"
                gpu_time_str = f"{gpu_time:.4f} seconds" if gpu_time is not None else "N/A (script may have failed)"
                
                benchmark_md = "\n\n---\n### 📊 Real-World Benchmark Results (from ASU's Sol Supercomputer)\n"
                benchmark_md += "| Metric | Result |\n|---|---|\n"
                benchmark_md += f"| CPU Time | {cpu_time_str} |\n"
                benchmark_md += f"| GPU Time | {gpu_time_str} |\n"
                
                if cpu_time and gpu_time and gpu_time > 0:
                    speedup = cpu_time / gpu_time
                    benchmark_md += f"| **Speedup** | **{speedup:.2f}x faster on GPU!** |\n"
                
                llm_response_text += benchmark_md
            else:
                llm_response_text += f"\n\n---\n### ⚠️ Benchmark Failed\nCould not retrieve benchmark results: {benchmark_results.get('message')}"
    
        except ValueError as e:
            llm_response_text += f"\n\n---\n### ⚠️ Benchmark Skipped\nConfiguration error: {e}"
    else:
        print("--> [Agent 3] Did not find both code types. Skipping benchmark.")
        
    print("--- Pipeline Complete ---")
    print(llm_response_text)
    return llm_response_text
    # return answer

# Gradio

In [None]:
# =================================================================
# GRADIO CHAT APPLICATION FOR AI TUTOR (FINAL)
#
# This version fixes the UserWarning by specifying the modern 'messages'
# format for the chatbot component.
# =================================================================

import gradio as gr
import re
import time

# This wrapper function connects our backend logic to the Gradio UI.
# It assumes 'process_with_rag' is defined and available in the notebook environment.
def tutor_chat_interface(user_message, history):
    # The 'history' is now a list of dictionaries. Append the new user message.
    history.append({"role": "user", "content": user_message})
    
    # Show a "thinking..." message while processing
    yield history + [{"role": "assistant", "content": "Thinking..."}], "Thinking..."

    # Call our existing RAG pipeline function
    full_response = process_with_rag(user_message)

    # Use regex to separate the <think> block from the final answer
    think_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
    think_match = think_pattern.search(full_response)
    
    thought_process = "No thought process was found in the response."
    if think_match:
        # Extract the thought process and clean it up
        thought_process = think_match.group(1).strip()
        # Remove the <think> block from the final answer shown to the user
        final_answer = think_pattern.sub("", full_response).strip()
    else:
        # If no <think> block, the whole response is the answer
        final_answer = full_response

    # Update the chatbot history with the AI's clean answer
    history.append({"role": "assistant", "content": final_answer})
    
    # Return the final history and the extracted thought process
    yield history, thought_process


# Build the Gradio UI using Blocks for more control
with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {background-color: #f5f5f5;}") as demo:
    gr.Markdown("# 🤖 AI Accelerated Data Science Tutor")
    gr.Markdown("Ask a question about a data science task. The tutor will provide an explanation and code, prioritizing NVIDIA GPU-accelerated solutions where possible.")

    # THE FIX IS HERE: Added 'type="messages"' to the chatbot component.
    chatbot = gr.Chatbot(label="Conversation", height=450, bubble_full_width=False, type="messages")
    
    with gr.Accordion("🔎 Show Agent's Thought Process", open=False):
        cot_output = gr.Markdown("The agent's reasoning will appear here after it responds.")

    with gr.Row():
        msg_textbox = gr.Textbox(
            label="Your Question",
            placeholder="e.g., How do I multiply two 10x10 arrays in Python?",
            scale=4,
            autofocus=True,
            container=False # This makes the textbox look cleaner
        )
        submit_btn = gr.Button("Ask", variant="primary", scale=1, min_width=150)

    # Main function to handle the chat logic
    def handle_submit(user_message, chat_history):
        response_generator = tutor_chat_interface(user_message, chat_history)
        
        # The generator now yields the full message history and the thought process
        for history_state, thought_process in response_generator:
             yield history_state, thought_process


    # Connect the submit button and textbox to the handler function
    submit_btn.click(
        handle_submit, 
        [msg_textbox, chatbot], 
        [chatbot, cot_output]
    ).then(lambda: gr.update(value=""), None, [msg_textbox], queue=False)

    msg_textbox.submit(
        handle_submit, 
        [msg_textbox, chatbot], 
        [chatbot, cot_output]
    ).then(lambda: gr.update(value=""), None, [msg_textbox], queue=False)


# Launch the application
# Set share=True if you need a public link from the Sol jupyter notebook
demo.launch(share=True, debug=True)

  chatbot = gr.Chatbot(label="Conversation", height=450, bubble_full_width=False, type="messages")


* Running on local URL:  http://127.0.0.1:7862
* Running on public URL: https://afa64ec6a36a1c2692.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


--- Running FINAL DYNAMIC RAG Pipeline ---
-> Using LLM with FULLY ABSTRACTED prompt to generate search queries...
-> Generated queries: ['python multiply 2 5x5 arrays', 'numpy matrix multiplication 5x5', 'gpu accelerated matrix multiplication python', 'cuda matrix multiplication python', 'pycuda matrix multiplication 5x5']
-> Executing dynamic search...
-> Found 8 unique URLs: ['https://forums.developer.nvidia.com/t/how-to-increase-speed-transfer-of-matrices-gpu-cpu-for-matrix-multiplication-it-is-the-limiting-factor/56251', 'https://forums.developer.nvidia.com/t/cublassgemv-returning-not-expected-values/10415', 'https://developer.nvidia.com/nvmath-python', 'https://forums.developer.nvidia.com/t/need-help-in-implementing-matrix-multiplication-using-shared-memory-in-numba/111461', 'https://developer.nvidia.com/blog/fusing-epilog-operations-with-matrix-multiplication-using-nvmath-python/', 'https://www.google.com/search?num=4', 'https://forums.developer.nvidia.com/t/problem-of-two-large

/packages/envs/genai25.06/compiler_compat/ld: /packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.5.0-yaosn2wjlhxqbokllnobo2soiuh6gw3n/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/packages/envs/genai25.06/compiler_compat/ld: /packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.5.0-yaosn2wjlhxqbokllnobo2soiuh6gw3n/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/packages/envs/genai25.06/compiler_compat/ld: /packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.5.0-yaosn2wjlhxqbokllnobo2soiuh6gw3n/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/packages/envs/genai25.06/compiler_compat/ld: /packages/apps/spack/21/opt/spack/linux-rocky8-zen3/gcc-12.1.0/cuda-12.5.0-yaosn2wjlhxqbokllnobo2soiuh6gw3n/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/packages/envs/genai25.06/compiler_c

--> [Agent 3] Generated conversational answer.
--> [Agent 3] Both CPU and GPU code found. Invoking Agent 4 for benchmarking.
--> [Agent 4] Submitted benchmark job to SLURM with ID: 28544984
--> [Agent 4] Waiting for job to complete...
--> [Agent 4] Job 28544984 completed.
cpu_time_seconds 0.42 gpu_time_seconds 3.62
No raw log available.
--- Pipeline Complete ---
忧

</think>

### Recommended GPU Solution (with Numba and CUDA)

Here's how you can perform matrix multiplication using shared memory in **Numba with CUDA** for two 5x5 arrays:

```python
import numpy as np
from numba import cuda

@cuda.jit
def matmul_shared(A, B, C):
    # Define the block size (thread block dimensions)
    block_size = (16, 16)  # This is the size of the thread block

    # Get the thread's position within the block
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # Get the block's position within the grid
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y

    # Get the thread's position within the gr