In [None]:
import os
import openai
import pandas as pd
import numpy as np
from azure.cosmos import CosmosClient
from sklearn.neighbors import NearestNeighbors

In [None]:
# --- Cosmos DB Setup ---
# The account key is a secret: it is read from the environment only and must never be
# written into the notebook. Any key that was previously committed here should be
# treated as compromised and rotated.
url = os.getenv('COSMOS_DB_URL', 'https://khipus-rag.documents.azure.com:443/')
key = os.getenv('COSMOS_DB_KEY')
if not key:
    # Fail fast with a clear message instead of attempting an unauthenticated connection.
    raise RuntimeError("COSMOS_DB_KEY is not set; export it before running this notebook.")
client = CosmosClient(url, credential=key)
database_name = 'rag-cosmos-db'
database = client.get_database_client(database_name)
container_name = 'data'
container = database.get_container_client(container_name)

In [None]:

# --- DataFrame Creation ---
# Each entry may carry a tracking query string ("?WT.mc_id=..."); only the part before
# the "?" is an actual file path on disk.
data_paths = [
    "data/frameworks.md?WT.mc_id=academic-105485-koreyst", 
    "data/own_framework.md?WT.mc_id=academic-105485-koreyst", 
    "data/perceptron.md?WT.mc_id=academic-105485-koreyst"
]
# Collect the rows first and build the frame once, rather than growing it row by row.
records = []
for path in data_paths:
    actual_path = path.split('?')[0]
    with open(actual_path, 'r', encoding='utf-8') as file:
        file_content = file.read()
    records.append({'path': actual_path, 'text': file_content})
# Passing `columns` keeps the expected schema even when no files were listed.
df = pd.DataFrame(records, columns=['path', 'text'])

In [None]:
# --- Text Splitting ---
def split_text(text, max_length, min_length):
    """Split ``text`` into chunks of whole, space-joined words.

    Words are accumulated until the chunk is longer than ``min_length`` characters,
    at which point the chunk is emitted. If adding the next word would make the
    chunk reach ``max_length`` or more, the words gathered so far are emitted first,
    so one long word can no longer cause the rest of the text to pile up into a
    single unbounded chunk.

    Args:
        text: The text to split; it is tokenised on whitespace.
        max_length: Length (in characters) a multi-word chunk must stay below.
        min_length: A chunk is emitted as soon as it is longer than this.

    Returns:
        A list of chunk strings. Joining the chunks with spaces yields the same
        word sequence as ``text.split()``. A single word that is itself
        ``max_length`` or longer is emitted as its own chunk (words are never cut).
        An empty or whitespace-only ``text`` yields ``[]``.
    """
    chunks = []
    current_chunk = []
    # Running length of ' '.join(current_chunk); avoids re-joining on every word.
    current_length = 0
    for word in text.split():
        if current_chunk:
            joined_length = current_length + 1 + len(word)  # +1 for the separating space
        else:
            joined_length = len(word)
        # Adding this word would overshoot the limit: emit what we have and start afresh.
        if current_chunk and joined_length >= max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            joined_length = len(word)
        current_chunk.append(word)
        current_length = joined_length
        if current_length > min_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
    # Whatever is left over (at most min_length characters) becomes the final chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Work on a copy so the frame holding the raw documents is left untouched.
splitted_df = df.copy()
splitted_df['chunks'] = splitted_df['text'].apply(lambda x: split_text(x, 400, 300))
# explode() repeats the parent row's index label for every chunk and produces a NaN
# chunk for a document that yielded no chunks. Drop those rows (they cannot be
# embedded) and renumber so that every chunk row has a unique label.
flattened_df = (
    splitted_df.explode('chunks')
    .dropna(subset=['chunks'])
    .reset_index(drop=True)
)

In [None]:
# --- Azure OpenAI Configuration ---
openai.api_type = "azure"
# Set the API base to your Azure OpenAI resource endpoint (do not include deployment paths here)
openai.api_base = "https://khipus-aoai.openai.azure.com"
openai.api_version = "2024-08-01-preview"
# The API key is a secret: it is read from the environment only and must never be
# written into the notebook. Any key that was previously committed here should be
# treated as compromised and rotated.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast here rather than on the first embedding or chat request.
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook.")

In [None]:
# --- Function to Create Embeddings ---
def create_embeddings(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` and return the resulting vector.

    Args:
        text: The string to embed; it is sent as a single-element input batch.
        model: Passed as ``engine``, i.e. the deployment name to call.

    Returns:
        The ``embedding`` field of the first (and only) item in the response.
    """
    # A one-element batch goes in, so the vector we want is the first data item.
    response = openai.Embedding.create(engine=model, input=[text])
    first_item = response.data[0]
    return first_item.embedding


In [None]:

# --- Compute and Store Embeddings ---
# One embedding request is issued per chunk. The vectors are kept both as a column
# (next to their chunk) and as a plain Python list for fitting the neighbour index.
flattened_df['embeddings'] = flattened_df['chunks'].apply(
    lambda chunk: create_embeddings(chunk)
)
embeddings_list = flattened_df['embeddings'].tolist()

In [None]:
# --- Build Nearest Neighbors Index ---
# Ask for at most 5 neighbours, but never more than the number of stored chunks:
# kneighbors() raises when n_neighbors exceeds the number of fitted samples.
nbrs = NearestNeighbors(
    n_neighbors=min(5, len(embeddings_list)),
    algorithm='ball_tree',
).fit(embeddings_list)
# For every chunk, record the positions of and distances to its nearest chunks
# (the chunk itself is part of the fitted set being queried).
distances, indices = nbrs.kneighbors(embeddings_list)
flattened_df['indices'] = indices.tolist()
flattened_df['distances'] = distances.tolist()

In [None]:
# --- Chatbot Function ---
def chatbot(user_input):
    """Answer ``user_input`` using the most similar stored chunks as context.

    The question is embedded, the nearest chunks are looked up in ``nbrs``, and
    those chunks are placed in the prompt together with the question so that the
    retrieval step actually informs the answer.

    Args:
        user_input: The user's question as a string.

    Returns:
        The ``message`` of the first choice in the chat completion response.
    """
    # Convert the question to a query vector
    query_vector = create_embeddings(user_input)
    # Retrieve similar document chunks (for context); indices are row positions
    _, indices = nbrs.kneighbors([query_vector])
    retrieved_chunks = [flattened_df['chunks'].iloc[i] for i in indices[0]]
    context = "\n\n".join(str(chunk) for chunk in retrieved_chunks)

    # Send the retrieved context along with the question; sending the question
    # alone would discard the retrieval results entirely.
    messages = [
        {"role": "system", "content": "You are an AI assistant that helps with AI questions."},
        {
            "role": "user",
            "content": (
                "Use the following context to answer the question.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {user_input}"
            ),
        },
    ]
    # Create the chat completion request.
    # Note: Use the 'engine' parameter to specify your deployment name ("gpt-4o-mini").
    response = openai.ChatCompletion.create(
        engine="gpt-4o-mini",  # Your deployment name in Azure OpenAI
        temperature=0.7,
        max_tokens=800,
        messages=messages
    )
    return response.choices[0].message

In [None]:
# --- Example Usage ---
# Ask one sample question end to end: embed it, look up neighbouring chunks and
# request a chat completion (this performs live API calls).
user_input = "what is a perceptron?"
response_message = chatbot(user_input)
# Prints the message object returned by chatbot(), not just its text content.
print(response_message)