In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from azure.cosmos import CosmosClient
from typing import Dict, Any
import json

import os
from dotenv import load_dotenv
load_dotenv()

# Azure OpenAI endpoint and key, read from the environment (load_dotenv() above
# loads a local .env file first). os.getenv returns None for an unset variable,
# and nothing here checks for that.
OPEN_AI_ENDPOINT = os.getenv("OPEN_AI_ENDPOINT")
subscription_key = os.getenv("SUBSCRIPTION_KEY")


In [18]:
# Chat model client used by the prompt chain in the next cell.
# NOTE(review): no deployment name is passed — presumably the endpoint URL or the
# environment identifies the deployment; confirm.
llm = AzureChatOpenAI(
    api_version="2025-01-01-preview",
    azure_endpoint=OPEN_AI_ENDPOINT,
    api_key=subscription_key,
    temperature=0.1,     
)

In [19]:
# Smoke test of the chat model: a system instruction plus a templated user
# message, piped into the LLM.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a Professor. You are to answer the question in 20-30 words "),
    # Fixed typo in the prompt text ("tell me a about") that was sent to the model verbatim.
    ("user", "tell me about {topic}"),
])

# `prompt | llm` composes the two into a single runnable.
chain = prompt | llm

# The {topic} placeholder in the user message is filled from this dict.
response = chain.invoke({"topic": "Vector Databases"})
print(response.content)

Vector databases store and manage high-dimensional vector embeddings, enabling efficient similarity search and retrieval for applications like recommendation systems, natural language processing, and computer vision tasks.


In [20]:
# Embedding model: text-embedding-ada-002
EMBEDDING_MODEL_ENDPOINT = os.getenv("EMBEDDING_MODEL_ENDPOINT")

# Embeddings client; it reuses the key that the chat model client is given.
# NOTE(review): no deployment name or API version is passed — presumably they come
# from the endpoint URL or the environment; confirm.
embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=EMBEDDING_MODEL_ENDPOINT,
    api_key=subscription_key,
)

In [21]:
# Sanity check: embed a sample query and look at the length of the returned vector.
query = "What is Azure OpenAI?"
query_embedding = embedding_model.embed_query(query)

# Print the dimension of the embedding
print(len(query_embedding))

1536


In [22]:
# Names of the Cosmos DB database and container, read from the environment.
DATABASE_NAME = os.getenv("DATABASE_NAME")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")

In [23]:
COSMOS_CONNECTION_STRING = os.getenv("COSMOS_CONNECTION_STRING")

# Cosmos client connection using the connection string
client = CosmosClient.from_connection_string(COSMOS_CONNECTION_STRING)
# Handles for the database and container named by DATABASE_NAME / CONTAINER_NAME;
# every insert and query function in this notebook goes through `container`.
database = client.get_database_client(DATABASE_NAME)
container = database.get_container_client(CONTAINER_NAME)

In [24]:
def insert_document(test_id, content):
    """Embed ``content`` and store it in the Cosmos container as a new item.

    The stored item has three fields: ``id`` (``test_id`` converted to a
    string), ``content`` (the raw text) and ``content_vector`` (the embedding
    of that text). A confirmation line is printed once the item is created.
    """
    # Embed first so nothing is written if the embedding call fails.
    embedding = embedding_model.embed_query(content)
    container.create_item(
        body=dict(id=str(test_id), content=content, content_vector=embedding)
    )
    print(f"Inserted document with test_id: {test_id}")


In [25]:
# Check whether a document with this id already exists
def check_document_exists(test_id):
    """Return True when the container already holds an item whose id is ``test_id``.

    The id is bound as the ``@id`` query parameter rather than formatted into
    the SQL text, so quotes or other special characters in ``test_id`` cannot
    alter the query. It is converted to ``str`` first: the query used to compare
    against a quoted string literal, and ``insert_document`` stores ids as
    strings, so a numeric ``test_id`` must still match its string form.

    Only ``c.id`` is selected; existence does not need the full document (which
    includes the embedding vector).
    """
    matches = container.query_items(
        query="SELECT c.id FROM c WHERE c.id = @id",
        parameters=[{"name": "@id", "value": str(test_id)}],
        enable_cross_partition_query=True,
    )
    # Stop at the first hit instead of materialising every matching item.
    return any(True for _ in matches)


In [26]:
def vector_search(query_text, top_k=5):
    """Return the ``top_k`` items closest to ``query_text`` by vector distance.

    The query text is embedded, then a parameterised ``VectorDistance`` query
    is run against ``c.content_vector``. Each returned item carries ``id``,
    ``content`` and ``similarity_score``. Any exception raised while querying
    is printed and an empty list is returned instead.

    NOTE: a later cell of this notebook defines ``vector_search`` again
    (querying ``c.embedding``); once that cell runs it replaces this one.
    """
    # Embed the user's query text
    search_vector = embedding_model.embed_query(query_text)

    # SQL text with the vector distance calculation, assembled line by line
    sql = (
        "\n"
        "    SELECT TOP @top_k \n"
        "        c.id,\n"
        "        c.content,\n"
        "        VectorDistance(c.content_vector, @embedding) AS similarity_score\n"
        "    FROM c\n"
        "    ORDER BY VectorDistance(c.content_vector, @embedding)\n"
        "    "
    )
    bindings = [
        {"name": "@embedding", "value": search_vector},
        {"name": "@top_k", "value": top_k},
    ]

    try:
        # Run the vector search and collect every returned item
        hits = [
            row
            for row in container.query_items(
                query=sql,
                parameters=bindings,
                enable_cross_partition_query=True,
            )
        ]
        print(f"Vector search completed - Found {len(hits)} results")
        return hits
    except Exception as e:
        print(f"Error during vector search: {e}")
        return []


In [None]:
import requests

# The data in the JSON file does not include titles, so they are fetched from the OMDb API.

OMDB_API_KEY = os.getenv("OMDB_API_KEY")

def fetch_movie_title(imdb_id: str) -> str:
    """Look up a movie title on OMDb by its IMDb id.

    Args:
        imdb_id: IMDb identifier, e.g. ``"tt0107211"``.

    Returns:
        The ``Title`` field of the OMDb response, or ``'Unknown Title'`` when
        OMDb reports no match or the response carries no title.
    """
    response = requests.get(
        # HTTPS keeps the API key in the query string off the wire in clear text.
        "https://www.omdbapi.com/",
        # Let requests URL-encode the values instead of formatting them into the URL.
        params={"i": imdb_id, "apikey": OMDB_API_KEY},
        # requests has no default timeout; bound the call so a stalled
        # connection cannot hang the whole ingestion loop.
        timeout=10,
    )
    data = response.json()
    if data.get('Response') == 'True':
        return data.get('Title', 'Unknown Title')
    # Previously this path fell through and returned None despite the
    # ``-> str`` annotation; use the same fallback as a missing 'Title' key.
    return 'Unknown Title'


In [None]:
def insert_movie_into_db(movie_data: Dict[str, Any]) -> None:
    """Insert one movie record into the Cosmos container unless it is already there.

    Reads ``movie_id``, ``plot_summary``, ``plot_synopsis``, ``genre``,
    ``release_date`` and ``rating`` from ``movie_data``. If an item with the
    same id already exists, a message is printed and nothing is written.
    Otherwise the title is fetched via ``fetch_movie_title``, the combined plot
    text is embedded, and the resulting document is created in the container.
    """
    movie_id = movie_data["movie_id"]

    # Guard clause: skip movies that are already stored.
    if check_document_exists(movie_id):
        print(f"Document with ID {movie_id} already exists. Skipping insertion.")
        return

    title = fetch_movie_title(movie_id)
    print(title)

    summary = movie_data["plot_summary"]
    synopsis = movie_data["plot_synopsis"]
    # The text that gets embedded: summary and synopsis, each with its label.
    embed_text = "".join(("plot_summary: ", summary, "  ", "plot_synopsis:", synopsis))
    vector = embedding_model.embed_query(embed_text)

    container.create_item(
        body={
            "id": movie_id,
            "title": title,
            "genres": movie_data["genre"],
            "year": movie_data["release_date"],
            "rating": movie_data["rating"],
            "plot_summary": summary,
            "plot_synopsis": synopsis,
            "embedding": vector,
        }
    )
    print(f"Inserted movie with ID: {movie_id}, Title: {title}")

    

In [None]:
def process_movie_data(file_path: str) -> None:
    """Load a JSON file of movies and insert each entry into the database.

    The file at ``file_path`` is read as UTF-8 JSON and every element is passed
    to ``insert_movie_into_db`` in order, with progress printed along the way.
    Errors are reported by printing, not raised: a JSON decoding failure gets
    its own message, and any other exception (including one raised while
    inserting a movie) ends the run with a generic message.
    """
    print("Start processing data")

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            movie_list = json.load(handle)

        total = len(movie_list)
        print(f"Successfully loaded {total} movies.")

        # Walk the list with a 1-based counter, inserting each movie in turn
        for position, movie in enumerate(movie_list, start=1):
            print(f"\nProcessing movie {position}/{total}...")
            insert_movie_into_db(movie)

    except json.JSONDecodeError as err:
        print("ERROR: Failed to decode JSON from file.")
        print(f"Details: {err}")
    except Exception as err:
        print(f"An unexpected error occurred during file processing: {err}")


In [None]:
#  FILE_PATH = "./dataset/movies.json"
#  process_movie_data(FILE_PATH)

In [31]:
def vector_search(query_text, top_k=5):
    """Find the ``top_k`` movies whose stored embedding is closest to ``query_text``.

    Embeds the query text and runs a parameterised ``VectorDistance`` query
    against ``c.embedding``. Each returned item carries ``id``, ``title``,
    ``genres``, ``rating``, ``plot_summary``, ``plot_synopsis`` and
    ``similarity_score``. Any exception raised while querying is printed and an
    empty list is returned instead.

    NOTE: this redefines the ``vector_search`` from an earlier cell, which
    queried ``c.content_vector``.
    """
    # Values bound to the @-placeholders in the SQL text below; the embedding
    # of the user's query is generated here.
    bound_values = [
        {"name": "@embedding", "value": embedding_model.embed_query(query_text)},
        {"name": "@top_k", "value": top_k},
    ]

    # SQL text with the vector distance calculation, assembled line by line
    statement = (
        "\n"
        "    SELECT TOP @top_k \n"
        "        c.id,\n"
        "        c.title,\n"
        "        c.genres,\n"
        "        c.rating,\n"
        "        c.plot_summary,\n"
        "        c.plot_synopsis,\n"
        "        VectorDistance(c.embedding, @embedding) AS similarity_score\n"
        "    FROM c\n"
        "    ORDER BY VectorDistance(c.embedding, @embedding)\n"
        "    "
    )

    try:
        # Run the vector search and materialise the result iterator
        item_iter = container.query_items(
            query=statement,
            parameters=bound_values,
            enable_cross_partition_query=True,
        )
        matches = list(item_iter)
        print(f"Vector search completed - Found {len(matches)} results")
        return matches
    except Exception as e:
        print(f"Error during vector search: {e}")
        return []

In [32]:
# Demo: run a natural-language query through the vector search and print each hit.
query = "romantic movies with happy ending"
results = vector_search(query, top_k=5)
# The loop index was never used, so iterate over the results directly.
for item in results:
    print(item)

Vector search completed - Found 5 results
{'id': 'tt0107211', 'title': 'Indecent Proposal', 'genres': ['Drama', 'Romance'], 'rating': '5.8', 'plot_summary': 'A young couple very much in love are married and have started their respective careers, she as a real estate broker, he as an architect. She finds the perfect spot to build his dream house, and they get loans to finance it. When the recession hits, they stand to lose everything they own, so they go to Vegas to have one shot at winning the money they need. After losing at the tables, they are approached by a millionaire who offers them a million dollars for a night with the wife. Though the couple agrees that this is a way out of their financial dilemma, it threatens to destroy their relationship.                Written by\nEd Sutton <esutton@mindspring.com>', 'plot_synopsis': '', 'similarity_score': 0.8036993851661897}
{'id': 'tt0832266', 'title': 'Definitely, Maybe', 'genres': ['Comedy', 'Drama', 'Romance'], 'rating': '7.2', 'plo