In [1]:
import os
import numpy as np
from openai import AzureOpenAI
from dotenv import load_dotenv, find_dotenv
from sklearn.metrics.pairwise import cosine_similarity
load_dotenv(find_dotenv())

# Azure OpenAI connection settings are read from the environment (populated
# from a local .env file by the load_dotenv() call above) so that no credential
# is stored in the notebook or its version history.
#   AZURE_OPENAI_API_KEY   - required; a KeyError is raised here if it is missing,
#                            which fails fast instead of at the first API call.
#   AZURE_OPENAI_ENDPOINT  - optional; falls back to the lab endpoint below.
# NOTE(review): the key that used to be hardcoded in this cell should be treated
# as compromised and rotated.
client = AzureOpenAI(
    azure_endpoint=os.environ.get(
        "AZURE_OPENAI_ENDPOINT",
        "https://oai-lab-test-eastus-001.openai.azure.com/",
    ),
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2023-05-15",
)
# Name of the embedding model deployment passed as `model=` to every
# embeddings request in this notebook.
deployment_name = "text-embedding-ada-002"

In [2]:
def read_files_from_directory(directory_path):
    """
    Load every regular file directly inside `directory_path` as UTF-8 text.

    Sub-directories are ignored (no recursion). A file that cannot be opened
    or decoded is reported on stdout and left out of the result instead of
    aborting the whole scan.

    Args:
        directory_path: Directory to scan.

    Returns:
        dict mapping each file's name (not its full path) to its text content.
    """
    contents_by_name = {}

    for entry_name in os.listdir(directory_path):
        full_path = os.path.join(directory_path, entry_name)

        # Only plain files are read; directories and other entries are skipped.
        if not os.path.isfile(full_path):
            continue

        try:
            with open(full_path, 'r', encoding='utf-8') as handle:
                contents_by_name[entry_name] = handle.read()
        except Exception as err:
            # Best-effort: one unreadable file must not stop the others.
            print(f"Error reading {entry_name}: {err}")

    return contents_by_name

In [3]:
def generate_embeddings(texts):
    """
    Embed each text in a mapping, one embeddings request per entry.

    Requests go through the module-level `client` using `deployment_name`
    as the model. An entry whose request fails is reported on stdout and
    omitted from the result; the remaining entries are still processed.

    Args:
        texts: Mapping of key (e.g. a filename) -> text to embed.

    Returns:
        dict mapping each successfully embedded key to its embedding vector
        (the `embedding` of the first item in the response's `data`).
    """
    vectors_by_key = {}

    for doc_key, doc_text in texts.items():
        try:
            api_result = client.embeddings.create(
                input=doc_text,
                model=deployment_name,
            )
            first_record = api_result.data[0]
            vectors_by_key[doc_key] = first_record.embedding
        except Exception as err:
            # Best-effort: skip this entry and carry on with the rest.
            print(f"Error generating embedding for {doc_key}: {err}")

    return vectors_by_key

In [4]:
def find_similar_documents(query, file_embeddings, file_contents):
    """
    Rank every embedded document against a free-text query.

    The query is embedded through the module-level `client` (model
    `deployment_name`) and compared with each stored vector by cosine
    similarity.

    Args:
        query: Text to embed and compare against the documents.
        file_embeddings: Mapping of filename -> embedding vector.
        file_contents: Accepted for the caller's convenience; the body
            never reads it.

    Returns:
        List of (filename, similarity_score) tuples for all documents,
        highest score first.

    Note:
        A later cell defines `find_similar_documents` again with a
        `num_results` parameter; once that cell has run, the name refers
        to that definition instead of this one.
    """
    embedded_query = client.embeddings.create(
        input=query,
        model=deployment_name,
    ).data[0].embedding

    def _score(doc_vector):
        # cosine_similarity works on 2-D inputs, so wrap both vectors as
        # single-row matrices and unwrap the 1x1 result.
        return cosine_similarity([embedded_query], [doc_vector])[0][0]

    scores_by_file = {
        doc_name: _score(doc_vector)
        for doc_name, doc_vector in file_embeddings.items()
    }

    # Best match first.
    ranked = list(scores_by_file.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [5]:
def find_similar_documents(query, file_embeddings, num_results=5):
    """
    Return the documents most similar to a query, ranked by cosine similarity.

    The query is embedded through the module-level `client` (model
    `deployment_name`) and scored against all stored vectors in a single
    `cosine_similarity` call.

    Args:
        query: Text to embed and compare against the documents.
        file_embeddings: Mapping of filename -> embedding vector. All vectors
            are expected to have the same length as the query embedding.
        num_results: Maximum number of matches to return. `None` returns
            every document. Must not be negative.

    Returns:
        List of (filename, similarity_score) tuples, highest score first,
        with at most `num_results` entries. Empty when there are no
        documents or `num_results` is 0.

    Raises:
        ValueError: If `num_results` is negative. A negative value would
            otherwise be interpreted as a slice bound and silently drop
            results from the low end of the ranking.
    """
    if num_results is not None and num_results < 0:
        raise ValueError(f"num_results must be non-negative, got {num_results}")

    # Nothing to rank or nothing requested: skip the embeddings request.
    if not file_embeddings or num_results == 0:
        return []

    # Generate embedding for the query
    query_response = client.embeddings.create(
        input=query,
        model=deployment_name
    )
    query_embedding = query_response.data[0].embedding

    # Score all documents at once: a 1 x N similarity matrix whose single
    # row lines up with `filenames`.
    filenames = list(file_embeddings.keys())
    document_vectors = [file_embeddings[name] for name in filenames]
    scores = cosine_similarity([query_embedding], document_vectors)[0]

    # Sort by similarity score (highest first); ties keep insertion order.
    sorted_results = sorted(
        zip(filenames, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Return the top num_results (a None bound keeps everything)
    return sorted_results[:num_results]

In [6]:
# Example usage: end-to-end run of the pipeline defined in the cells above
# (read files -> embed each file -> rank files against one query).
# The names bound here (directory_path, file_contents, file_embeddings,
# query, results) remain in the kernel namespace after the cell runs.

# Directory holding the documents to index. The path is relative, so it is
# resolved against the kernel's current working directory.
directory_path = "../data/phase2_data"  # Change this to your actual directory path

# Read all files from the directory (filename -> text). Files that cannot be
# read are reported and skipped, so the count may be lower than the number of
# directory entries.
print("Reading files...")
file_contents = read_files_from_directory(directory_path)
print(f"Found {len(file_contents)} files")

# Generate embeddings for all files: one embeddings API request per file.
# Files whose request fails are skipped, so compare this count with the one
# printed above to spot failures.
print("Generating embeddings...")
file_embeddings = generate_embeddings(file_contents)
print(f"Generated embeddings for {len(file_embeddings)} files")

# Perform a similarity search. The `num_results` keyword requires the
# find_similar_documents definition that accepts it (query, file_embeddings,
# num_results=5).
query = "What discounts do I have for dental?"  # Example query
print(f"Searching for documents similar to query: '{query}'")
results = find_similar_documents(query, file_embeddings, num_results=3)

# Display results, best match first, with the score rounded to 4 decimals.
print("\nSearch Results:")
for filename, score in results:
    print(f"{filename}: Similarity score {score:.4f}")


Reading files...
Found 6 files
Generating embeddings...
Generated embeddings for 6 files
Searching for documents similar to query: 'What discounts do I have for dental?'

Search Results:
dentel_services.html: Similarity score 0.7619
alternative_services.html: Similarity score 0.7361
workshops_services.html: Similarity score 0.7304
