## Setup

In [1]:
from typing import List, Optional, Dict
from pathlib import Path
import os
import io
import re
import time
import json
import hashlib
import uuid


import openai
from langchain_openai import OpenAIEmbeddings
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    VectorSearchProfile
)
from tenacity import retry, wait_random_exponential, stop_after_attempt

import pandas as pd
from dotenv import dotenv_values

In [8]:
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "data"
TRAIN_DIR = DATA_DIR / "train"


ENV_VARIABLES = {
    **dotenv_values(str(BASE_DIR / ".env")),  # load environment variables from .env file
    #**os.environ,  # override loaded values with environment variables
}

print(BASE_DIR)

c:\Users\CristianDavidMarquez\Documents\research\demo_classification\backend_demo


In [4]:
ENV_VARIABLES["AZURE_SEARCH_SERVICE"]

'cscontratos'

## Utils

In [5]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def generate_embeddings(text):
    embedding_model = OpenAIEmbeddings(
        openai_api_key=ENV_VARIABLES["OPENAI_KEY"],
        model="text-embedding-3-small",
    )
    
    embeddins = embedding_model.embed_documents([text])
    return embeddins[0]

import hashlib

def create_hash(texto):
    # Crea un objeto de hash SHA-256
    sha256 = hashlib.sha256()
    
    # Actualiza el objeto de hash con la cadena de entrada codificada en bytes
    sha256.update(texto.encode('utf-8'))
    
    # Obtiene el hash en formato hexadecimal
    hash_resultado = sha256.hexdigest()
    
    return hash_resultado

def create_id():
    # uuid id
    return str(uuid.uuid4())

def validate_and_convert_documents(documents):
    for i, doc in enumerate(documents):
        for key, value in doc.items():
            if isinstance(value, int):
                print(f"Converting document {i} key {key} value {value} to string")
                doc[key] = str(value)
    return documents

In [6]:

class IndexManager:
    def __init__(self, search_service:str, search_key:str) -> None:
        self.search_key = search_key
        self.search_service = search_service
        self.index_client = SearchIndexClient(endpoint=f"https://{self.search_service}.search.windows.net/",
                                        credential=AzureKeyCredential(self.search_key))

    def create_index(
            self, 
            index_name: str, 
            fields,
            vector_search,
            semantic_search: Optional[SemanticConfiguration] = None
            ):
        print(f"Creating {index_name} search index")
        
        # Create the search index with the semantic settings
        index = SearchIndex(name=index_name, fields=fields,
                            vector_search=vector_search,  semantic_search=semantic_search)
        result = self.index_client.create_or_update_index(index)
        print(f' {result.name} created')
        return result


    def upload_documents(self, index_name: str, documents: List[Dict], batch_size: int = 1000):
        # Create a SearchClient object using the provided endpoint, index name, and credential
        search_client = SearchClient(
            endpoint=f"https://{self.search_service}.search.windows.net/",
            index_name=index_name,
            credential=AzureKeyCredential(self.search_key)
        )

        if not documents:
            print("No documents to upload")
            return
        
        results = []
        if len(documents) > batch_size:
            print("Too many documents to upload, split into smaller batches")
            for i in range(0, len(documents), batch_size):
                print("*"*16)
                print(f"Uploading documents {i} to {i+batch_size}")
                try:
                    result = search_client.upload_documents(documents=documents[i:i+batch_size])
                    # Count the number of succeeded uploads
                    succeeded = sum([1 for r in result if r.succeeded])

                    # Log the number of sections indexed and the number of successful uploads
                    print(f"\tIndexed {len(result)} sections, {succeeded} succeeded")
                    results.append(result)
                except Exception as e:
                    print(f"Error uploading documents: {e}")
            return results
            
        else:
            # Upload the documents to the search index and get the results
            results = search_client.upload_documents(documents=documents)

            # Count the number of succeeded uploads
            succeeded = sum([1 for r in results if r.succeeded])

            # Log the number of sections indexed and the number of successful uploads
            print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")

            # Return the results
            return results

## Config Structure

In [14]:
fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SimpleField(name="label", type=SearchFieldDataType.String, filterable=True),
        SearchableField(name="message", type=SearchFieldDataType.String),
        SimpleField(name="source", type=SearchFieldDataType.String, filterable=True),

        SearchField(
            name="main_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="my-vector-config",
        ),
    ]

vector_search = VectorSearch(
        profiles=[VectorSearchProfile(name="my-vector-config", algorithm_configuration_name="my-algorithms-config")],
        algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
    )

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="message")]
    )
)

# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

## Create Index

In [15]:
## Index manager
index_manager = IndexManager(
    search_service=ENV_VARIABLES["AZURE_SEARCH_SERVICE"],
    search_key=ENV_VARIABLES["AZURE_SEARCH_KEY"]
)

In [16]:
index_manager.create_index(
    index_name=ENV_VARIABLES["AZURE_SEARCH_INDEX"],
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search
)

Creating classification-index search index
 classification-index created


<azure.search.documents.indexes.models._index.SearchIndex at 0x1f30298be50>

## Load data

In [9]:
df_train = pd.read_csv(TRAIN_DIR / 'train.csv')
df_train.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
embedding_model = OpenAIEmbeddings(
        openai_api_key=ENV_VARIABLES["OPENAI_KEY"],
        model="text-embedding-3-small",
    )
    
batch_size = 1000
df_vector = pd.DataFrame()
for i in range(0, len(df_train), batch_size):
    batch = df_train.iloc[i:i+batch_size]
    texts = batch['message'].tolist()
    embeddings = embedding_model.embed_documents(texts)
    batch["main_vector"] = embeddings
    df_vector = pd.concat([df_vector, batch], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch["main_vector"] = embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch["main_vector"] = embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch["main_vector"] = embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [19]:
records = df_vector.to_dict(orient='records')
for d in records:
    d['id'] = create_id()


In [20]:
## index data
results = index_manager.upload_documents(
    index_name=ENV_VARIABLES["AZURE_SEARCH_INDEX"],
    documents=records,
    batch_size=1000
)

Too many documents to upload, split into smaller batches
****************
Uploading documents 0 to 1000
	Indexed 1000 sections, 1000 succeeded
****************
Uploading documents 1000 to 2000
	Indexed 1000 sections, 1000 succeeded
****************
Uploading documents 2000 to 3000
	Indexed 1000 sections, 1000 succeeded
****************
Uploading documents 3000 to 4000
	Indexed 1000 sections, 1000 succeeded
****************
Uploading documents 4000 to 5000
	Indexed 1000 sections, 1000 succeeded
****************
Uploading documents 5000 to 6000
	Indexed 272 sections, 272 succeeded
