### Importing libraries

In [None]:
from openai import AzureOpenAI
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.text_splitter import  RecursiveCharacterTextSplitter,  CharacterTextSplitter

from langchain_community.document_loaders import JSONLoader, DirectoryLoader
from langchain_community.document_loaders import UnstructuredExcelLoader,  UnstructuredPowerPointLoader, Docx2txtLoader, PyPDFLoader,  UnstructuredWordDocumentLoader, TextLoader, UnstructuredFileLoader

from langchain.chains import RetrievalQA

from langchain.retrievers.merger_retriever import MergerRetriever
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever
from langchain_experimental.text_splitter import SemanticChunker



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# --- Azure OpenAI connection settings -------------------------------------
# The quoted values for the key and endpoint are placeholders; substitute the
# real values before running (preferably loaded from the environment rather
# than committed to the notebook).
open_ai_type = "azure"
open_ai_api_version = "2023-05-15"
open_ai_api_key = "open_ai_api_key"
open_ai_endpoint = "open_ai_endpoint"  # base endpoint

# --- Azure OpenAI deployment names ----------------------------------------
embeddings_model = "janus-text-embedding-ada-002"
gpt_deployment_4 = "janus-gpt-4"
gpt_deployment_35 = "janus-gpt-35-turbo"
gpt_deployment_35_16 = "janus-gpt-35-turbo-16k"

# --- Azure AI Search (vector store) settings; placeholders as well --------
vector_store_endpoint = "vector_store_endpoint"
vector_store_key = "vector_store_key"
vector_index_name = "vector_index_name"


In [None]:
# Chat model client: uses the GPT-3.5 deployment with temperature 0.
deployment_name = gpt_deployment_35
llm_model = AzureChatOpenAI(
    deployment_name=deployment_name,
    openai_api_key=open_ai_api_key,
    azure_endpoint=open_ai_endpoint,
    openai_api_version=open_ai_api_version,
    temperature=0,
)

### Initializing the embedding model and the connection to Azure AI Search

In [None]:
# Embedding client bound to the Azure OpenAI embedding deployment.
embeddings = AzureOpenAIEmbeddings(
    openai_api_key=open_ai_api_key,
    azure_endpoint=open_ai_endpoint,
    openai_api_version=open_ai_api_version,
    azure_deployment=embeddings_model,
)

# Vector store on the configured search index; queries are embedded with
# the embedding client's `embed_query` method.
vector_store = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=vector_index_name,
    azure_search_key=vector_store_key,
    azure_search_endpoint=vector_store_endpoint,
)

## Loading data

In [None]:
# Load every .txt file found (recursively) under the given directory.
# The path below is a placeholder and must be replaced with a real folder.
loader = DirectoryLoader(
    'your_fike_path',
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'autodetect_encoding': True},
)
documents = loader.load()

In [None]:
# Sanity check: show the text of the second loaded document (index 1).
documents[1].page_content

In [None]:
# Hand the loaded documents to the vector store for indexing.
vector_store.add_documents(documents=documents)

In [None]:
# Bare attribute reference (not a call): in a notebook this only displays
# the bound method, it does not create a retriever.
vector_store.as_retriever

In [None]:
# Azure AI Search SDK: query client, index-management client and the index
# schema model classes (each name imported once).
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
)

In [None]:
from azure.core.credentials import AzureKeyCredential

# SearchClient takes a credential object, not a raw key string, so the
# admin/query key is wrapped in AzureKeyCredential before being passed.
search_client = SearchClient(
    endpoint=vector_store_endpoint,
    index_name=vector_index_name,
    credential=AzureKeyCredential(vector_store_key),
)

In [None]:
# Retriever over the vector store that filters by a similarity-score
# threshold of 0.8.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.8},
)

In [None]:
# Run a sample customer query through the retriever and keep the result.
docs = retriever.invoke("Where is my package. I can't find it")

In [None]:
# Query the vector store directly and keep only the best hit (k=1) whose
# relevance score reaches the threshold.
# The threshold must be passed as the numeric `score_threshold` keyword; the
# previous `kwargs='0.8'` was just an unrelated string keyword and did not
# filter anything.
# NOTE: despite its name, `retriever1` holds the search results, not a
# retriever object.
query = "where is my parcel. I have not received yet.I am mad"
retriever1 = vector_store.similarity_search_with_relevance_scores(
    query=query,
    k=1,
    score_threshold=0.8,
)

In [None]:
# Bare attribute reference (not a call): displays the bound method only.
vector_store.similarity_search_with_relevance_scores

In [None]:
# Re-chunk the loaded texts with a semantic splitter that uses the
# "standard_deviation" breakpoint threshold type.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="standard_deviation",
)

# Collect the raw text of every loaded document, then split all of them.
# Only page_content is passed on, so the chunks are built from text alone.
doc = [loaded.page_content for loaded in documents]
docs = text_splitter.create_documents(doc)

In [None]:
# Dump the current contents of `docs` for inspection.
print(docs)

In [None]:
# Bare attribute reference (not a call): displays the bound method only.
text_splitter.create_documents

In [None]:

def get_documents(search_client, top=1000):
    """Fetch documents from the search index with a match-all query.

    Args:
        search_client: Object exposing ``search(search_text=..., top=...)``
            that returns an iterable of results (e.g. an Azure
            ``SearchClient``).
        top: Maximum number of results requested from the service.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        A list holding every result yielded by the search call.
    """
    # "*" is the match-all search text; list() drains the result iterable.
    return list(search_client.search(search_text="*", top=top))

def find_duplicates(documents, key="id"):
    """Return the documents whose ``key`` value was already seen earlier.

    The first document carrying a given value is treated as the original and
    is not reported; every later document with the same value is.

    Args:
        documents: Iterable of mappings that each contain ``key``.
        key: Field used to decide whether two documents are duplicates.
            Defaults to ``"id"`` (the previous fixed behaviour); pass e.g. a
            content field to detect repeated content stored under
            different ids.

    Returns:
        A list of the duplicate documents, in input order.

    Raises:
        KeyError: If a document lacks ``key``.
    """
    seen = set()
    duplicates = []
    for doc in documents:
        value = doc[key]
        if value in seen:
            duplicates.append(doc)
        else:
            seen.add(value)
    return duplicates

def delete_documents(client, documents):
    """Delete the given documents from the index by their ``'id'`` field.

    Args:
        client: Object exposing ``delete_documents(documents=[...])`` that
            accepts a list of dicts (e.g. an Azure ``SearchClient``).
        documents: Iterable of mappings that each contain an ``'id'`` entry.

    Returns:
        None. The number of ids submitted for deletion is printed.
    """
    ids_to_delete = [doc['id'] for doc in documents]
    # The client expects ONE list of dicts, not ids unpacked as positional
    # arguments. Skip the call entirely when there is nothing to delete so
    # an empty batch is never sent.
    # NOTE(review): assumes 'id' is the index key field - confirm.
    if ids_to_delete:
        client.delete_documents(
            documents=[{'id': doc_id} for doc_id in ids_to_delete]
        )
    print(f"Deleted {len(ids_to_delete)} documents.")

# Retrieve documents and find duplicates
# NOTE: this rebinds `documents`, which earlier cells used for the files
# loaded by the DirectoryLoader; from here on it holds search results.
documents = get_documents(search_client)
duplicates = find_duplicates(documents)
print(f"Found {len(duplicates)} duplicates.")

# Delete duplicate documents
# This issues the delete against the index behind `search_client`.
delete_documents(search_client, duplicates)


In [None]:
# Get all documents.
# SearchClient has no `get_documents()` method; a match-all search is the
# way to enumerate the index contents.
results = search_client.search(search_text="*")

# First occurrence of every id, plus the ids that showed up again.
unique_docs = {}
duplicate_ids = []

for result in results:
    # Use a unique field (like 'id') to check for duplicates.
    doc_id = result["id"]  # `doc_id` avoids shadowing the builtin `id`
    if doc_id not in unique_docs:
        unique_docs[doc_id] = result
    else:
        duplicate_ids.append(doc_id)

# Delete after the loop, in one batch, so the result set is not modified
# while it is still being paged through. `delete_documents` expects a list
# of dicts.
# NOTE(review): deleting by 'id' removes the document stored under that id,
# i.e. also the copy kept in `unique_docs` - confirm this is the intent.
if duplicate_ids:
    search_client.delete_documents(
        documents=[{"id": doc_id} for doc_id in duplicate_ids]
    )