In [6]:
from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import ( 
    SimpleDirectoryReader, 
    StorageContext, 
    Settings, 
    PromptTemplate
)
from llama_index.core.ingestion import DocstoreStrategy
from llama_index.embeddings.vertex import VertexTextEmbedding
from llama_index.core.schema import ImageNode
from llama_index.core.query_engine import SimpleMultiModalQueryEngine
from langchain.chat_models import AzureChatOpenAI


In [7]:
from PIL import Image
import matplotlib.pyplot as plt
import os
import sys
import math
from qdrant_client import QdrantClient, models

In [8]:
# Environment configuration for the Azure OpenAI clients created further down.
#
# SECURITY: the API key is deliberately NOT stored in the notebook. It is taken
# from the AZURE_OPENAI_API_KEY environment variable when that is already set,
# otherwise it is requested interactively (getpass does not echo the input).
# Any key that was ever committed inside this notebook should be treated as
# leaked and rotated.
import getpass

OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY") or getpass.getpass(
    "Azure OpenAI API key: "
)
os.environ["AZURE_OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://cpe-clx-openai.openai.azure.com/"
os.environ["OPENAI_API_VERSION"] = "2023-05-15" #"2024-02-15-preview"

# NOTE(review): presumably a workaround for a duplicate-OpenMP-runtime error on
# this machine -- confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Path to the Google service account key file. An already-exported
# GOOGLE_APPLICATION_CREDENTIALS wins; the literal below is only the fallback
# for the original author's machine and should be replaced elsewhere.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:\\Users\\CQTF47\\Downloads\\Dipjyoti RAG POC\\devtest-sa.json",
)


In [10]:
# embed_model = VertexTextEmbedding(project="msi-genai-frontdoor-499476", location="us-east1", credentials = os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Connection settings shared by both Azure OpenAI clients built in this cell.
# All three values are read from the environment variables set earlier.
azure_connection = {
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_version": os.environ["OPENAI_API_VERSION"],
}

# Embedding client: model "text-embedding-ada-002" served by the
# "cpe-clx-embedding" deployment.
embed_model_openai = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    # deployment_name="cpe-clx-embedding",
    azure_deployment="cpe-clx-embedding",
    **azure_connection,
)

# Disabled alternative kept for reference (LangChain chat client):
# azure_llm = AzureChatOpenAI(
#     model="cpe-clx-gpt4o",
#     azure_deployment="cpe-clx-gpt4o",
#     api_key=os.environ["AZURE_OPENAI_API_KEY"],
#     azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
#     api_version=os.environ["OPENAI_API_VERSION"],
# )

# Multi-modal LLM client: engine "cpe-clx-gpt4o", at most 1500 new tokens per
# call and a single retry.
openai_mm_llm = AzureOpenAIMultiModal(
    engine="cpe-clx-gpt4o",
    model="gpt-4o-2024-05-13",
    max_new_tokens=1500,
    max_retries=1,
    **azure_connection,
)

# Register both clients as the global llama_index defaults.
Settings.llm = openai_mm_llm
Settings.embed_model = embed_model_openai

In [11]:
# Folder that holds the sample submissions to ingest. Set the
# SAMPLE_SUBMISSIONS_DIR environment variable to point at another location;
# the literal below is only the fallback for the original author's machine.
directory_name = os.environ.get(
    "SAMPLE_SUBMISSIONS_DIR",
    r"C:\Users\CQTF47\Desktop\IU Masters\Case Study-Model Engineering\sample submissions",
)

In [12]:
# Qdrant client opened on the local "model_engineering_db/" path.
client = QdrantClient(path="model_engineering_db/")

# Vector store for text nodes, backed by the "pdf_text_collection" collection.
text_store = QdrantVectorStore(collection_name="pdf_text_collection", client=client)

# No Qdrant-backed image store is configured; the disabled variant is kept for reference.
# image_store = QdrantVectorStore(
#     client=client, collection_name=f"pdf_image_collection"
# )

# Storage context that routes the index's vectors into text_store.
storage_context = StorageContext.from_defaults(vector_store=text_store)


In [13]:
# Names of the regular files directly inside directory_name.
# os.listdir() also returns sub-directories and gives no ordering guarantee;
# filtering to files and sorting keeps the later per-document prompts
# reproducible between runs.
document_names = sorted(
    entry
    for entry in os.listdir(directory_name)
    if os.path.isfile(os.path.join(directory_name, entry))
)

In [14]:
# Glob patterns ("*<file name>*") for documents the user chose NOT to overwrite.
# Filled in by find_and_remove_duplicates_from_vectordb().
exclude = []

def find_and_remove_duplicates_from_vectordb(client, collection_name, document_name):
    """Interactively resolve a document that is already stored in a collection.

    Looks for points in ``collection_name`` whose ``file_name`` payload equals
    ``document_name``. When none exist the function returns silently. Otherwise
    the user is asked whether to overwrite:

    * any answer other than ``y`` appends ``"*<document_name>*"`` to the
      module-level ``exclude`` list;
    * ``y`` deletes every matching point from the collection.

    Args:
        client: Qdrant client exposing ``scroll`` and ``delete``.
        collection_name: Name of the collection to inspect.
        document_name: File name to match against the ``file_name`` payload key.

    Returns:
        None. Results are communicated through ``exclude`` and the collection.
    """
    same_file_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="file_name", match=models.MatchValue(value=document_name)
            ),
        ],
    )

    # Element 0 of the scroll result is the list of matching points. Only
    # existence matters here, so a single hit is enough.
    matches = client.scroll(
        collection_name=collection_name,
        scroll_filter=same_file_filter,
        limit=1,
    )[0]
    if len(matches) == 0:
        return

    # Every message uses the document_name argument rather than a global loop
    # variable, so the function works no matter what the caller names its
    # variables.
    print(f"Document {document_name} already exists in the collection {collection_name}")
    print("Do you want to overwrite it? (y/n)")
    choice = input()
    if choice.strip().lower() != 'y':
        pattern = f"*{document_name}*"
        # Avoid piling up the same pattern when the cell is re-run.
        if pattern not in exclude:
            exclude.append(pattern)
    else:
        print(f"Removing duplicates for {document_name} from the collection {collection_name}")
        client.delete(collection_name=collection_name, points_selector=same_file_filter)

In [15]:
# Run the duplicate check against the collection text_store actually writes to.
# Building the name from directory_name (a filesystem path) never matches that
# collection, so the check would be skipped or target the wrong collection.
text_collection_name = text_store.collection_name
if text_store._collection_exists(text_collection_name):
    for doc in document_names:
        find_and_remove_duplicates_from_vectordb(client, text_collection_name, doc)
        # No image collection is configured, so there is nothing else to check.

In [16]:
# Load every file in directory_name, using the file name as the document id.
# The glob patterns collected in `exclude` (documents the user declined to
# overwrite) are passed to the reader so those files are really skipped.
# NOTE(review): if every file ends up excluded the reader presumably has
# nothing to load and may raise -- confirm the desired behaviour for that case.
documents = SimpleDirectoryReader(
    f"{directory_name}/",
    filename_as_id=True,
    exclude=exclude,
).load_data()

In [17]:
# Show how many Document objects load_data() produced.
len(documents)

215

In [18]:
# Options for the index build: persist through storage_context and show
# progress bars while parsing and embedding.
# NOTE(review): `timeout` is forwarded as an extra keyword argument -- confirm
# that MultiModalVectorStoreIndex actually honours it.
index_build_options = {
    "storage_context": storage_context,
    "show_progress": True,
    "timeout": 60,
}
index = MultiModalVectorStoreIndex.from_documents(documents, **index_build_options)

Parsing nodes:   0%|          | 0/215 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/215 [00:00<?, ?it/s]



In [19]:
# Question-answering prompt text. {context_str} and {query_str} are
# placeholders left in the string for later substitution.
prompt_divider = "---------------------"
qa_tmpl_str = "\n".join(
    [
        "Context information is below.",
        prompt_divider,
        "{context_str}",
        prompt_divider,
        "Given the context information and not prior knowledge, answer the query.",
        "Query: {query_str}",
        "Answer: ",
    ]
)

In [20]:
# Wrap the raw prompt text in a PromptTemplate for the query engine.
qa_tmpl = PromptTemplate(template=qa_tmpl_str)

In [21]:
# Query engine over the index: top-10 similarity retrieval, answers phrased
# with the custom QA prompt.
query_engine = index.as_query_engine(
    similarity_top_k=10,
    text_qa_template=qa_tmpl,
)

In [28]:
# Ask the query engine for a 200-word abstract.
prompt = (
    "write an abstract with 200 words for a research paper "
    "using the above given information"
)
response = query_engine.query(prompt)

In [29]:
# Display the Response object returned by the abstract query.
response

Response(response='Credit card routing is a pivotal component of the online purchase process, where the selection of an appropriate payment service provider (PSP) significantly impacts transaction success rates and fraud prevention. This research aims to develop a predictive model to recommend the most suitable PSP based on transaction-specific information. Leveraging machine learning algorithms, the model will analyze various factors such as transaction amount, country of origin, type of card, and 3D security features to predict the optimal PSP for each transaction. The methodology follows an Agile Data Science process, encompassing problem definition, data acquisition, preparation, modeling, evaluation, deployment, and iterative improvement. Initial steps involve defining the problem scope and acquiring relevant data, followed by data preparation and exploration. The modeling phase includes building both baseline and advanced predictive models, with performance evaluated using metric

In [30]:
# Ask the query engine for a 1000-word introduction section.
prompt = (
    "write an introduction section with 1000 words for a research paper "
    "using the above given information"
)
response = query_engine.query(prompt)

In [31]:
# Display the Response object returned by the introduction query.
response

Response(response="# Introduction\n\n## 1.1 Overview\n\nIn the rapidly evolving landscape of online commerce, the efficiency and reliability of payment processing systems are paramount. Credit card routing, a critical component of these systems, directly impacts transaction success rates and operational costs. This research paper delves into the development and deployment of a predictive model aimed at optimizing credit card routing for online purchases. By leveraging advanced data science methodologies, this study seeks to enhance transaction success rates while minimizing associated fees, thereby providing a robust solution to a prevalent challenge in the e-commerce sector.\n\n## 1.2 Problem Definition\n\nThe primary problem addressed in this research is the suboptimal routing of credit card transactions in online purchases. Inefficient routing can lead to increased transaction failures, higher operational costs, and diminished customer satisfaction. The goal is to develop a predicti

In [None]:
# Qdrant setup
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.models import PointStruct, VectorParams, Distance, HnswConfig,HnswConfigDiff

# start_time = time.time()
# Qdrant client opened on the local "case_study" path.
qdrant_client = QdrantClient(path="case_study")

collection_name = "sample_submissions"
# Must equal the dimensionality of the embedding vectors that will be stored.
# NOTE(review): 1536 presumably matches text-embedding-ada-002 -- confirm if
# the embedding model changes.
text_vector_size = 1536

# Always start from an empty collection: an existing one is dropped first.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=text_vector_size, distance=Distance.COSINE),
    # Typed HNSW index settings. Only index-time fields belong here; the
    # query-time "ef" is a search parameter and has to be supplied per search
    # (e.g. models.SearchParams(hnsw_ef=128)) rather than at collection creation.
    hnsw_config=HnswConfigDiff(
        m=64,  # max number of connections a node can have in the graph
        ef_construct=512,  # higher value will search more extensively for neighbors (more accurate)
    ),
)
# print("Collection creation time:", time.time() - start_time)

In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Vector store backed by the Qdrant collection created for this flow.
qdrant_vector_store_binary = QdrantVectorStore(client=qdrant_client, collection_name=collection_name)

# from_documents() attaches a vector store through a StorageContext. A bare
# `vector_store=` keyword is not one of its parameters, so the embeddings would
# not be written to Qdrant.
qdrant_storage_context = StorageContext.from_defaults(vector_store=qdrant_vector_store_binary)
index = VectorStoreIndex.from_documents(documents, storage_context=qdrant_storage_context)

NameError: name 'qdrant_client' is not defined

In [14]:
# Retriever over the index that returns the 10 most similar nodes per query.
retrieval_options = {"similarity_top_k": 10}
retriever_engine = index.as_retriever(**retrieval_options)

In [15]:
# Question-answering prompt text. {context_str} and {query_str} are
# placeholders left in the string for later substitution.
prompt_divider = "---------------------"
qa_tmpl_str = "\n".join(
    [
        "Context information is below.",
        prompt_divider,
        "{context_str}",
        prompt_divider,
        "Given the context information and not prior knowledge, answer the query.",
        "Query: {query_str}",
        "Answer: ",
    ]
)

In [16]:
# Wrap the raw prompt text in a PromptTemplate for the query engine.
qa_tmpl = PromptTemplate(template=qa_tmpl_str)

In [17]:
# Query engine over the index: top-10 similarity retrieval, answers phrased
# with the custom QA prompt.
# NOTE(review): the recorded run of the following query fails with
# "'AzureOpenAIMultiModal' object has no attribute 'predict'". Presumably the
# engine falls back to Settings.llm, which is the multi-modal client -- confirm,
# and pass a text LLM via `llm=` if so.
query_engine = index.as_query_engine(
    similarity_top_k=10,
    text_qa_template=qa_tmpl,
)

In [18]:
# Ask the query engine for a research-paper abstract.
prompt = (
    "write abstract for research paper "
    "based on the above context"
)
response = query_engine.query(prompt)

AttributeError: 'AzureOpenAIMultiModal' object has no attribute 'predict'