In [2]:
from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import ( 
    SimpleDirectoryReader, 
    StorageContext, 
    Settings, 
    PromptTemplate
)
from llama_index.core.ingestion import DocstoreStrategy
# from llama_index.embeddings.vertex import VertexTextEmbedding
from llama_index.core.schema import ImageNode
from llama_index.core.query_engine import SimpleMultiModalQueryEngine
# from langchain.chat_models import AzureChatOpenAI

In [3]:
from PIL import Image
import matplotlib.pyplot as plt
import os
import sys
import math
from qdrant_client import QdrantClient, models

# Azure OpenAI configuration.
# SECURITY: the API key is read from the environment instead of being embedded
# in the notebook. The key that used to be hard-coded on this line has been
# exposed to everyone with access to the file and should be rotated.
OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail here with an actionable message rather than later with a bare KeyError.
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Export it before starting the kernel "
        "so the secret never has to be written into this notebook."
    )
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://cpe-clx-openai.openai.azure.com/"
os.environ["OPENAI_API_VERSION"] = "2023-05-15" #"2024-02-15-preview"

# NOTE(review): presumably a workaround for a duplicate OpenMP runtime being loaded — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Replace the path with the path to the service account key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:\\Users\\CQTF47\\Downloads\\Dipjyoti RAG POC\\devtest-sa.json"


In [4]:
# embed_model = VertexTextEmbedding(project="msi-genai-frontdoor-499476", location="us-east1", credentials = os.environ["GOOGLE_APPLICATION_CREDENTIALS"])
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Connection settings shared by the embedding client and the multimodal LLM
# client; all three values are read from the process environment.
_azure_connection = {
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_version": os.environ["OPENAI_API_VERSION"],
}

# Embedding client bound to the "cpe-clx-embedding" Azure deployment.
embed_model_openai = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    azure_deployment="cpe-clx-embedding",
    **_azure_connection,
)

# Multimodal LLM client bound to the "cpe-clx-gpt4o" engine.
openai_mm_llm = AzureOpenAIMultiModal(
    engine="cpe-clx-gpt4o",
    model="gpt-4o-2024-05-13",
    max_new_tokens=1500,
    max_retries=1,
    **_azure_connection,
)

# Register both clients on the global LlamaIndex Settings object.
Settings.llm = openai_mm_llm
Settings.embed_model = embed_model_openai

In [5]:
# Folder containing the PDFs to ingest (machine-specific absolute path).
directory_name = "\\".join(
    ["C:", "Users", "CQTF47", "Desktop", "IU Masters", "Thesis", "pdf"]
)

In [6]:
# Local on-disk Qdrant instance; the path is relative to the working directory.
client = QdrantClient(path="financial_risk_analysis_vector_db/")

# Vector store for text nodes. No image store is configured in this notebook.
text_store = QdrantVectorStore(
    collection_name="pdf_text_collection",
    client=client,
)

# Storage context that routes index writes to the text store above.
storage_context = StorageContext.from_defaults(vector_store=text_store)


In [7]:
# Names of every entry in the PDF folder, in the order os.listdir reports them.
document_names = [entry_name for entry_name in os.listdir(directory_name)]

In [8]:
# Glob patterns for documents the user chose NOT to overwrite.
exclude = []


def find_and_remove_duplicates_from_vectordb(client, collection_name, document_name):
    """Interactively resolve a document that is already stored in a collection.

    Looks for points in ``collection_name`` whose ``file_name`` payload field
    equals ``document_name``. If none exist, nothing happens. Otherwise the user
    is asked on stdin whether to overwrite:

    * answer ``y`` (case-insensitive): every matching point is deleted from the
      collection;
    * any other answer: the pattern ``*<document_name>*`` is appended to the
      module-level ``exclude`` list.

    Args:
        client: Qdrant client exposing ``scroll`` and ``delete``.
        collection_name: Name of the collection to inspect.
        document_name: File name to look up in the ``file_name`` payload field.

    Returns:
        None. Side effects are the stdin prompt, an optional delete on the
        collection, and an optional append to ``exclude``.
    """
    # One filter object serves both the lookup and the delete, so the two
    # operations can never target different sets of points.
    same_file_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="file_name", match=models.MatchValue(value=document_name)
            ),
        ],
    )

    # scroll returns (points, next_page_offset). Only existence matters here,
    # so a single point without its payload is enough.
    matching_points, _next_offset = client.scroll(
        collection_name=collection_name,
        scroll_filter=same_file_filter,
        limit=1,
        with_payload=False,
    )
    if not matching_points:
        return

    # Use the function's own argument in messages and in the exclude pattern,
    # not a loop variable that happens to exist in the caller's global scope.
    print(f"Document {document_name} already exists in the collection {collection_name}")
    print("Do you want to overwrite it? (y/n)")
    choice = input()
    if choice.strip().lower() != 'y':
        exclude.append(f"*{document_name}*")
        return

    print(f"Removing duplicates for {document_name} from the collection {collection_name}")
    client.delete(collection_name=collection_name, points_selector=same_file_filter)

In [9]:
# Check the collection the text store actually writes to. The previous name was
# built from `directory_name`, a full filesystem path, so it never matched an
# existing collection and the duplicate check was silently skipped.
text_collection_name = text_store.collection_name
if text_store._collection_exists(text_collection_name):
    for doc in document_names:
        find_and_remove_duplicates_from_vectordb(client, text_collection_name, doc)

In [10]:
# Load the PDFs, skipping every file whose pattern was added to `exclude`
# (documents already in the vector store that the user chose not to overwrite).
# Without this argument those files would be embedded and stored a second time.
documents = SimpleDirectoryReader(
    f"{directory_name}/",
    filename_as_id=True,
    exclude=exclude,
).load_data()

In [11]:
# Options forwarded to the index builder alongside the loaded documents.
index_build_options = {
    "storage_context": storage_context,
    "show_progress": True,
    "timeout": 60,
}
index = MultiModalVectorStoreIndex.from_documents(documents, **index_build_options)

Parsing nodes:   0%|          | 0/80 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/82 [00:00<?, ?it/s]

In [12]:
# Question-answering prompt text with {context_str} and {query_str} placeholders.
_qa_rule = "---------------------"
qa_tmpl_str = "\n".join(
    [
        "Context information is below.",
        _qa_rule,
        "{context_str}",
        _qa_rule,
        "Given the context information and not prior knowledge, answer the query.",
        "Query: {query_str}",
        "Answer: ",
    ]
)

In [13]:
# Wrap the raw template text in a LlamaIndex PromptTemplate.
qa_tmpl = PromptTemplate(template=qa_tmpl_str)

In [14]:
# Query engine over the index: custom QA prompt, top-10 similarity retrieval.
query_engine = index.as_query_engine(
    similarity_top_k=10,
    text_qa_template=qa_tmpl,
)

In [15]:
# Ask for a slide outline without naming the thesis topic explicitly.
prompt = (
    " \n"
    "            create the slides content for thesis presentation of 30 minutes on the topic.\n"
)
response = query_engine.query(prompt)

In [18]:
# Show only the generated text of the previous query result.
response_text = response.response
print(response_text)

Certainly! Below is a suggested outline and content for a 30-minute thesis presentation on the topic "AI Stock Analyst: Financial Chatbot Using Large Language Models" by Gummadi Sai Dheeraj.

### Slide 1: Title Slide
- **Title:** AI Stock Analyst: Financial Chatbot Using Large Language Models
- **Presenter:** Gummadi Sai Dheeraj
- **Matriculation Number:** 3210935
- **Advisor:** Professor Broweleit, Tobias
- **University:** University of Applied Science - Online
- **Program:** Masters in Data Science (MSDS60ECTS)
- **Delivery Date:** January 08, 2025

### Slide 2: Introduction
- **Overview of the Presentation:**
  - Introduction to the topic
  - Problem Statement
  - Objectives
  - Methodology
  - Results
  - Conclusion and Future Work

### Slide 3: Background
- **Introduction to Financial Chatbots:**
  - Definition and importance
  - Current state of financial chatbots
- **Large Language Models (LLMs):**
  - Definition and examples (e.g., GPT-3, GPT-4)
  - Relevance to financial chatb

In [20]:
# Slide outline for the named thesis topic, split into five sections.
prompt = (
    " \n"
    '            create the slides content for thesis presentation of 30 minutes on the topic: "AI Stock Analyst: Financial Chatbot Using Large Language Models". Divide the content into 5 sections.\n'
)
answer = query_engine.query(prompt)
print(answer.response)

Creating a 30-minute thesis presentation on "AI Stock Analyst: Financial Chatbot Using Large Language Models" can be effectively divided into five sections. Here is a suggested structure and content for each section:

### Section 1: Introduction (5 minutes)
- **Title Slide**
  - Title: AI Stock Analyst: Financial Chatbot Using Large Language Models
  - Your Name: Gummadi Sai Dheeraj
  - Advisor: Professor Broweleit, Tobias
  - University: University of Applied Science - Online
  - Date: January 08, 2025

- **Overview**
  - Brief introduction to the topic
  - Importance of AI in financial markets
  - Objectives of the thesis

- **Agenda**
  - Outline the five sections of the presentation

### Section 2: Background and Literature Review (5 minutes)
- **The Evolving Landscape of Financial Markets**
  - Changes and challenges in financial stock analysis
  - The need for advanced analytical tools

- **Literature Review**
  - Overview of existing research on machine learning in finance
  - K

In [21]:
# Slide outline for the named thesis topic, split into six sections.
prompt = (
    " \n"
    '            create the slides content for thesis presentation of 30 minutes on the topic: "AI Stock Analyst: Financial Chatbot Using Large Language Models". Divide the content into 6 sections.\n'
)
answer = query_engine.query(prompt)
print(answer.response)

Creating a 30-minute thesis presentation on "AI Stock Analyst: Financial Chatbot Using Large Language Models" can be effectively divided into six sections. Here is a suggested outline for the slides content:

### Section 1: Introduction (5 minutes)
- **Slide 1: Title Slide**
  - Title: AI Stock Analyst: Financial Chatbot Using Large Language Models
  - Your Name: Gummadi Sai Dheeraj
  - University: University of Applied Science - Online
  - Advisor: Professor Broweleit, Tobias
  - Date: January 08, 2025

- **Slide 2: Overview**
  - Brief introduction to the topic
  - Importance of AI in financial markets
  - Objectives of the thesis

### Section 2: Background and Literature Review (5 minutes)
- **Slide 3: The Evolving Landscape of Financial Markets**
  - Historical context
  - Current trends and challenges

- **Slide 4: Literature Review**
  - Key studies and their findings
  - Gaps in existing research
  - Relevance to your work

### Section 3: Methodology (5 minutes)
- **Slide 5: Res

## Overview

In [22]:
# Speech for the "Overview" slide.
prompt = (
    " \n"
    "            write a 10 minute speech for slide 2 Overview that covers the following points:\n"
    "                - Brief introduction to the topic\n"
    "                - Importance of AI in financial markets\n"
    "                - Objectives of the thesis\n"
    "\n"
)
answer = query_engine.query(prompt)
print(answer.response)

Ladies and Gentlemen,

Good [morning/afternoon/evening]. Thank you for joining me today. I am excited to present an overview of my thesis titled "AI Stock Analyst: Financial Chatbot Using Large Language Models." This research delves into the transformative role of Artificial Intelligence (AI) in financial markets and aims to bridge the gap in financial stock analysis through innovative methodologies.

**Brief Introduction to the Topic**

The financial markets have experienced significant transformations over the past few decades. These changes have been driven by technological advancements, globalization, and the increasing complexity of financial instruments. As we navigate through the digital age, the sheer volume of information available to stock market participants has grown exponentially. Financial news, market data, economic indicators, social media sentiment, and corporate disclosures flood the information channels daily. This phenomenon, known as information overload, presents 

## The Evolving Landscape of Financial Markets

In [26]:
# Speech for the "The Evolving Landscape of Financial Markets" slide.
prompt = (
    " \n"
    '            write a 5 minute speech for slide "The Evolving Landscape of Financial Markets" that covers the following points:\n'
    "                - Historical context\n"
    "                - Current trends and challenges\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

Ladies and Gentlemen,

Good [morning/afternoon/evening]. Today, I am honored to speak to you about "The Evolving Landscape of Financial Markets," a topic that is both fascinating and crucial for understanding the dynamics of our global economy. In the next few minutes, I will cover the historical context of financial markets, current trends, and the challenges we face today.

**Historical Context**

To appreciate the current state of financial markets, it's essential to look back at their evolution. Over the past few decades, financial markets have undergone significant transformations. Historically, markets were relatively simple and localized. Transactions were conducted in person, and information flow was slow and limited. The primary methods of analysis were fundamental and technical, relying heavily on manual calculations and human intuition.

However, the landscape began to change dramatically with the advent of technological advancements. The introduction of computers and the in

## Literature Review

In [29]:
# Speech and slide content for the "Literature Review" slide.
prompt = (
    " \n"
    '            write a 5 minute speech and the slide content for slide "Literature Review" that covers the following points:\n'
    "                - Key studies and their findings\n"
    "                - Gaps in existing research\n"
    "                - Relevance to your work\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

**Speech:**

Good [morning/afternoon/evening] everyone,

Today, I am excited to present the literature review section of my thesis, which focuses on the intersection of financial data analysis and natural language processing (NLP). This review highlights key studies, identifies gaps in existing research, and discusses the relevance of these findings to my work.

**Key Studies and Their Findings:**

The importance of analyst reports in identifying stock and market trends has been well-documented by researchers such as Jegadeesh et al. Their work has inspired subsequent research that delves into the financial metrics and analysts' opinions within these reports. Studies by authors like Liu and Steins, Savavidya and Saha, and Prabhu et al. have explored the potential benefits of chatbots for customer service in various industries, demonstrating improvements in multi-turn answer accuracy, online banking, and rapid customer care.

In the realm of NLP, the advent of Large Language Models (LLM

## Research Framework

In [16]:
# Speech and slide content for the "Research Framework" slide.
prompt = (
    " \n"
    '            write a 5 minute speech and relevant slide content for slide "Research Framework" that covers the following points:\n'
    "                - Overview of the methodology\n"
    "                - Data and Prompting\n"
    "                - LLM and RAG architecture\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

**Speech:**

Good [morning/afternoon/evening] everyone,

Today, I am excited to present to you the research framework for our study on enhancing stock analysis using advanced Natural Language Processing (NLP) techniques, dynamic data retrieval systems, and Large Language Models (LLMs). Our goal is to provide comprehensive and reliable stock analysis by integrating both fundamental and technical evaluation with the trustworthiness of LLMs. Let's dive into the key components of our methodology.

**Overview of the Methodology:**

Our research methodology is designed to ensure accuracy and efficiency in stock market analysis. We have divided our approach into several key steps, each contributing to the overall system's effectiveness. These steps include data collection, data processing, and the integration of LLMs with Retrieval-Augmented Generation (RAG) pipelines. By leveraging these advanced techniques, we aim to revolutionize the analysis and interpretation of stock market data, enabli

## Technical Components

In [19]:
# Speech and slide content for the "Technical Components" slide.
prompt = (
    " \n"
    '            write a 5 minute speech and relevant slide content for slide "Technical Components" that covers the following points:\n'
    "                - GPT-4o and Gemini 1.5 flash\n"
    "                - System architecture and software design using AWS\n"
    "                - Evaluation (BSD detector)\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

**Speech:**

Ladies and Gentlemen,

Thank you for joining us today. I am excited to delve into the technical components that underpin our advanced financial analysis tools, specifically focusing on GPT-4o and Gemini 1.5 Flash, their system architecture, software design using AWS, and the evaluation process using the BSD detector.

**Slide 1: GPT-4o and Gemini 1.5 Flash**

Let's start with GPT-4o and Gemini 1.5 Flash, two cutting-edge language models that have revolutionized financial analysis.

- **GPT-4o**: Developed by OpenAI, GPT-4o builds on the architecture of its predecessors with enhancements in scale, training data, and fine-tuning techniques. It employs self-attention mechanisms to process and generate text, allowing it to grasp a broad spectrum of language patterns, facts, and reasoning abilities. One of its standout features is its ability to perform zero-shot and few-shot learning, making it highly versatile and capable of handling a wide range of tasks with minimal task-sp

In [20]:
# Speech and slide content for the "Request Response journey" slide.
# The prompt text is reproduced exactly as it was sent, spelling included.
prompt = (
    " \n"
    '            write a 5 minute speech and relevant slide content for slide "Request Response journey" that covers the following points:\n'
    "                - System Architecture\n"
    "                - Software desgin using AWS\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

### Speech:

**Title: Request Response Journey: System Architecture and Software Design using AWS**

**Introduction:**
Good [morning/afternoon/evening] everyone. Today, I am excited to walk you through the request-response journey of our AI Stock Analyst Financial Chatbot. We will delve into the system architecture and the software design using AWS that powers this sophisticated application. This journey ensures that user queries about company stock information and related news sentiments are processed quickly and accurately.

**Slide 1: Request Response Journey**

**System Architecture:**
Our system architecture is designed to be robust, scalable, and efficient. It leverages a combination of AWS services to handle requests, process data, and provide responses to users. The journey begins with DNS resolution via AWS Route 53, which directs end-user requests to the relevant AWS resources. 

Next, AWS CloudFront, a Content Delivery Network (CDN) service, receives the request. CloudFront 

## Results

In [21]:
# Speech and slide content for the "Analysis and Findings" slide.
prompt = (
    " \n"
    '            write a 5 minute speech and relevant slide content for slide "Analysis and Findings" that covers the following points:\n'
    "                - Insights from the results\n"
    "                - Implications for investors\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

**Speech:**

Good [morning/afternoon/evening] everyone,

Thank you for joining us today. I am excited to share with you the analysis and findings from our recent study on stock analysis reports generated using Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) techniques. Our research aimed to provide investors with comprehensive, real-time stock analysis reports to aid in their decision-making process. Let's dive into the insights from our results and discuss their implications for investors.

**Insights from the Results:**

1. **Improved Accessibility of Information:**
   Our system significantly enhances the accessibility of stock analysis information. By entering the name of a company, users can receive detailed reports in real-time. This feature allows investors to quickly obtain the information they need, facilitating more accurate and timely investment decisions.

2. **Data Integration:**
   One of the key strengths of our system is its ability to consolidate 

## Conclusion

In [23]:
# Speech and slide content for the "conclusion" slide.
prompt = (
    " \n"
    '            write a 5 minute speech and relevant slide content for slide "conclusion" by summarizing the conclusion section in the paper that covers the following points:\n'
    "                - Summary of key findings\n"
    "                - Validation of the model's effectiveness\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

**Speech:**

"Good [morning/afternoon/evening] everyone,

Today, I am pleased to present the conclusion of our research on financial fraud detection using various machine learning and deep learning models. This study aimed to explore and validate the effectiveness of traditional machine learning methods, deep learning models, and Large Language Models (LLMs) in identifying fraudulent activities within financial reports and statements.

**Summary of Key Findings:**

Our research has yielded several significant findings. Firstly, we observed that traditional machine learning models such as Logistic Regression and Support Vector Machine (SVM) provided a solid baseline for fraud detection with reasonable accuracy, precision, and recall. However, these models showed limitations in handling complex patterns and large datasets.

Deep learning models, including Artificial Neural Networks (ANN) and Hierarchical Attention Networks (HAN), demonstrated improved performance over traditional methods

In [27]:
# Five-bullet summary of the conclusion section, referenced by page number.
prompt = (
    " \n"
    "            summarize the conclusion section in 5 bullet points in page 54,55.\n"
    "        "
)
answer = query_engine.query(prompt)
print(answer.response)

Based on the provided context, here is a summary of the conclusion section in 5 bullet points:

1. **Study Focus**: The study primarily examines internal control methods, fraud prevention measures, and financial performance within "A" Bank.
2. **Objective**: The objective is to assess the internal control practices, their impact on fraud prevention, and the subsequent influence on the bank's financial performance.
3. **Data Collection**: Data was collected from 315 workers at the Head Office and branches of "A" Bank using simple random sampling.
4. **Findings on Control Environment**: The bank has a robust control environment with comprehensive training and clear operational guidelines, contributing to effective fraud mitigation.
5. **Risk Management and Control Operations**: "A" Bank has a precise and transparent risk management system, conducts regular inventory checks, and implements control operations at multiple levels to prevent fraud.


In [24]:
# Generate likely examiner questions about the indexed documents.
# The prompt text is reproduced exactly as it was sent, spelling included.
prompt = (
    " \n"
    "            Imagine you are computer science professor. \n"
    "            ask all the possible quesitons for the given docuemtns.\n"
    "\n"
)
answer = query_engine.query(prompt)
print(answer.response)

Certainly! Here are some possible questions that a computer science professor might ask based on the given documents:

### Questions for "AI Stock Analyst: Financial Chatbot Using Large Language Models" by Gummadi Sai Dheeraj

1. **General Understanding:**
   - What is the primary objective of the thesis "AI Stock Analyst: Financial Chatbot Using Large Language Models"?
   - Who is the advisor for this thesis, and what is the delivery date?

2. **Technical Details:**
   - What are the main components of the financial chatbot discussed in the thesis?
   - How do Large Language Models (LLMs) contribute to the functionality of the financial chatbot?
   - What datasets were used to train and evaluate the chatbot?

3. **Challenges and Solutions:**
   - What are the key challenges faced in developing a financial chatbot using LLMs?
   - How does the thesis propose to address the issue of cumulative errors in the SEP framework?

4. **Evaluation and Metrics:**
   - What metrics are used to eva