In [None]:
!pip install openai langchain chromadb langchain langchain_community tiktoken langchain-openai


Collecting langchain-openai
  Downloading langchain_openai-0.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading langchain_openai-0.2.0-py3-none-any.whl (51 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.2.0


In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.azure_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
import pandas as pd
from langchain.schema import Document
import getpass

In [None]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2

In [None]:
# Read the PDF and gather the extracted text of every page, one entry per page,
# each terminated by a newline.
pdf_reader = PyPDF2.PdfReader('/content/Company_Data.pdf')
page_texts = []
for page in pdf_reader.pages:
    page_texts.append(page.extract_text() + "\n")

# Glue the pages together and display the result. Note (visible in the output
# below): the table columns come out fused together with no separators.
text = "".join(page_texts)
text

'Company_Data\nPage 1SalesCompPriceIncomeAdvertisingPopulationPriceShelveLocAgeEducationUrbanUS\n9.51387311276120Bad4217YesYes\n11.22111481626083Good6510YesYes\n10.06113351026980Medium5912YesYes\n7.4117100446697Medium5514YesYes\n4.15141643340128Bad3813YesNo\n10.811241131350172Bad7816NoYes\n6.63115105045108Medium7115YesNo\n11.851368115425120Good6710YesYes\n6.541321100108124Medium7610NoNo\n4.691321130131124Medium7617NoYes\n9.01121789150100Bad2610NoYes\n11.9611794450394Good5013YesYes\n3.98122352393136Medium6218YesNo\n10.9611528112986Good5318YesYes\n11.1710711711148118Good5218YesYes\n8.71149955400144Medium7618NoNo\n7.58118320284110Good6313YesNo\n12.291477413251131Good5210YesYes\n13.91110110040868Good4617NoYes\n8.73129761658121Medium6912YesYes\n6.41125902367131Medium3518YesYes\n12.131342912239109Good6218NoYes\n5.08128466497138Medium4213YesNo\n5.87121310292109Medium7910YesNo\n10.1414511916294113Bad4212YesYes\n14.913932017682Good5411NoNo\n8.3310711511496131Good5011NoYes\n5.2798118019107Medium

# Data Loading

In [None]:
def load_data(file_path):
    """Load a file and tag it with the kind of content it holds.

    The loader is chosen from the file extension, matched case-insensitively
    (so ``data.CSV`` works just like ``data.csv``).

    Args:
        file_path: Path to the input file, as a ``str`` or any ``os.PathLike``
            (e.g. ``pathlib.Path``).

    Returns:
        A ``(file_type, data)`` tuple:
          * ``('text', documents)`` for ``.txt`` — the result of
            ``TextLoader(...).load()``.
          * ``('dataset', df)`` for ``.csv`` / ``.xlsx`` — a pandas DataFrame.

    Raises:
        ValueError: If the extension is not one of ``.txt``, ``.csv``, ``.xlsx``.
    """
    # Function-scope import keeps this cell self-contained (stdlib only).
    import os

    # Normalise PathLike objects to str; plain strings pass through unchanged.
    path_str = os.fspath(file_path)
    # Lower-cased copy is used only for extension matching, never for I/O.
    lowered = path_str.lower()

    if lowered.endswith('.txt'):
        loader = TextLoader(path_str)
        return 'text', loader.load()

    if lowered.endswith('.csv'):
        return 'dataset', pd.read_csv(path_str)

    if lowered.endswith('.xlsx'):
        return 'dataset', pd.read_excel(path_str)

    raise ValueError("Unsupported file format")

In [None]:
def prepare_chunks(file_type, data):
    """Turn loaded data into a list of documents ready for embedding.

    Args:
        file_type: ``'text'`` or ``'dataset'``, as returned by ``load_data``.
        data: For ``'text'``, the list of loaded documents; for ``'dataset'``,
            a pandas DataFrame.

    Returns:
        A list of documents. Text input is split with a
        ``RecursiveCharacterTextSplitter`` (chunk_size=1000, chunk_overlap=100).
        Dataset input yields one ``Document`` per row whose content is
        ``"col: value"`` pairs joined by single spaces.

    Raises:
        ValueError: If ``file_type`` is neither ``'text'`` nor ``'dataset'``.
            (Previously this fell through to an ``UnboundLocalError``.)
    """
    if file_type == 'text':
        # The splitter is only needed for text input, so build it here.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        return text_splitter.split_documents(data)

    if file_type == 'dataset':
        # One document per row; row.items() pairs each column label with its
        # value in column order.
        return [
            Document(page_content=" ".join(f"{col}: {value}" for col, value in row.items()))
            for _, row in data.iterrows()
        ]

    raise ValueError(f"Unsupported file type: {file_type!r}")

In [None]:
# Load the CSV version of the data set and turn it into documents.
# `load_data` returns 'dataset' plus a DataFrame for a .csv path, and
# `prepare_chunks` then emits one document per row. `docs` is consumed by the
# vector-store cell further down.
file_path = '/content/Company_Data.csv'
file_type, data = load_data(file_path)
docs = prepare_chunks(file_type, data)

# Embeddings and LLM setup

In [None]:
# Ask for the API key interactively so it is never stored in the notebook.
api_key = getpass.getpass("Enter your api key: ")
# Same key under a second name; later cells use both `api_key` and `openai_api_key`.
openai_api_key = api_key
# Placeholder value: replace with the real Azure OpenAI endpoint before running.
azure_endpoint = 'XXX'

In [None]:
# Embedding client for Azure OpenAI, using the key and endpoint set above.
# NOTE(review): chunk_size here is an embeddings-client setting, unrelated to
# the text splitter's chunk_size — confirm 1000 is accepted by the deployment.
embeddings = AzureOpenAIEmbeddings(
    openai_api_key=openai_api_key,
    chunk_size=1000,
    azure_endpoint=azure_endpoint,
)

# Store embeddings in Chroma
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

In [None]:
# Chat model served from Azure OpenAI; reuses the key and endpoint entered above.
llm = AzureChatOpenAI(
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version='2024-05-01-preview',
    model='gpt-4o-mini',
    deployment_name='gpt-4o-mini',
)

In [None]:
# Create the retriever
retriever = vectorstore.as_retriever()

# Use the retriever in a Retrieval-QA chain
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       retriever=retriever,
                                       chain_type="stuff")

# Ask a question
query = "What are the key insights from the data or document?"
result = qa_chain.invoke(query)

print(result)


{'query': 'What are the key insights from the data or document?', 'result': 'The data provides insights into sales performance and various influencing factors. Here are the key insights:\n\n1. **Sales Performance**: There are two distinct sales figures: 11.27 and 7.74. The higher sales figure is associated with a lower price and advertising spend, while the lower sales figure corresponds to a higher price and no advertising.\n\n2. **Price Sensitivity**: The first set of data shows that lower prices (133) can lead to higher sales (11.27), while higher prices (154) result in lower sales (7.74), indicating price sensitivity among consumers.\n\n3. **Advertising Impact**: The first set has an advertising spend of 2, while the second has none. Despite the first set having a lower advertising budget, it still achieves higher sales, suggesting that other factors may be more influential than advertising in this case.\n\n4. **Demographics**: Both sets have similar population sizes (60 and 80), b

In [None]:
# Extract the text of every page of the PDF and concatenate it (no separator
# between pages). `or ""` guards against a page that yields no text.
pdf_reader = PyPDF2.PdfReader('/content/Company_Data.pdf')
raw_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Split only AFTER raw_text has been fully assembled; splitting before the
# pages were read operated on an empty string and produced no chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_text(raw_text)

In [None]:
# Display the text extracted from the PDF in the previous cell.
raw_text

'Company_Data\nPage 1SalesCompPriceIncomeAdvertisingPopulationPriceShelveLocAgeEducationUrbanUS\n9.51387311276120Bad4217YesYes\n11.22111481626083Good6510YesYes\n10.06113351026980Medium5912YesYes\n7.4117100446697Medium5514YesYes\n4.15141643340128Bad3813YesNo\n10.811241131350172Bad7816NoYes\n6.63115105045108Medium7115YesNo\n11.851368115425120Good6710YesYes\n6.541321100108124Medium7610NoNo\n4.691321130131124Medium7617NoYes\n9.01121789150100Bad2610NoYes\n11.9611794450394Good5013YesYes\n3.98122352393136Medium6218YesNo\n10.9611528112986Good5318YesYes\n11.1710711711148118Good5218YesYes\n8.71149955400144Medium7618NoNo\n7.58118320284110Good6313YesNo\n12.291477413251131Good5210YesYes\n13.91110110040868Good4617NoYes\n8.73129761658121Medium6912YesYes\n6.41125902367131Medium3518YesYes\n12.131342912239109Good6218NoYes\n5.08128466497138Medium4213YesNo\n5.87121310292109Medium7910YesNo\n10.1414511916294113Bad4212YesYes\n14.913932017682Good5411NoNo\n8.3310711511496131Good5011NoYes\n5.2798118019107Medium

# Testing on multiple queries

In [None]:
# Questions to put to the Retrieval-QA chain built above.
queries = [
    "What is the highest advertising?",
    "How are the data columns related to each other?",
    "What is the highest sales?",
    "Summarize the key trends in the sales data.",
    "Identify any anomalies in the sales figures."
]

# Run the model on each query
# Uses `invoke` (as the single-query cell does) instead of the deprecated
# `run`; `invoke` returns a dict whose "result" entry holds the answer text.
for query_num, query in enumerate(queries, start=1):
    print(f"Query {query_num}: {query}")
    answer = qa_chain.invoke(query)["result"]
    print(f"Answer {query_num}: {answer}")
    print("-" * 80)

Query 1: What is the highest advertising?
Answer 1: The highest advertising value in the provided context is 18.
--------------------------------------------------------------------------------
Query 2: How are the data columns related to each other?
Answer 2: I don't know.
--------------------------------------------------------------------------------
Query 3: What is the highest sales?
Answer 3: The highest sales is 11.27.
--------------------------------------------------------------------------------
Query 4: Summarize the key trends in the sales data.
Answer 4: The sales data shows a few key trends:

1. **Sales Figures**: There are two distinct sales figures: 7.6 and 10.61. The higher sales figure (10.61) is associated with a better competitive price (CompPrice of 157) and higher income (93).

2. **Advertising Impact**: In the cases with higher sales (10.61), there is no advertising expenditure (Advertising: 0), suggesting that factors other than advertising may be driving sales 