**Installing all the required libraries**

In [None]:
# Install the notebook's third-party dependencies into the runtime.
# NOTE(review): no versions are pinned here, so a fresh run may resolve different releases — consider pinning.
!pip install langchain langchain-community groq pypdf pdfplumber faiss-cpu sentence-transformers tabula-py camelot-py




In [None]:
# Upgrade langchain to the newest release pip can find.
# NOTE(review): the import cell uses the `langchain.vectorstores` and `langchain.text_splitter` paths;
# confirm these still exist in whichever version this unpinned upgrade installs.
!pip install --upgrade langchain




In [None]:
# Install the Groq client library (groq is also listed in the first install cell of this notebook).
!pip install groq




**Importing the required Libraries**

In [None]:
from groq import Groq
import os
import json
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq


**Set the Groq API key (stored as a Colab secret named `GROQ_API_KEY`)**



In [None]:
# Read the Groq key from Colab's secret store and publish it through the process environment.
from google.colab import userdata

os.environ.update({"GROQ_API_KEY": userdata.get('GROQ_API_KEY')})

**Extracting text and tables from the PDF for storage**

In [None]:
# Paths to the PDF documents to ingest. Kept as a list so other documents can be added later.
pdf_files = ["/content/combined_document_10.pdf"]


def _table_to_text(table):
    """Render one extracted table as newline-separated rows of ' | '-joined cells.

    Args:
        table: A list of rows, each row a list of cell values as returned by
            ``page.extract_tables()``.

    Returns:
        A single string with one line per table row.

    Cells reported as ``None`` (empty cells) are rendered as empty strings;
    joining them directly would raise ``TypeError``.
    """
    return "\n".join(
        " | ".join("" if cell is None else str(cell) for cell in row)
        for row in table
    )


# Extract text and tables using pdfplumber.
# Author's notes on the choice of extractor:
#   - pdfplumber did the best job of extracting tables;
#   - camelot-py captured column-wise structure well but missed lateral column
#     information needed for a proper table structure;
#   - tabula-py recovered the table structure well but did not identify column
#     names properly, because the tables are not gridded in all cases.
documents = []    # text of each page that yielded any text
table_texts = []  # one formatted string per extracted table

for pdf in pdf_files:
    with pdfplumber.open(pdf) as pdf_reader:
        for page in pdf_reader.pages:
            # Extract text from the page; pages with no text are skipped.
            text = page.extract_text()
            if text:
                documents.append(text)

            # Extract tables from the same page and flatten each into text.
            tables = page.extract_tables()
            for table in tables:
                table_texts.append(_table_to_text(table))

# Combine extracted text and tables: all page text first, then all tables.
full_text = "\n".join(documents)
full_table_text = "\n".join(table_texts)
combined_text = full_text + "\n" + full_table_text  # Final combined text


**Splitting the text into chunks (they are embedded with SentenceTransformer in the next step)**

In [None]:
# Chunking step: cut the combined text into overlapping pieces ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
text_chunks = text_splitter.split_text(combined_text)

**Generating Embeddings for Chunks**

In [None]:
# Generate embeddings: load the "all-MiniLM-L6-v2" sentence-embedding model and
# encode every chunk in a single call. `embedding_model` is reused later to
# encode queries, so queries and chunks share the same embedding space.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
text_embeddings = embedding_model.encode(text_chunks)

**Creating and Storing Embeddings in FAISS**

In [None]:
# Create FAISS index: a flat L2 index whose dimensionality is taken from the
# second axis of the embedding matrix, then add all chunk embeddings to it.
# Vectors are added in chunk order, so FAISS ids line up with positions in `text_chunks`.
# NOTE(review): assumes `text_embeddings` is a non-empty 2-D array — `shape[1]` fails otherwise.
dimension = text_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(text_embeddings))


**Storing Chunks for Retrieval**

In [None]:
# Lookup table from chunk position (0-based) to chunk text, used to turn search hits back into text.
chunk_mapping = dict(enumerate(text_chunks))

**Retrieving Relevant Context from FAISS**

In [None]:
# Build the Groq client from the key previously placed in the environment.
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Function to retrieve top-k relevant chunks from FAISS
def retrieve_context(query, k=3):
    """Return the text of the chunks nearest to ``query``, joined by newlines.

    Args:
        query: The question text to embed and search for.
        k: Number of nearest neighbours to request from the index.

    Returns:
        The matching chunk texts joined with newlines, in the order the index
        returned them, or the string "No relevant data found." when no valid
        neighbour was returned.
    """
    query_embedding = embedding_model.encode([query])
    _, indices = index.search(np.array(query_embedding), k)
    # FAISS fills missing neighbours with -1 (e.g. when the index holds fewer
    # than k vectors). Without the lower bound, -1 passes the `< len(...)`
    # check and `chunk_mapping[-1]` raises KeyError.
    retrieved_chunks = [
        chunk_mapping[i] for i in indices[0] if 0 <= i < len(text_chunks)
    ]
    return "\n".join(retrieved_chunks) if retrieved_chunks else "No relevant data found."

**Querying Groq with Retrieved Context**

In [None]:
def query_groq(prompt, k=3, model="mixtral-8x7b-32768"):
    """Answer ``prompt`` with a Groq chat model, grounded on retrieved chunks.

    Args:
        prompt: The user's question.
        k: Number of chunks to retrieve as context (passed to ``retrieve_context``).
        model: Groq model id to query. The default is the id this notebook has
            always used.
            NOTE(review): confirm this model id is still served by Groq; pass a
            different id here if it has been retired.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    # Retrieve relevant chunks from FAISS
    retrieved_context = retrieve_context(prompt, k=k)

    # Create final prompt with context
    final_prompt = f"Use the following extracted data to answer the question:\n\n{retrieved_context}\n\nQuestion: {prompt}\nAnswer:"

    # Query Groq API with a single user message holding context and question
    response = groq_client.chat.completions.create(
        messages=[{"role": "user", "content": final_prompt}],
        model=model,
    )
    return response.choices[0].message.content


**Asking the 15 queries**

In [None]:
# Every evaluation question, in the order it is asked.
queries = [
    "How much did Apple spend on Research and Development in fiscal year 2018?",
    "How did Apple's Research and Development spending in fiscal year 2018 compare to 2017?",
    "What were the total iPhone sales figures (in units) for Apple in 2018?",
    "How did Apple's total iPhone sales figures (in units) in 2018 compare to 2017?",
    "How many shares did Microsoft repurchase in fiscal year 2016?",
    "What was the total amount Microsoft spent on share repurchases in fiscal year 2016?",
    "What was Apple's net sales figure for the Americas region in 2018?",
    "What percentage of Apple's total net sales did the Americas region represent in 2018?",
    "When did Microsoft acquire LinkedIn Corporation according to the quarterly information?",
    "What was the dividend per share declared by Microsoft in September 2015?",
    "What factors contributed to the increase in iPad net sales during 2018 compared to 2017?",
    "How much did Apple's Services segment contribute to total net sales in 2018?",
    "What was the year-over-year growth percentage of Apple's Services segment in 2018?",
    "What were the main components of Microsoft's 'Other Income (Expense), Net' for fiscal year 2018?",
    "What was Apple's gross margin percentage range anticipated for the first quarter of 2019?",
]

# Ask each question in turn and print the generated answer.
# `query` and `response` are kept as the loop names so they end up holding the
# last question and answer, exactly as the unrolled version left them.
for query in queries:
    response = query_groq(query)
    print("Generated Answer:", response)


Generated Answer: The provided data does not include the exact amount that Apple spent on Research and Development (R&D) in fiscal year 2018. However, it does mention that R&D expenses increased $665 million or 6% in fiscal year 2015 compared to fiscal year 2014. Additionally, it is stated that R&D expenses include payroll, employee benefits, stock-based compensation expense, and other headcount-related expenses associated with product development, as well as third-party development and programming costs, localization costs, and the amortization of purchased software code. Therefore, it can be inferred that R&D expenses include a wide range of costs related to product development.
Generated Answer: Apple's Research and Development spending in fiscal year 2018 was $14,236 million, which represents a year-over-year increase of 23% compared to $11,581 million in fiscal year 2017. This growth was primarily driven by increases in headcount-related expenses and infrastructure-related innovat