In [1]:
!pip install streamlit pyngrok transformers langchain faiss-cpu PyPDF2 sentence-transformers
!pip install -U langchain-community

Collecting streamlit
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.40.2-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Downloading faiss_cp

**First, i build the model, then put all the code in app.py script for deploying the engine on web page using streamlit app.**

**IMPORT LIBRARIES**

In [2]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import pickle
import os

**1) PDF PARSING**

In [3]:
@st.cache_resource
def load_pdfs(pdf_files):
    """Parse PDFs and extract text."""
    documents = []
    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file)
        text = "".join([page.extract_text() for page in reader.pages])
        documents.append(text)
    return documents

**2) Vector Store Initialization (Using FAISS)**

In [4]:
def initialize_vector_store(pdf_texts):
    """Create embeddings and store them in FAISS for similarity search."""
    # Split text --> chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [chunk for doc in pdf_texts for chunk in text_splitter.split_text(doc)]

    # Generate embeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(chunks, embedding_model)

    # Save vector store
    with open("vector_store.pkl", "wb") as f:
        pickle.dump(vector_store, f)

    return vector_store

**3) Retrieval QA Chain**

In [5]:
def initialize_qa_chain(_vector_store):
    """Set up the retrieval-augmented QA chain."""
    # Set up retriever
    retriever = _vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    # Configure the language model (LLM)
    llm_pipeline = pipeline(
        "text-generation",
        model="gpt2",
        tokenizer="gpt2",
        max_length=1024,
        max_new_tokens=200,
        pad_token_id=50256
    )
    llm = HuggingFacePipeline(pipeline=llm_pipeline)

    # Create the QA chain
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return qa_chain

**4) Streamlit Interface**

In [6]:
st.title("10-K Document QA System")
st.write("Ask questions about the financial documents (Google, Tesla, Uber).")

pdf_files = [
    "/content/drive/MyDrive/ALMENO_AI/goog-10-k-2023 (1).pdf",  # Path to Google 10-K PDF
    "/content/drive/MyDrive/ALMENO_AI/tsla-20231231-gen.pdf",   # Path to Tesla 10-K PDF
    "/content/drive/MyDrive/ALMENO_AI/uber-10-k-2023.pdf"     # Path to Uber 10-K PDF
]

# Step 2: Process PDFs or Load Vector Store
if os.path.exists("vector_store.pkl"):
    # Load  vector store
    with open("vector_store.pkl", "rb") as f:
        vector_store = pickle.load(f)
    st.write("Vector store loaded from saved data!")
else:
    # ParsING
    pdf_texts = load_pdfs(pdf_files)
    vector_store = initialize_vector_store(pdf_texts)
    st.write("Vector store created and saved successfully!")

# Step 3: Initialize QA Chain
qa_chain = initialize_qa_chain(vector_store)

# Step 4: User Query and Display Response
query = st.text_input("Enter your question:", "What are the risk factors associated with Google and Tesla?")
if st.button("Get Answer"):
    with st.spinner("Retrieving the best answer..."):
        response = qa_chain.run(query)
    st.write("### Response:")
    st.write(response)

2024-11-26 13:52:33.315 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=llm_pipeline)
2024-11-26 13:58:29.073 Session state does not function when running a script without `streamlit run`


**Two questions below for testing.**

In [7]:
query =  "What is the total revenue for Google Search?"
response = qa_chain.run(query)


  response = qa_chain.run(query)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=200) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [8]:
# Extract the Helpful Answer part from the response
start = response.find("Helpful Answer:")
if start != -1:
    # Extract everything after "Helpful Answer:"
    helpful_answer = response[start + len("Helpful Answer:"):].strip()
    print(helpful_answer)
else:
    print("Helpful Answer not found.")

Google Network 
Operating expenses revenue was $2.0 billion  from 2022  to 2023. The revenues have increased 
in line with the percentage of revenues attributable to revenue from ad campaigns, as well as to operating expenses, but there was no change in revenue or cost per share of , and
revenues from the advertiser is more sensitive under the ad bundling system. The Google Search and other revenues have increased , and Revenue from the advertiser is less sensitive under the ad bundling system as , but other revenues from the advertiser have decreased, and Revenue from the advertiser was more sensitive under the ad bundling system as . In January 2017, we reported that Google Search Traffic increased $1.0 billion  from , and Revenue from the advertiser is less sensitive under the ad bundling system as , but other revenues from the advertiser increased  through


In [9]:
query =  "What are the differences in the business of Tesla and Uber?"
response = qa_chain.run(query)


Both `max_new_tokens` (=200) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [10]:
# Extract the Helpful Answer part from the response
start = response.find("Helpful Answer:")
if start != -1:
    # Extract everything after "Helpful Answer:"
    helpful_answer = response[start + len("Helpful Answer:"):].strip()
    print(helpful_answer)
else:
    print("Helpful Answer not found.")

Tesla (“Tesla,“““), Tesla

The company originally began operation at Tesla Tech Center in 1984. The company is based in Fremont, California with operations on

and on Tesla's headquarters. While Tesla has produced products in general and automotive

production, they are primarily about building and maintaining the Model S and Model X,

and are primarily headquartered in Fremont, California. Tesla's largest business activities are making batteries (Tesla's

manufacturing), vehicles, and services to the automotive industry and to consumers, and offering

marketplaces such as AutoLand.

Tesla's business involves selling

parts and services, as well as making other

business-related products. In a first-of-its-kind business, Tesla builds, develops, manufactures

and sells the means for bringing products for sale to market. Tesla began as

a non-public company with limited liability

claim


**Code for extracting relevant answer part**

In [None]:
# Extract the Helpful Answer part from the response
start = response.find("Helpful Answer:")
if start != -1:
    # Extract everything after "Helpful Answer:"
    helpful_answer = response[start + len("Helpful Answer:"):].strip()
    print(helpful_answer)
else:
    print("Helpful Answer not found.")

Our relationship with the Elon Musk organization reflects both our commitment to our shared vision at Tesla in delivering the world's best products and services to consumers, which reflects our values as a business.

Tesla Technologies, Inc. operates at an operating capital of $2,350 million. The actual gross revenue from Tesla was $527 million. The net revenue from our business operations was $876 million. In fiscal year 2010, the Company reported that our operating income was $1,065 million.

Uber Technologies, Inc. was incorporated in Delaware, and is located in San Francisco, California. The initial public offering (IPOs) of our common stock commenced on June 30, 2011 and ended on June 30, 2017. Uber Technologies, Inc. is a leader and innovator in the mobile telecommunications and mobile gaming market. We also provide services and technology products including the UberApp, UberApp 2.0, and UberXR.

We have invested in Tesla's lithium


In [9]:
#PUT The below code in app.py and change the PDF directories accordingly.

%%writefile app.py
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from PyPDF2 import PdfReader
import pickle
import os

@st.cache_resource
def load_pdfs(pdf_paths):
    """Parse hardcoded PDFs and extract their text."""
    documents = []
    for pdf_file in pdf_paths:
        reader = PdfReader(pdf_file)
        text = "".join([page.extract_text() for page in reader.pages])
        documents.append(text)
    return documents

@st.cache_resource
def initialize_vector_store(pdf_texts):
    """Create embeddings and store them in FAISS for similarity search."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = [chunk for doc in pdf_texts for chunk in text_splitter.split_text(doc)]

    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(chunks, embedding_model)

    with open("vector_store.pkl", "wb") as f:
        pickle.dump(vector_store, f)

    return vector_store

@st.cache_resource
def initialize_qa_chain(_vector_store):
    """Set up the retrieval-augmented QA chain."""
    retriever = _vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    llm_pipeline = pipeline(
        "text-generation",
        model="gpt2",
        tokenizer="gpt2",
        max_length=1024,
        max_new_tokens=200,
        pad_token_id=50256
    )
    llm = HuggingFacePipeline(pipeline=llm_pipeline)
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return qa_chain

# Streamlit Interface
st.title("10-K Document QA System")
st.write("Ask questions about the financial documents (Google, Tesla, Uber).")

# Step 1: Hardcoded PDF Paths
pdf_files =  ["/content/drive/MyDrive/ALMENO_AI/goog-10-k-2023 (1).pdf",  # Path to Google 10-K PDF
    "/content/drive/MyDrive/ALMENO_AI/tsla-20231231-gen.pdf",   # Path to Tesla 10-K PDF
    "/content/drive/MyDrive/ALMENO_AI/uber-10-k-2023.pdf" ]    # Path to Uber 10-K PDF


# Step 2: Process PDFs and vector store
if os.path.exists("vector_store.pkl"):
    with open("vector_store.pkl", "rb") as f:
        vector_store = pickle.load(f)
    st.write("Vector store loaded from saved data!")
else:
    pdf_texts = load_pdfs(pdf_files)
    vector_store = initialize_vector_store(pdf_texts)
    st.write("Vector store created and saved successfully!")

# Step 3: Initialize QA Chain
qa_chain = initialize_qa_chain(vector_store)

# Step 4: User Query
query = st.text_input("Enter your question:", " ")
if st.button("Get Answer"):
    with st.spinner("Retrieving the best answer..."):
        response = qa_chain.run(query)
    st.write("### Response:")
    # st.write(response)

    # Extract the Helpful Answer part from the response
    start = response.find("Helpful Answer:")
    if start != -1:
        helpful_answer = response[start + len("Helpful Answer:"):].strip()
        st.write(helpful_answer)
    else:
        print("Helpful Answer not found.")


Writing app.py


***After building the content engine. We deploy it on an UI, using Streamlit.***

*We use Localtunnel to expose streamlit app online.*

In [12]:
! pip install streamlit -q

In [13]:
!pip install streamlit pyngrok --quiet
!npm install -g localtunnel

[K[?25h
changed 22 packages, and audited 23 packages in 3s

3 packages are looking for funding
  run `npm fund` for details

1 [33m[1mmoderate[22m[39m severity vulnerability

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.


In [10]:
!wget -q -O - ipv4.icanhazip.com #To get the tunnel Passcode

34.75.52.88


In [11]:
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.75.52.88:8501[0m
[0m
your url is: https://twenty-friends-chew.loca.lt

>> from langchain.embeddings import HuggingFaceEmbeddings

with new imports of:

>> from langchain_community.embeddings import HuggingFaceEmbeddings
You can use the langchain cli to **automatically** upgrade many imports. Please see documentation here <https://python.langchain.com/docs/versions/v0_2/>
  from langchain.embeddings import HuggingFaceEmbeddings

>> from langchain.vectorstores import FAISS

with new imports of:

>> from langchain_community.vectorstores import FAISS
You can use the langchain cli to **automatically** upgrade many imports. Please see documentation here <https://python.langchain.co