In [127]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [128]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/pdfs-database/tsla-20231231-gen.pdf
/kaggle/input/pdfs-database/goog-10-k-2023.pdf
/kaggle/input/pdfs-database/uber-10-k-2023.pdf


In [129]:
pip install langchain langchain-community chromadb llama-index sentence-transformers PyPDF2 InstructorEmbedding

Note: you may need to restart the kernel to use updated packages.


In [130]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from llama_index.core import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextStreamer, pipeline, AutoModelForCausalLM
from langchain import HuggingFacePipeline, PromptTemplate

In [131]:
pdf_directory = "/kaggle/input/pdfs-database"
loader = PyPDFDirectoryLoader(pdf_directory)
docs = loader.load()

In [132]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-distilbert-base-v2")

In [142]:
persist_directory = "/kaggle/working/db"

if not os.path.exists(persist_directory):
    print("Creating Chroma database at:", persist_directory)
    db = Chroma.from_documents(texts, embedding_model, persist_directory=persist_directory)
else:
    print("Loading existing Chroma database from:", persist_directory)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

Loading existing Chroma database from: /kaggle/working/db


In [134]:
model_name = "facebook/bart-base"  # Example model - adjust as needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
[INST] <>
{system_prompt}
<>

{prompt} [/INST]
""".strip()

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
    num_beams=1,  # Disable beam search by setting num_beams to 1
)

llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})

SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."

template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

The model 'BartForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'Meg

In [141]:
result = qa_chain("What are the risk factors associated with Google and Tesla?")
print(result)

InvalidDimensionException: Embedding dimension 768 does not match collection dimensionality 384

In [137]:
result

{'query': 'What are the risk factors associated with Google and Tesla?',
 'result': "[INST] <>\nUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n<>\n\n\nthese\trisks,\tour\tbusiness,\tfinancial\tcondition\tand\toperating\tresults\tmay\tbe\tharmed.\nEmployees\tmay\tleave\tTesla\tor\tchoose\tother\temployers\tover\tTesla\tdue\tto\tvarious\tfactors,\tsuch\tas\ta\tvery\tcompetitive\tlabor\tmarket\tfor\ttalented\nindividuals\twith\tautomotive\tor\ttechnology\texperience,\tor\tany\tnegative\tpublicity\trelated\tto\tus.\tIn\tregions\twhere\twe\thave\tor\twill\thave\toperations,\nparticularly\tsignificant\tengineering\tand\tmanufacturing\tcenters,\tthere\tis\tstrong\tcompetition\tfor\tindividuals\twith\tskillsets\tneeded\tfor\tour\tbusiness,\tincluding\nspecialized\tknowledge\tof\telectric\tvehicles,\tengineering\tand\telectrical\tand\tbuilding\tconstruction\texpertise.\tWe\talso\