In [1]:
import streamlit as st
import tempfile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain.chains import ConversationalRetrievalChain
from ctransformers import AutoModelForCausalLM,AutoConfig


In [3]:
config = AutoConfig.from_pretrained("llama-2-7b-chat.ggmlv3.q8_0.bin")
# Explicitly set the max_seq_len
config.max_seq_len = 4096
config.max_answer_len= 1024

In [5]:
DB_FAISS_PATH = 'vectorstore/db_faiss'
prompts = []


In [6]:
# Loading the locally downloaded model
def load_llm():
    llm = CTransformers(
        model="llama-2-7b-chat.ggmlv3.q8_0.bin",
        model_type="llama",
        max_new_tokens=2048,
        temperature=0.2,
        max_length=2048,
        top_p = .95,
        top_k = 40
    )
    return llm

In [7]:
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata if metadata is not None else {}

In [11]:
uploaded_file = 'ConceptsofBiology-WEB.pdf'

if uploaded_file:
    # Read the PDF file
    pdfReader = PdfReader(uploaded_file)
    documents = []
    for page in pdfReader.pages[:68]:  # Only take first 2 chapter
        page_text = page.extract_text()
        documents.append(Document(page_text))

In [21]:
# Split the text into Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
text_chunks = text_splitter.split_documents(documents)

In [22]:
# Create the embeddings and vectorstore
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})
db = FAISS.from_documents(text_chunks, embeddings)
db.save_local(DB_FAISS_PATH)

In [23]:
# Load the model and set up the retrieval chain
llm = load_llm()
chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=db.as_retriever())

In [32]:
# Define a function to handle  chat
def conversational_chat(query):
    max_length = 2048
    chat_history = [] 
    answer = ""
    chunks = [query[i:i+max_length] for i in range(0, len(query), max_length)]
    for chunk in chunks:
        result = chain({"question": chunk, "chat_history": chat_history})
        answer += result["answer"]
    return answer

user_input = "what is this file all about?"
if user_input:
    output = conversational_chat(user_input)
    print(output)

 This file appears to be related to research on large datasets and the use of Creative Commons licenses. The mention of "data research" and "in silico research" suggests that the file may be related to the analysis of large data sets using computational methods, rather than traditional experimental approaches.
