### Load LLM model

In [12]:

from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain.llms import OpenAI
import google.generativeai as genai
import os
from dotenv import find_dotenv,load_dotenv

from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# PDF reader
from pypdf import PdfReader
# Support for dataset retrieval with Hugging Face
from datasets import load_dataset
# Astra DB integration
import cassio

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

load_dotenv()  # take environment variables from .env.
GOOGLE_API_KEY=os.getenv("GOOGLE_API_KEY")
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
ASTRA_DB_ID=os.getenv("ASTRA_DB_ID")
ASTRA_DB_APPLICATION_TOKEN=os.getenv("ASTRA_DB_APPLICATION_TOKEN")

llm=GoogleGenerativeAI(model="gemini-pro", temperature=0.0,api_key=GOOGLE_API_KEY)
embedding = GoogleGenerativeAIEmbeddings(model = "models/embedding-001",api_key=GOOGLE_API_KEY)

# genai.configure(api_key=GOOGLE_API_KEY)

# llm=OpenAI(model="gpt-3.5-turbo-0125",temperature=0.5,openai_api_key=os.environ["OPENAI_API_KEY"])
# llm.invoke("capital of china")

### Read PDF

In [50]:
from langchain.text_splitter import CharacterTextSplitter
import glob

# pdfreader=PdfReader('segregationforce.pdf')
# raw_text=''
# for page in pdfreader.pages:
#     content=page.extract_text()
#     if content:
#         raw_text+=content


# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
# )
# chunks = text_splitter.split_text(raw_text)

class readpdf:
    def __init__(self, pdf_docs) :
        self.pdf_docs=pdf_docs
        self.text = ""

    def get_pdf_text(self):
        for pdf in self.pdf_docs:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                content=page.extract_text() 
                if content:
                    self.text += content 

    def get_text_chunks(self):
        text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=2000,
        chunk_overlap=500,
        length_function=len
        )
        chunks = text_splitter.split_text(self.text)
        return chunks
    
pdf_files = glob.glob("./pdfs/*.pdf")
text = readpdf(pdf_files)
text.get_pdf_text()
chunks=text.get_text_chunks()


Created a chunk of size 1068, which is longer than the specified 1000
Created a chunk of size 1337, which is longer than the specified 1000
Created a chunk of size 1361, which is longer than the specified 1000
Created a chunk of size 1358, which is longer than the specified 1000
Created a chunk of size 1420, which is longer than the specified 1000
Created a chunk of size 1272, which is longer than the specified 1000
Created a chunk of size 1205, which is longer than the specified 1000
Created a chunk of size 1123, which is longer than the specified 1000
Created a chunk of size 1347, which is longer than the specified 1000
Created a chunk of size 2808, which is longer than the specified 1000
Created a chunk of size 3985, which is longer than the specified 1000
Created a chunk of size 3357, which is longer than the specified 1000
Created a chunk of size 4028, which is longer than the specified 1000
Created a chunk of size 2982, which is longer than the specified 1000
Created a chunk of s

In [51]:
len(chunks)

8076

### Create local vector store

In [52]:
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_texts(texts=chunks, embedding=embedding)
vectorstore.save_local("faiss_index")

### Create vector store in astradb

In [7]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN,database_id=ASTRA_DB_ID)
astra_vector_store=Cassandra(
    embedding=embedding,
    table_name="demo",
    session=None,
    keyspace=None,
)

astra_vector_store.add_texts(chunks)
print("Inserted %i headlines." % len(chunks))
astra_vector_index=VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 4055 headlines.


In [58]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

prompt_template = """
You are a researcher know how to write scientific paper on topics related to STEM. 
I will share related texts from previous documents with you and you will rewrite the provided texts in an academic language.

1/ the generated texts should be very similar to the style of the documents, 
in terms of ton of voice, logical arguments and other details

2/ If the past texts are irrelevant, then try to mimic the style of the documents to rewrite the paragraph

Below is the texts I want to rewrite:
{paragraph}

Here is a list of previous documents:
{document}

Please rewrite the paragraph:
    """
prompt=PromptTemplate(template=prompt_template,input_variables=["paragraph","document"])
chain = LLMChain(llm=llm, prompt=prompt)

query_text="particle size segregation and diffusive remixing"
# docs=astra_vector_store.similarity_search(query_text,k=4)

new_db = FAISS.load_local("faiss_index", embedding)
docs = new_db.similarity_search(query_text,k=5)
response=chain.run(paragraph=query_text,document=docs)
# answer=astra_vector_index.query(query_text,llm=llm).strip()
print(response)
print(docs)

**Particle Size Segregation and Diffusive Remixing**

Particle size segregation is a phenomenon observed in granular materials, where particles of different sizes tend to separate and form distinct layers or clusters within the material. This segregation can occur due to various mechanisms, including differences in particle size, density, shape, and surface properties. In a rotating tumbler, particle size segregation is often driven by a combination of centrifugal forces and gravitational forces. Larger particles tend to move towards the periphery of the tumbler due to centrifugal forces, while smaller particles tend to accumulate near the center due to gravitational forces.

Diffusive remixing is a process that counteracts particle size segregation and promotes the mixing of particles of different sizes. This process occurs due to the random motion of particles within the granular material. As particles move, they collide with each other and exchange momentum, which can lead to the mi

In [59]:
docs

[Document(page_content='Granular segregation in circular tumblers633'),
 Document(page_content='Here we study the segregation of particles of different'),
 Document(page_content='Diffusion dominated Segregation dominated Advection dominated\n10110010–110–210–1100101102103'),
 Document(page_content='Regimes of segregation and mixing in combined size and density\ngranular systems: an experimental study\nReceived: 18 May 2004 / Published online: 10 March 2005\n© Springer-Verlag 2005\nAbstract Granular segregation in a rotating tumbler occurs'),
 Document(page_content='International Journal of Multiphase Flow 30 (2004) 419–428www.elsevier.com/locate/ijmulﬂow')]

### Run QA cycle

In [6]:
first_question=True
while True:
    if first_question:
        query_text=input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text=input("\nWhat's your next question (or type 'quit' to exit): ").strip()   
    if query_text.lower()=='quit':
        break
    if query_text=="":
        continue

    first_question=False 
    print("\nQUESTION: \"%s\"" % query_text)

    answer=astra_vector_index.query(query_text,llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc,score in astra_vector_store.similarity_search_with_score(query_text,k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "who is the author"
ANSWER: "I do not know the answer to this question."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8519] "ﬁgure 1 (a). The standard linear spring-dashpot model (Cundall & Strack 1979)i su s  ..."
    [0.8485] "varied to adjust the size ratio, dl/ds)h a v ea ±10 % uniform size distribution to m ..."
    [0.8426] "with size ratio for the range explored here (1 <dl/ds⩽3) and is reasonably well ﬁt b ..."
    [0.8377] "ZIFF,R . M .&T ORQUATO , S. 2017 Percolation of disordered jammed sphere packings. J ..."
