## The LLM bot

In [5]:
!module load python
!pip list
!source ../../bin/activate
!pip install langchain langchain_community langchain_core scholarly


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFPlumberLoader, TextLoader
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama
import json
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
import requests
from scholarly import scholarly



[0mPackage                   Version
------------------------- --------------
aiohttp                   3.9.5
aiosignal                 1.3.1
annotated-types           0.7.0
anyio                     4.4.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.15.0
beautifulsoup4            4.12.3
bleach                    6.1.0
certifi                   2024.7.4
cffi                      1.16.0
charset-normalizer        3.3.2
comm                      0.2.2
debugpy                   1.8.2
decorator                 5.1.1
defusedxml                0.7.1
executing                 2.0.1
fastjsonschema            2.20.0
fqdn                      1.5.1
frozenlist                1.4.1
greenlet                  3.0.3
h11                       0.14.0
httpcore                  1.0.5
httpx                     0.27.0
idna         

In [11]:

class Bot_LLM:
    """Wrapper around an Ollama LLM, Ollama embeddings and a Chroma vector store.

    It offers two features: tagging a publication abstract with topics taken
    from a JSON file, and retrieval-augmented question answering over documents
    indexed with :meth:`rag`.
    """

    def __init__(self,model='llama3',embed_model='mxbai-embed-large', folder_path='db2'):
        """
        Args:
            model (str): Name of the Ollama model used for generation.
            embed_model (str): Name of the Ollama model used for embeddings.
            folder_path (str): Directory passed to Chroma as ``persist_directory``.
        """
        self.llm = Ollama(model=model)
        self.oembed = OllamaEmbeddings(model=embed_model)
        self.folder_path = folder_path
        # Filled in by rag(). The name must match the one read in query_rag(),
        # otherwise querying before indexing fails with an AttributeError
        # instead of the intended "No documents loaded" error.
        self.vectorstore = None

    def get_topic_publication_abstract(self, abstract:str, input_file:str):
        """Ask the LLM which topics of ``input_file`` apply to ``abstract``.

        Args:
            abstract (str): Text to classify.
            input_file (str): Path to a JSON file holding an object; its
                top-level keys are offered to the LLM as candidate topics.

        Returns:
            The value stored under the ``'topic'`` key of the parsed LLM answer.

        Raises:
            KeyError: If the parsed answer has no ``'topic'`` key.
        """
        with open(input_file, 'r') as file:
            data = json.load(file)

        parser = JsonOutputParser()

        # The example instance at the end must itself be valid JSON, since it
        # is what the model is told to imitate.
        format_instructions = """The output should be formatted as a JSON instance that conforms to the JSON schema below.

        As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
        the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

        Here is the output schema:
        ```
        {"topic": {"Machine Learning": ["keyword1", "keyword2", "keyword3"], "Batteries": ["keyword1", "keyword2", "keyword3"]}}
        ```
        """

        prompt = PromptTemplate(
            template="Here is a text: {abstract} Please identify the topics from the following list: {liste}. Note: A single text can belong to multiple topics, so please list all relevant topics.  \n{format_instructions}",
            # Only the placeholders that are supplied at invoke() time belong here.
            input_variables=["abstract", "liste"],
            partial_variables={"format_instructions": format_instructions},
        )

        chain = prompt | self.llm | parser
        # A plain list renders as "['a', 'b']" in the prompt rather than "dict_keys([...])".
        topics = chain.invoke({"abstract": abstract, "liste": list(data.keys())})
        print('Topics: ', topics['topic'])
        return topics['topic']

    def rag(self, document, chunk_size:int=500, chunk_overlap:int=20):
        """Index ``document`` into the Chroma vector store.

        Args:
            document: Either the path of a single file (tried as a PDF first,
                then as plain text) or an iterable of plain-text file paths.
            chunk_size (int): Maximum size of each chunk produced by the splitter.
            chunk_overlap (int): Overlap between consecutive chunks.
        """
        data = None
        if isinstance(document, str):
            try:
                data = PDFPlumberLoader(document).load()
            except Exception:
                # Not readable as a PDF: fall back to the plain-text loader below.
                data = None

        if data is None:
            paths = [document] if isinstance(document, str) else list(document)
            data = [page for path in paths for page in TextLoader(path).load()]

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        all_splits = text_splitter.split_documents(data)
        self.vectorstore = Chroma.from_documents(documents=all_splits, embedding=self.oembed, persist_directory=self.folder_path)

    def query_rag(self, question:str) -> None:
        """Answer ``question`` from the indexed documents and print the result.

        Raises:
            RuntimeError: If no documents have been indexed with :meth:`rag`.
        """
        if not self.vectorstore:
            raise RuntimeError("No documents loaded")

        from langchain.chains import RetrievalQA
        qachain = RetrievalQA.from_chain_type(self.llm, retriever=self.vectorstore.as_retriever(), verbose=True)
        res = qachain.invoke({"query": question})
        print(res['result'])





ModuleNotFoundError: No module named 'langchain'

## The Publication class


In [11]:


class Publication:
    """View over a filled publication record, plus PDF lookup and download.

    ``publication_filled`` is expected to be a mapping with a ``'bib'``
    sub-mapping (title, abstract, author, pub_year, citation) and an optional
    ``'pub_url'`` entry. Any missing field is exposed as ``None``.
    """

    # Seconds to wait for an HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self,publication_filled, llm_use:bool=True) -> None:
        """
        Args:
            publication_filled: Mapping describing the publication (see class docstring).
            llm_use (bool): Accepted for backward compatibility; currently unused.
        """
        self.publication_filled = publication_filled
        self.title = self.get_publication_title()
        abstract = self.get_publication_abstract()
        self.abstract = abstract.lower() if abstract is not None else None
        self.author = self.get_author_name()
        self.year = self.get_year()
        self.url = self.get_publication_url()
        self.citation = self.get_citation()
        self.pdf = self.get_pdf()
        self.topic = None

    # NOTE: an earlier, disabled variant of get_topic (kept as a string literal)
    # first looked the topic up in a JSON file keyed by author; it was dead code
    # and has been removed.

    def _bib_field(self, key):
        """Return ``publication_filled['bib'][key]`` or ``None`` when absent."""
        return self.publication_filled.get('bib', {}).get(key)

    def get_publication_url(self) -> str:
        """Return the publication URL, or ``None`` when the record has none."""
        return self.publication_filled.get('pub_url')

    def get_publication_title(self) -> str:
        """Return the title, or ``None`` when the record has none."""
        return self._bib_field('title')

    def get_publication_abstract(self) -> str:
        """Return the abstract as stored, or ``None`` when the record has none."""
        return self._bib_field('abstract')

    def get_author_name(self) -> str:
        """Return the author field, or ``None`` when the record has none."""
        return self._bib_field('author')

    def get_year(self) -> str:
        """Return the publication year, or ``None`` when the record has none."""
        return self._bib_field('pub_year')

    def get_citation(self) -> str:
        """Return the citation field, or ``None`` when the record has none."""
        return self._bib_field('citation')

    def get_topic(self,llm,input_file="json/response.json") -> dict:
        """Classify the abstract with ``llm`` and cache the result in ``self.topic``.

        Args:
            llm: Object exposing ``get_topic_publication_abstract(abstract=..., input_file=...)``.
            input_file (str): Path forwarded to the LLM helper.

        Returns:
            Whatever the LLM helper returns (also stored in ``self.topic``).

        Raises:
            ValueError: If the publication has no abstract to classify.
        """
        if self.abstract is None:
            raise ValueError("Cannot determine topics: the publication has no abstract")
        self.topic: dict = llm.get_topic_publication_abstract(abstract=self.abstract,input_file=input_file)
        return self.topic

    def get_pdf(self):
        """Search Google Scholar for the title and return the first PDF link found.

        Returns:
            The link as a string, or ``None`` when there is no title, the request
            fails, the status code is not 200, or no PDF link is present.
        """
        if not self.title:
            return None
        try:
            # Passing the title through `params` URL-encodes spaces and punctuation.
            response = requests.get("https://scholar.google.com/scholar",
                                    params={"q": self.title},
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch the page: {e}")
            return None

        if response.status_code != 200:
            print(f"Failed to fetch the page. Status code: {response.status_code}")
            return None

        try:
            return self.__parse_google_scholar(response.text)
        except Exception:
            # Best effort: an unparsable page simply means "no PDF link".
            return None

    def __repr__(self) -> str:
        """Return the attribute dictionary as a string, with 'N/A' for missing values."""
        names = ('title', 'abstract', 'author', 'year', 'url', 'citation', 'pdf', 'topic')
        self.available_attributes = {}
        for name in names:
            value = getattr(self, name)
            self.available_attributes[name] = value if value is not None else 'N/A'
        return str(self.available_attributes)

    def __parse_google_scholar(self,html_content):
        """Return the first ``<a href>`` whose target contains '.pdf', else ``None``.

        Uses the standard-library HTML parser, so no third-party import is needed.
        """
        from html.parser import HTMLParser

        class _PdfLinkFinder(HTMLParser):
            """Remembers the first anchor href that contains '.pdf'."""

            def __init__(self):
                super().__init__()
                self.pdf_link = None

            def handle_starttag(self, tag, attrs):
                if self.pdf_link is not None or tag != 'a':
                    return
                href = dict(attrs).get('href')
                if href and '.pdf' in href:
                    self.pdf_link = href

        finder = _PdfLinkFinder()
        finder.feed(html_content)
        finder.close()

        if finder.pdf_link is not None:
            print(f"PDF link found: {finder.pdf_link}")
        return finder.pdf_link

    def download_pdf(self,path):
        """Download the publication PDF to ``path + <title> + '.pdf'``.

        When a direct PDF link is known it is fetched over HTTP; otherwise
        ``scidownl`` is tried by title and then by the publication URL.

        Args:
            path (str): Prefix of the output file (typically a directory ending in '/').
        """
        import requests

        # Path separators inside a title would otherwise be read as directories.
        safe_title = (self.title or 'untitled').replace('/', '_').replace('\\', '_')
        path = path + safe_title + ".pdf"

        if self.pdf is not None:
            try:
                response = requests.get(self.pdf, timeout=self.REQUEST_TIMEOUT)
                if response.status_code == 200:
                    with open(path, 'wb') as file:
                        file.write(response.content)
                    print(f"PDF successfully downloaded and saved to {path}")
                else:
                    print(f"Failed to download the PDF. HTTP Status Code: {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"An error occurred while downloading the PDF: {e}")
            return

        from scidownl import scihub_download
        try:
            # scidownl by title
            scihub_download(self.title, paper_type="title", out=path)
        except Exception:
            # self.pdf is None in this branch, so the URL fallback has to use
            # the publication page URL instead.
            if self.url is None:
                print("Couldn't download the PDF")
                return
            try:
                scihub_download(self.url, out=path)
            except Exception:
                print("Couldn't download the PDF")
           

        
        
    
     
                    
            
        

## The Author class



In [12]:



class Author:
    """An author looked up by name through ``scholarly``."""

    def __init__(self, author):
        """
        Initialize an Author object.

        Args:
            author (str): The name of the author.
        """
        self.author_name = author

    def __repr__(self):
        """
        Return a string representation of the Author object.

        Returns:
            str: The name of the author.
        """
        return self.author_name

    @staticmethod
    def _publication_year(publication):
        """Return the publication year as an int, or 0 when missing or not numeric."""
        try:
            return int(publication['bib']['pub_year'])
        except (KeyError, TypeError, ValueError):
            return 0

    def get_last_publication(self):
        """
        Get the last publication of the author.

        The first author returned by the search is used. Among that author's
        publications the one with the highest year is chosen; the first such
        entry wins on ties, and entries without a usable year count as year 0.

        Returns:
            dict: A dict containing information about the last publication.

        Raises:
            ValueError: If the search returns no author, or the author has no
                publications.
        """
        search_query = scholarly.search_author(self.author_name)
        first_author_result = next(search_query, None)
        if first_author_result is None:
            raise ValueError(f"No author found for {self.author_name!r}")

        author = scholarly.fill(first_author_result)
        publications = author.get('publications') or []
        if not publications:
            raise ValueError(f"No publications found for {self.author_name!r}")

        # max() returns the first maximal element, which is the same entry a
        # stable descending sort would put in position 0.
        latest_publication = max(publications, key=self._publication_year)
        return scholarly.fill(latest_publication)

    def setup_author(self, output_file, llm):
        """
        Setup the author by adding their last publication to a JSON file.

        The entry is stored under the author's name and replaces any previous
        entry for that name. A missing file is treated as an empty JSON object.

        Args:
            output_file (str): The path to the JSON file.
            llm: Object passed to ``Publication.get_topic`` to compute the topics.

        Returns:
            None
        """
        try:
            with open(output_file, 'r') as file:
                data = json.load(file)
        except FileNotFoundError:
            # First run: start a new file instead of failing.
            data = {}

        author_last_publication = Publication(self.get_last_publication())

        data[self.author_name] = {
            "title": author_last_publication.title,
            "abstract": author_last_publication.abstract,
            "topic": author_last_publication.get_topic(llm=llm),
            "author": author_last_publication.author,
            "year": author_last_publication.year,
            "url": author_last_publication.url,
            "pdf": author_last_publication.pdf,
        }

        with open(output_file, 'w') as file:
            json.dump(data, file)

# Disabled usage example, kept as a bare triple-quoted string instead of real
# comments. Being the last expression of the cell, the string itself is what
# gets echoed as the cell output.
'''
author = Author('Mehrad Ansari')
pub = Publication(author.get_last_publication())

pub.download_pdf('pdfs/')
'''


"\nauthor = Author('Mehrad Ansari')\npub = Publication(author.get_last_publication())\n\npub.download_pdf('pdfs/')\n"

## Main cell: end-to-end test of the classes above


In [13]:

def _show_attributes(publication):
    """Print the attribute header followed by the publication's representation."""
    print('current available attributes: ')
    print(publication)


# Step 1: create the author from their name.
author = Author('Mehrad Ansari')
print("Author object created successfully")

# Step 2: fetch the most recent publication and wrap it.
pub = Publication(author.get_last_publication())
print("Publication object created successfully - Having fetched the last publication")
_show_attributes(pub)

# Step 3: let the LLM tag the abstract, then show the enriched publication.
print('Running LLM on the abstract of the publication')
pub.get_topic(llm=Bot_LLM())
_show_attributes(pub)





Author object created successfully
PDF link found: https://chemrxiv.org/engage/api-gateway/chemrxiv/assets/orp/resource/item/640a28a76642bf8c8f413f32/original/history-agnostic-battery-degradation-inference.pdf
Publication object created successfully - Having fetched the last publication
current available attributes: 
{'title': 'History-agnostic battery degradation inference', 'abstract': 'lithium-ion batteries (libs) have attracted widespread attention as an efficient energy storage device on electric vehicles (ev) to achieve emission-free mobility. however, the performance of libs deteriorates with time and usage, and the state of health of used batteries are difficult to quantify. having accurate estimations of a battery’s remaining life across different life stages would benefit maintenance, safety, and serve as a means of qualifying used batteries for second-life applications. since the full history of a battery may not always be available in downstream applications, in this study,