# Analysis of Abstracts Using a Large Language Model (LLM)

## Introduction

In this Jupyter Notebook, we will perform an in-depth analysis of abstracts extracted from a CSV file using a Large Language Model (LLM). The goal of this analysis is to leverage the capabilities of LLMs to extract meaningful insights, identify key themes, and perform various natural language processing (NLP) tasks on the abstracts.

### Objectives

- **Data Loading**: Import and preprocess abstracts from a CSV file.
- **Text Analysis**: Utilize LLMs to analyze the content of the abstracts.

### Tools and Libraries

- **LangChain**: To interface with the LLM.

### Workflow

1. **Data Import**: Load the CSV file containing the abstracts.
2. **LLM Integration**: Use the LLM to perform various NLP tasks.

By the end of this notebook, you will have a comprehensive understanding of how to use LLMs for analyzing textual data and extracting valuable insights from scientific abstracts.

1. **Data Import**: Load the CSV file containing the abstracts.


In [6]:
import csv
from langchain_community.llms import Ollama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from downloader import *


In [12]:
def read_csv(file_path, num_lines):
    """Load abstracts from a CSV file into a dict keyed by DOI.

    The first row is treated as a header and skipped.  For every other row,
    column index 17 is used as the DOI and column index 4 as the abstract.
    Rows that are too short to contain the DOI column (blank lines,
    truncated records) are skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of the CSV file to read.
        num_lines: Maximum number of CSV rows to consume, header included,
            so at most ``num_lines - 1`` records are returned.

    Returns:
        A dict mapping each DOI to ``{"abstract": <abstract text>}``.  If a
        DOI appears in several rows, the last one wins.
    """
    doi_col, abstract_col = 17, 4
    dico = {}
    # newline='' is what the csv module expects, so that line breaks inside
    # quoted fields are parsed by the reader rather than by the file object.
    with open(file_path, 'r', encoding="utf8", errors='ignore', newline='') as file:
        for i, row in enumerate(csv.reader(file)):
            if i >= num_lines:
                break
            if i == 0:
                # Header row.
                continue
            if len(row) <= doi_col:
                # Blank or truncated row: nothing usable to index.
                continue
            dico[row[doi_col]] = {"abstract": row[abstract_col]}
    print(dico)
    return dico

file_path = "filtered_AC_publications.csv"
num_lines = 5
dico = read_csv(file_path, num_lines)

## Appending the DOI of every loaded record to pdfs.txt, one per line:
# The file is opened once for the whole batch instead of once per DOI; the
# guard keeps the previous behaviour of not creating pdfs.txt when nothing
# was loaded.
if dico:
    with open("pdfs.txt", "a") as f:
        f.writelines(key + "\n" for key in dico)


{'10.1038/s41586-020-2746-2': {'abstract': 'The genetic circuits that allow cancer cells to evade destruction by the host immune system remain poorly understood1\x963. Here, to identify a phenotypically robust core set of genes and pathways that enable cancer cells to evade killing mediated by cytotoxic T\xa0lymphocytes (CTLs), we performed genome-wide CRISPR screens across a panel of genetically diverse mouse cancer cell lines that were cultured in the presence of CTLs. We identify a core set of 182\xa0genes across these mouse cancer models, the individual perturbation of which increases either the sensitivity or the resistance of cancer cells to CTL-mediated toxicity. Systematic exploration of our dataset using genetic co-similarity reveals the hierarchical and coordinated manner in which genes and pathways act in cancer cells to orchestrate their evasion of CTLs, and shows that discrete functional modules that control the interferon response and tumour necrosis factor (TNF)-induced 

In [17]:


from selenium import webdriver
from selenium.webdriver.chrome.service import Service  # Import Service
import time
import os
import random

# --------------------- Section 1: Sci-Hub article downloader ---------------------
# Function to automate downloading of articles from Sci-Hub using DOIs
def scihub_get(
    doi,
    download_dir="/home/youssef/clone/spock/scholarly_creative_work/pdfs",
    timeout=120,
):
    """Download the article for ``doi`` from a Sci-Hub mirror using Chrome.

    Opens a randomly chosen Sci-Hub mirror in a Selenium-driven Chrome
    session, clicks the page's download button and waits until the number of
    ``.pdf`` files in ``download_dir`` changes.

    Args:
        doi: DOI of the article; it is appended to the mirror URL.
        download_dir: Directory Chrome is told to save downloads to, and the
            directory that is watched for the new PDF.
        timeout: Maximum number of seconds to wait for the PDF to appear
            after the download button was clicked.

    Returns:
        True if a new PDF showed up before the timeout, False if the button
        could not be clicked, the directory could not be read, or the wait
        timed out.
    """
    def count_pdfs():
        return sum(1 for name in os.listdir(download_dir) if name.endswith('.pdf'))

    chromeOptions = webdriver.ChromeOptions()  # Set Chrome options for Selenium
    chromeOptions.add_experimental_option(
        "prefs", {"download.default_directory": download_dir}
    )

    # Specify the path to the Chrome driver
    chrome_driver_path = "/usr/bin/chromedriver"  # Update this with your actual path
    service = Service(chrome_driver_path)  # Create a Service object for ChromeDriver

    # Initialize Chrome WebDriver using Service
    wd = webdriver.Chrome(service=service, options=chromeOptions)

    # List of Sci-Hub domains to randomly select from
    scihub = ['https://sci-hub.ru/', 'https://sci-hub.st/', 'https://sci-hub.se/']
    root = random.choice(scihub)

    flag = False
    try:
        # Search for the article using its DOI
        wd.get(root + doi)
        time.sleep(1)  # Pause to let the page load

        try:
            # Take the baseline BEFORE clicking so a fast download cannot
            # slip in between the click and the first count.
            before = count_pdfs()
            wd.find_element('xpath', '//*[@id="buttons"]/button').click()

            # Poll with a short sleep and a deadline; an unbounded busy loop
            # would pin a CPU core and hang forever on a failed download.
            deadline = time.monotonic() + timeout
            while count_pdfs() == before and time.monotonic() < deadline:
                time.sleep(0.5)

            flag = count_pdfs() != before
            if not flag:
                print('download timed out.    doi = ' + doi)
        except Exception as e:
            # Print error message if the article access fails
            print('access failed.    doi = ' + doi)
            print(e)
            time.sleep(5)  # Shorter wait before next attempt
    finally:
        # Always close the browser, even if loading the page raised.
        wd.quit()

    return flag  # Return whether the download was successful

# --------------------- Section 2: Rename downloaded files (using DOI only) ---------------------
# Function to rename downloaded files based only on DOI
def rename_file(doi, path='/home/youssef/clone/spock/scholarly_creative_work/pdfs/', wait=1):
    """Rename the freshly downloaded PDF in ``path`` after its DOI.

    Files whose name starts with ``No_`` are treated as already renamed and
    are never touched.  Among the remaining files, the most recently
    modified ``.pdf`` is taken to be the new download and is renamed to
    ``No_<doi>.pdf`` (with ``/`` in the DOI replaced by ``_``).  The ``No_``
    prefix is what marks a file as renamed, and it is the name the retry and
    reporting code in this notebook looks for.

    Args:
        doi: DOI of the article that was just downloaded.
        path: Download directory to scan.
        wait: Seconds to sleep before scanning the directory.

    Returns:
        None.  A message is printed when no candidate file exists
        ("download failed") or when candidates exist but none is a finished
        PDF ("download incomplete").
    """
    time.sleep(wait)  # Wait to ensure the file is downloaded

    # Everything without the marker prefix is a candidate for renaming.
    pending = [name for name in os.listdir(path) if not name.startswith('No_')]
    if not pending:
        print('download failed.    doi = ' + doi)
        return

    # Partial downloads and unrelated files must not block the rename.
    pdfs = [name for name in pending if name.endswith('.pdf')]
    if not pdfs:
        print('download incomplete.    doi = ' + doi)
        return

    # os.listdir order is arbitrary; the newest PDF is the one just fetched.
    newest = max(pdfs, key=lambda name: os.path.getmtime(os.path.join(path, name)))

    doi_cleaned = doi.replace("/", "_")  # Replace "/" in DOI with "_" to avoid issues in file name
    os.replace(
        os.path.join(path, newest),
        os.path.join(path, 'No_' + doi_cleaned + '.pdf'),
    )

# --------------------- Section 3: Article download manager ---------------------
# Function to handle the entire process for downloading and renaming articles
def article_get(doi):
    """Download the article for ``doi`` and, on success, rename the file.

    Runs ``scihub_get`` first; ``rename_file`` is only called when the
    download step reports success.
    """
    downloaded = scihub_get(doi)
    if not downloaded:
        return
    rename_file(doi)

# --------------------- Section 4: Download articles based on dictionary ---------------------
# Assuming 'dico' is your dictionary where DOIs are the keys
for doi in dico:
    article_get(doi)


def _pdf_names(doi):
    """Return the file names under which the PDF for ``doi`` may be stored.

    A PDF counts as present under either ``No_<doi>.pdf`` or ``<doi>.pdf``
    ("/" replaced by "_"), so the check does not depend on which of the two
    naming schemes produced the file.
    """
    doi_cleaned = doi.replace("/", "_")  # Replace "/" in DOI for file name
    return ('No_' + doi_cleaned + '.pdf', doi_cleaned + '.pdf')


# --------------------- Section 5: Retry for missing articles ---------------------
# Check for articles that were not downloaded and try again
path = '/home/youssef/clone/spock/scholarly_creative_work/pdfs/'  # Your preferred download directory
dir_list = os.listdir(path)
present = set(dir_list)

for doi in dico:
    if present.isdisjoint(_pdf_names(doi)):
        article_get(doi)

# --------------------- Section 6: Output missing articles ---------------------
# Output information about articles that are still missing after retry.
# The directory is listed again here: the listing taken before the retry
# pass does not contain the files that the retry just downloaded.
dir_list = os.listdir(path)
present = set(dir_list)
count = 0

for doi in dico:
    names = _pdf_names(doi)
    if present.isdisjoint(names):
        # Print details of missing articles
        print(names[0] + "    doi: " + doi)
        count += 1

# Print the total number of missing articles
print(str(count) + " articles missing in total.")


KeyboardInterrupt: 

## LLM Analysis

In [None]:
def get_topic(abstract:str):
    """Ask the LLM which of a fixed list of topics an abstract belongs to.

    The abstract is sent to the ``llama3`` model served by Ollama
    (temperature 0.2) together with the candidate topics and an instruction
    to answer with a JSON object of the form
    ``{"topic": {<topic>: [<keyword>, ...], ...}}``.

    Args:
        abstract: Text to classify.

    Returns:
        The value stored under ``"topic"`` in the parsed answer; if that key
        is absent, the first value of the parsed object.  An empty dict is
        returned when the parsed answer is empty or is not a JSON object.

    Raises:
        ValueError: If ``abstract`` is None.
    """
    # Validate before building the client so bad input fails immediately.
    if abstract is None:
        raise ValueError("Abstract is required")

    llm = Ollama(model='llama3', temperature=0.2)
    parser = JsonOutputParser()

    #AI / accelerated materials discovery / SDLs / autonomous labs / high-throughput experimentation / high-throughput DFT

    topics = ["Machine Learning", "Batteries", "AI", "accelerated materials discovery", "Self Driving Labs", "autonomous labs", "high-throughput experimentation", "high-throughput DFT"]

    new_text = """
    
    The output needs to be formated as the following: 
    
    {
    "topic": {
    "topic1": ["Keyword1", "Keyword2", "Keyword3"],
    "topic2": ["Keyword1", "Keyword2", "Keyword3"]
    }
    }
    
    Only output the dictionary above, nothing else with it.
    """

    prompt = PromptTemplate(
    template=" So you are a text assistant and I need you to help me identify the topics from the following list the text given to you {topics}. \n Here's the text: {abstract}. \n\n Note: A single text can belong to multiple topics, so please list all relevant topics. {format_instructions}",
    input_variables=["format_instructions", "abstract", "topics"]
    )

    chain = prompt | llm | parser
    result = chain.invoke({"format_instructions": new_text, "abstract": abstract, "topics": topics})

    # The model is free to answer with any JSON value; guard against an
    # empty object or a non-object instead of failing on indexing.
    if not isinstance(result, dict) or not result:
        return {}
    if "topic" in result:
        return result["topic"]
    return next(iter(result.values()))


# Quick manual check of get_topic on a hand-written battery abstract.
_demo_abstract = "The development of high-performance batteries is crucial for the future of electric vehicles. The current generation of batteries are not able to provide the range and power required for long-distance travel. This project aims to develop new materials for batteries that can provide higher energy density and faster charging times."
_demo_topics = get_topic(_demo_abstract)
print(_demo_topics)

def get_info(abstract:str = None, **kwargs):
    """Ask the LLM a set of free-form questions about an abstract.

    Each keyword argument is one question: the keyword is the name under
    which the answer is stored and the value is the question text.  The
    questions are sent one at a time to the ``llama3`` model served by
    Ollama (temperature 0.5); the prompt tells the model to answer 'N/A'
    when the text holds no relevant information.

    Args:
        abstract: Text the questions refer to.
        **kwargs: Mapping of result key to question text.

    Returns:
        A dict mapping every keyword to the model's answer for its question
        (an empty dict when no questions are given).

    Raises:
        ValueError: If ``abstract`` is None.
    """
    # Validate before building the client so bad input fails immediately.
    if abstract is None:
        raise ValueError("Abstract is required")

    llm = Ollama(model='llama3', temperature=0.5)

    # The prompt and chain do not depend on the question, so build them once
    # rather than once per keyword.
    prompt = PromptTemplate(
        template="So you are a text assistant and I want you to assist me by providing the following information: {question}. \n\n Here's the text: {abstract}. \n\n If the text doesn't contain any information about the topic given, output: 'N/A'",
        input_variables=["abstract", "question"]
    )
    chain = prompt | llm

    dico = {}
    for key, question in kwargs.items():
        print(key, question)
        dico[key] = chain.invoke({"abstract": abstract, "question": question})
    print(dico)
    return dico
    
    


    

    


## Workflow orchestration

In [None]:
# Questions asked about every abstract; each key becomes a field of the record.
_questions = {
    "affiliation": "What affiliations do the authors or characters in the text have?",
    "new_materials": "Does the text mention any new materials or discoveries?",
    "screening_algorithms": "Are there any screening algorithms or systematic procedures discussed in the text?",
    "ai_algorithms": "Does the text reference any AI algorithms or methods related to artificial intelligence?",
    "workflow": "Can you describe the workflow or process followed in the text?",
    "methods": "Can you summarize the methods or approaches mentioned in the text?",
    "models": "What models or frameworks are discussed or used in the text?",
    "funding": "Does the text mention any funding sources or sponsors?",
}

# Enrich each record in place: first the topic classification, then the
# answers to the questions above.
for record in dico.values():
    text = record["abstract"]
    record["topic"] = get_topic(text)
    answers = get_info(text, **_questions)
    record.update(answers)

print(dico)

### Writing in json file

In [None]:


# json is not among the import cells visible in this notebook, so bring it
# into scope here; without it this cell fails with NameError.
import json

# Persist the enriched records (DOI -> abstract, topic and LLM answers).
with open("output.json", 'w', encoding="utf-8") as file:
    json.dump(dico, file)

