In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"]='0,1'

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.embeddings import HuggingFaceEmbeddings 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS


# Pick the compute device: "cuda" when PyTorch reports CUDA as available,
# otherwise fall back to "cpu".
# Need only 1 GPU if loading the 8-bit model.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Report how many GPUs PyTorch can see.
print(f"Using {torch.cuda.device_count():d} GPUs")

import gradio as gr
import time


cuda
Using 4 GPUs


In [2]:
# Identifier of the checkpoint; the same name is used later to load the model weights.
model_name = "eachadea/vicuna-13b-1.1"
# Local directory where a copy of the tokenizer is kept between runs.
tokenizer_path = "./tokenizer/"

# Reuse the local tokenizer copy when it exists; otherwise load it by model
# name once and save it locally for subsequent runs.
if os.path.isdir(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else:
    # Pass the variable, not the literal string "model_name": the literal is
    # not the checkpoint identifier, so the first-run load would fail.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # exist_ok avoids a crash if the directory appears after the isdir check.
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer.save_pretrained(tokenizer_path)

In [3]:
# Load the causal language model named by `model_name`, passing
# device_map="auto" to the loader.
# (8-bit loading is currently disabled; it would be: load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

# Generation settings forwarded to the text-generation pipeline.
# NOTE(review): temperature/top_p normally only matter when sampling is
# enabled; do_sample is not set here — confirm the effective generation config.
generation_settings = {
    "max_length": 2048,
    "temperature": 0.6,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
}

# Bundle model + tokenizer into a single text-generation callable.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
import os
from tqdm.notebook import tqdm

# Folder containing the text files to load.
folder_path = "APS-Science-Highlight"

# All entries in the folder. os.listdir returns them in arbitrary order, so
# the order of the concatenated text can differ between machines.
file_list = os.listdir(folder_path)

# Collect each file's text in a list and join once at the end, instead of
# growing one string with += inside the loop.
file_texts = []
for filename in tqdm(file_list):
    file_path = os.path.join(folder_path, filename)
    # Skip sub-folders and anything else that is not a regular file.
    if os.path.isfile(file_path):
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            file_texts.append(f.read())

# Every file's content is preceded by a blank-line separator ("\n\n"),
# exactly as the earlier `book += "\n\n" + f.read()` accumulation produced.
book = "".join("\n\n" + text for text in file_texts)

# print the contents of all files in the folder
#print(book)

  0%|          | 0/991 [00:00<?, ?it/s]

In [10]:
# Load the embedding model used to embed the source text.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Split the whole corpus into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)
texts = text_splitter.split_text(book)

# Tag every chunk with its position in `texts` (as a string) under "source",
# then build the FAISS index from the chunks and their metadata.
chunk_metadata = [{"source": str(position)} for position, _ in enumerate(texts)]
docsearch = FAISS.from_texts(texts, embeddings, metadatas=chunk_metadata)

In [11]:
# Print one chunk together with the chunk before and the chunk after it.
loc1 = 5
for chunk_index in (loc1 - 1, loc1, loc1 + 1):
    print(texts[chunk_index])

While some coronaviruses cause mild respiratory illness, highly pathogenic betacoronaviruses like those that cause SARS, MERS, and COVID-19 require effective therapeutics to reduce the severe, sometimes fatal, diseases they cause. To develop therapies for these diseases, it is critical to understand the structure and function of the viruses’ spike proteins. Spike (S) glycoproteins, which decorate the surface of coronaviruses, latch onto host-cell receptor proteins and undergo a dramatic structural change (from a “prefusion” to “postfusion” configuration) enabling the virus to forcefully fuse with the cell membrane. When designing therapies for coronaviruses, it’s more ideal to use antibodies that recognize prefusion spike proteins, since these antibodies create a stronger immune response. Other factors to consider are stability, mode of delivery, and production yield. For these reasons, the researchers opted to build on previous research which used camelid antibodies to study SARS and 

In [12]:
# Question to ask, and how many hits to request from the vector store.
query = "Who is Jonathan Lang?"
N_hits = 3

# Similarity search for the query; each hit is later read as hit[0].page_content.
docs = docsearch.similarity_search_with_score(
    query,
    k=N_hits,
)
#for doc in docs:
#    print(doc)

In [13]:
# Build the context string: the text of each retrieved hit followed by a newline.
# Iterate over the hits actually returned instead of range(N_hits): if the
# search yields fewer than N_hits results, indexing docs[i] raises IndexError.
context = "".join(hit[0].page_content + "\n" for hit in docs)
print(context)

Author affiliations: 1McGill University, 2Sorbonne Université

Correspondence: *martin.schmeing@mcgill.ca
Correspondence: *awsch@uchicago.edu
Published Date

11.17.2020

Studying Our Galaxy’s “Water Worlds”

The original Arizona State University news release by Karin Valentine can be read here.



In [70]:

# Wrap the Hugging Face pipeline so it can be used as the chain's LLM.
local_llm = HuggingFacePipeline(pipeline=pipe)

# Prompt layout: instructions, then the supplied context, then the running
# conversation history, then the new human input.
template = """The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Context:
{context}

Current conversation:
{history}
Human: {input}
AI:"""

# Windowed conversation memory (k=6) that fills the {history} slot.
# input_key names which chain input is the human turn, since the chain also
# receives a separate "context" input.
memory = ConversationBufferWindowMemory(
    k=6,
    memory_key="history",
    input_key="input",
)

# The three placeholders used by the template above.
PROMPT = PromptTemplate(
    template=template,
    input_variables=["history", "input", "context"],
)

# Chain tying together the LLM, the prompt, and the memory; verbose=True is
# enabled so the formatted prompt is shown when the chain runs.
conversation = LLMChain(
    llm=local_llm,
    prompt=PROMPT,
    memory=memory,
    verbose=True,
)

#print(conversation.prompt.template)

In [71]:
# Run one conversational turn with the question and the retrieved context;
# as the cell's last expression, the returned reply is displayed.
conversation.predict(
    input=query,
    context=context,
)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Context:
Argonne National Laboratory seeks solutions to pressing national problems in science and technology. The nation's first national laboratory, Argonne conducts leading-edge basic and applied scientific research in virtually every scientific discipline. Argonne researchers work closely with researchers from hundreds of companies, universities, and federal, state and municipal agencies to help them solve their specific problems, advance America's scientific leadership and prepare the nation for a better future. With employees from more than 60 nations, Argonne is managed by UChicago Argonne, LLC for the U.S. Department of Energy's Office of Science.

The U.S. Department of 

' Jonathan Lang is the director of the X-ray Science Division at the Advanced Photon Source (APS) at Argonne National Laboratory. He oversees operations of several beamlines at the APS, including the Sector 7-ID beamline, which is used for ultrafast time-resolved studies of materials. Dr. Lang received his PhD in physics from Cornell University and conducted postdoctoral research at the Stanford Synchrotron Radiation Lightsource before joining Argonne in 2003.'