In [1]:
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# from utils.FastChat.fastchat.conversation import conv_templates, SeparatorStyle
from fastchat.conversation import conv_templates, SeparatorStyle

In [2]:
import sys

# Make the vendored GPTQ-for-LLaMa checkout importable and give it priority over
# any same-named installed package (presumably needed by the GPTQ loader used
# further down — TODO confirm). The path is placed exactly once at the front, so
# re-running this cell does not keep growing sys.path with duplicates.
GPTQ_REPO_PATH = "utils/FastChat/repositories/GPTQ-for-LLaMa"
sys.path[:] = [entry for entry in sys.path if entry != GPTQ_REPO_PATH]
sys.path.insert(0, GPTQ_REPO_PATH)

In [3]:
import os

# Restrict this process to the GPU with index 1. This has to be set before
# anything initialises CUDA, otherwise the setting is not picked up.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [4]:


class LlmPipeline():
    """Small chat wrapper around a causal language model and its tokenizer.

    The constructor loads the tokenizer and the model (GPTQ-quantized when
    ``wbits > 0``, a regular ``AutoModelForCausalLM`` otherwise); ``infer``
    runs one conversation turn and returns the reply together with the
    updated conversation object.
    """

    def __init__(self, device='cuda', model_name='anon8231489123/vicuna-13b-GPTQ-4bit-128g',
                 num_gpus='1', wbits=4, temperature=0.7, max_new_tokens=512, conv_template='v1'):
        """Load tokenizer and model and remember the generation settings.

        Args:
            device: ``"cuda"`` or ``"cpu"``.
            model_name: Model identifier passed to the tokenizer/model loaders.
            num_gpus: ``"auto"`` or an int-like value; only used on CUDA.
            wbits: When > 0 the model is loaded through ``load_quantized``.
            temperature: Sampling temperature forwarded to ``generate_stream``.
            max_new_tokens: Generation budget forwarded to ``generate_stream``.
            conv_template: Key into ``conv_templates`` used when ``infer`` is
                called without a conversation.

        Raises:
            ValueError: If ``device`` is neither ``"cuda"`` nor ``"cpu"``.
        """
        self.device = device
        self.model_name = model_name
        self.num_gpus = num_gpus
        self.wbits = wbits
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens
        self.conv_template = conv_template

        # Keyword arguments for the non-quantized loading path.
        if device == "cuda":
            kwargs = {"torch_dtype": torch.float16}
            if num_gpus == "auto":
                kwargs["device_map"] = "auto"
            else:
                num_gpus = int(num_gpus)
                if num_gpus != 1:
                    kwargs.update({
                        "device_map": "auto",
                        "max_memory": {i: "13GiB" for i in range(num_gpus)},
                    })
        elif device == "cpu":
            kwargs = {}
        else:
            raise ValueError(f"Invalid device: {device}")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if wbits > 0:
            # Imported lazily so the GPTQ loader is only required when used.
            from fastchat.serve.load_gptq_model import load_quantized

            print("Loading GPTQ quantized model...")
            model = load_quantized(model_name)
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name,
                low_cpu_mem_usage=True, **kwargs)

        # Single-GPU case: move the whole model to the GPU explicitly.
        if device == "cuda" and num_gpus == 1:
            model.cuda()

        self.model = model

    @torch.inference_mode()
    def generate_stream(self, tokenizer, model, params, device,
                        context_len=2048, stream_interval=2):
        """Adapted from fastchat/serve/model_worker.py::generate_stream

        Generates tokens one at a time and yields the decoded text (prompt
        plus everything generated so far) every ``stream_interval`` steps, on
        the last step, and when generation stops.

        Args:
            tokenizer: Callable returning an object with ``input_ids``; must
                also provide ``decode`` and ``eos_token_id``.
            model: Callable causal LM returning ``logits`` and
                ``past_key_values``.
            params: Dict with ``"prompt"`` (required) and optional
                ``"temperature"`` (default 1.0), ``"max_new_tokens"``
                (default 1024) and ``"stop"`` (stop string; ``None`` or ``""``
                disables stop-string detection).
            device: Device on which the input tensors are created.
            context_len: Context window; the prompt is truncated from the
                left to ``context_len - max_new_tokens - 8`` tokens.
            stream_interval: Yield every this many generated tokens.

        Yields:
            str: Decoded text so far, cut at the stop string if it appeared
            after the prompt.

        Raises:
            ValueError: If ``max_new_tokens`` leaves no room for the prompt.
        """
        prompt = params["prompt"]
        l_prompt = len(prompt)
        temperature = float(params.get("temperature", 1.0))
        # Honour the caller's budget; 1024 is kept as the fallback because it
        # is the value this method previously always used.
        max_new_tokens = int(params.get("max_new_tokens", 1024))
        stop_str = params.get("stop", None)

        max_src_len = context_len - max_new_tokens - 8
        if max_src_len <= 0:
            # A non-positive budget would make the slice below silently keep
            # the whole prompt (-0) or drop its beginning (negative values).
            raise ValueError(
                f"max_new_tokens={max_new_tokens} leaves no room for the prompt "
                f"within context_len={context_len}")

        input_ids = tokenizer(prompt).input_ids
        output_ids = list(input_ids)
        input_ids = input_ids[-max_src_len:]

        # Pre-bound so the cleanup below is valid even if the loop never runs.
        past_key_values = None
        token = None

        for i in range(max_new_tokens):
            if i == 0:
                # First step: feed the (truncated) prompt.
                out = model(
                    torch.as_tensor([input_ids], device=device), use_cache=True)
            else:
                # Later steps: feed only the last token plus the cache.
                attention_mask = torch.ones(
                    1, past_key_values[0][0].shape[-2] + 1, device=device)
                out = model(input_ids=torch.as_tensor([[token]], device=device),
                            use_cache=True,
                            attention_mask=attention_mask,
                            past_key_values=past_key_values)
            logits = out.logits
            past_key_values = out.past_key_values

            last_token_logits = logits[0][-1]
            if temperature < 1e-4:
                # Effectively zero temperature: greedy decoding.
                token = int(torch.argmax(last_token_logits))
            else:
                probs = torch.softmax(last_token_logits / temperature, dim=-1)
                token = int(torch.multinomial(probs, num_samples=1))

            output_ids.append(token)
            stopped = token == tokenizer.eos_token_id

            if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
                output = tokenizer.decode(output_ids, skip_special_tokens=True)
                if stop_str:
                    # Only look for the stop string after the prompt text.
                    pos = output.rfind(stop_str, l_prompt)
                    if pos != -1:
                        output = output[:pos]
                        stopped = True
                yield output

            if stopped:
                break

        del past_key_values

    def infer(self, inp, conv=None):
        """Run one chat turn and return ``(reply, conv)``.

        Args:
            inp: The user message for this turn.
            conv: Conversation to continue; it is modified in place. When
                ``None`` a copy of ``conv_templates[self.conv_template]`` is
                used.

        Returns:
            tuple: The assistant reply as a string and the conversation object
            with the reply stored in its last message.
        """
        if conv is None:
            conv = conv_templates[self.conv_template].copy()

        conv.append_message(conv.roles[0], inp)   # the user's input
        conv.append_message(conv.roles[1], None)  # assistant slot, filled in below
        prompt = conv.get_prompt()
        params = {
            "model": self.model_name,
            "prompt": prompt,
            "temperature": self.temperature,
            "max_new_tokens": self.max_new_tokens,
            "stop": conv.sep if conv.sep_style == SeparatorStyle.SINGLE else conv.sep2,
        }

        # Each yielded string supersedes the previous one, so only the last
        # matters. Starting from "" keeps this well-defined if nothing is yielded.
        reply = ""
        for streamed in self.generate_stream(self.tokenizer, self.model, params, self.device):
            # Drop the echoed prompt; the extra character presumably skips a
            # separator that follows the prompt in the decoded text — TODO confirm.
            reply = streamed[len(prompt) + 1:].strip()

        conv.messages[-1][-1] = reply
        return reply, conv

In [5]:
llm = LlmPipeline()

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


Loading GPTQ quantized model...
Loading model ...
Done.


In [6]:
res, conv = llm.infer("What is bert")
res

2023-05-11 13:51:58.201235: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 13:51:58.737767: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-05-11 13:51:58.737814: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


'BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained language model developed by Google that is designed to understand natural language processing (NLP) tasks. BERT is based on the transformer architecture, which was introduced in a paper by Vaswani et al. in 2017, and it uses a bidirectional approach to process text, which means it processes text in both forward and backward directions.\nBERT is trained on a large corpus of text and is capable of understanding the context and relationships between words in a sentence, which makes it well-suited for a wide range of NLP tasks such as sentiment analysis, question answering, and text classification.\nThe BERT model has been made available as an open-source library, which has enabled a lot of research and development in the field of NLP, and it has been used in many applications such as chatbots, virtual assistants, and language translation.'

In [7]:
res, conv = llm.infer('How it work', conv)
res

'BERT works by using a multi-layer transformer architecture to process text in both forward and backward directions. The transformer architecture is a type of neural network that uses self-attention mechanisms to process sequences of data, such as words in a sentence.\nWhen processing a sentence, BERT first encodes the words in the sentence into a set of numerical features using a series of feedforward layers. These features are then passed through a multi-head self-attention layer, which allows the model to understand the relationships between words in the sentence. This is followed by a layer of bias and activation functions, which allow the model to make predictions and perform tasks such as classification and sentiment analysis.\nThe bidirectional processing in BERT allows it to understand the context of words in a sentence, which is important for tasks such as sentiment analysis and question answering. The model can also be fine-tuned for specific tasks by adjusting the layers and

In [8]:
conv

Conversation(system="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.", roles=('Human', 'Assistant'), messages=[['Human', 'What are the key differences between renewable and non-renewable energy sources?'], ['Assistant', 'Renewable energy sources are those that can be replenished naturally in a relatively short amount of time, such as solar, wind, hydro, geothermal, and biomass. Non-renewable energy sources, on the other hand, are finite and will eventually be depleted, such as coal, oil, and natural gas. Here are some key differences between renewable and non-renewable energy sources:\n1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable energy sources are finite and will eventually run out.\n2. Environmental impact: Renewable energy sources have a much lower environmental impact than non-renewable sources, which can lead to air and water po

In [9]:
# Sample corpus for the summarisation demo below: five English passages about
# COVID-19 (they read like paper abstracts; their provenance is not recorded in
# this notebook). The embedded "\n" line breaks are flattened to spaces before
# the text is placed into the prompt.
documents = [
    "  COVID-19 is an acute respiratory disease that has been classified as a\npandemic by the World Health Organization. Characterization of this disease is\nstill in its early stages. However, it is known to have high mortality rates,\nparticularly among individuals with preexisting medical conditions. Creating\nmodels to identify individuals who are at the greatest risk for severe\ncomplications due to COVID-19 will be useful for outreach campaigns to help\nmitigate the disease's worst effects. While information specific to COVID-19 is\nlimited, a model using complications due to other upper respiratory infections\ncan be used as a proxy to help identify those individuals who are at the\ngreatest risk. We present the results for three models predicting such\ncomplications, with each model increasing predictive effectiveness at the\nexpense of ease of implementation.\n",
    "  The ongoing COVID-19 pandemic has had far-reaching effects throughout\nsociety, and science is no exception. The scale, speed, and breadth of the\nscientific community's COVID-19 response has lead to the emergence of new\nresearch literature on a remarkable scale -- as of October 2020, over 81,000\nCOVID-19 related scientific papers have been released, at a rate of over 250\nper day. This has created a challenge to traditional methods of engagement with\nthe research literature; the volume of new research is far beyond the ability\nof any human to read, and the urgency of response has lead to an increasingly\nprominent role for pre-print servers and a diffusion of relevant research\nacross sources. These factors have created a need for new tools to change the\nway scientific literature is disseminated. COVIDScholar is a knowledge portal\ndesigned with the unique needs of the COVID-19 research community in mind,\nutilizing NLP to aid researchers in synthesizing the information spread across\nthousands of emergent research articles, patents, and clinical trials into\nactionable insights and new knowledge. The search interface for this corpus,\nhttps://covidscholar.org, now serves over 2000 unique users weekly. We present\nalso an analysis of trends in COVID-19 research over the course of 2020.\n",
    "  The COVID-19 epidemic is considered as the global health crisis of the whole\nsociety and the greatest challenge mankind faced since World War Two.\nUnfortunately, the fake news about COVID-19 is spreading as fast as the virus\nitself. The incorrect health measurements, anxiety, and hate speeches will have\nbad consequences on people's physical health, as well as their mental health in\nthe whole world. To help better combat the COVID-19 fake news, we propose a new\nfake news detection dataset MM-COVID(Multilingual and Multidimensional COVID-19\nFake News Data Repository). This dataset provides the multilingual fake news\nand the relevant social context. We collect 3981 pieces of fake news content\nand 7192 trustworthy information from English, Spanish, Portuguese, Hindi,\nFrench and Italian, 6 different languages. We present a detailed and\nexploratory analysis of MM-COVID from different perspectives and demonstrate\nthe utility of MM-COVID in several potential applications of COVID-19 fake news\nstudy on multilingual and social media.\n",
    "  We present COVID-SEE, a system for medical literature discovery based on the\nconcept of information exploration, which builds on several distinct text\nanalysis and natural language processing methods to structure and organise\ninformation in publications, and augments search by providing a visual overview\nsupporting exploration of a collection to identify key articles of interest. We\ndeveloped this system over COVID-19 literature to help medical professionals\nand researchers explore the literature evidence, and improve findability of\nrelevant information. COVID-SEE is available at http://covid-see.com.\n",
    "  Coronavirus disease (COVID-19) is an infectious disease, which is caused by\nthe SARS-CoV-2 virus. Due to the growing literature on COVID-19, it is hard to\nget precise, up-to-date information about the virus. Practitioners, front-line\nworkers, and researchers require expert-specific methods to stay current on\nscientific knowledge and research findings. However, there are a lot of\nresearch papers being written on the subject, which makes it hard to keep up\nwith the most recent research. This problem motivates us to propose the design\nof the COVID-19 Search Engine (CO-SE), which is an algorithmic system that\nfinds relevant documents for each query (asked by a user) and answers complex\nquestions by searching a large corpus of publications. The CO-SE has a\nretriever component trained on the TF-IDF vectorizer that retrieves the\nrelevant documents from the system. It also consists of a reader component that\nconsists of a Transformer-based model, which is used to read the paragraphs and\nfind the answers related to the query from the retrieved documents. The\nproposed model has outperformed previous models, obtaining an exact match ratio\nscore of 71.45% and a semantic answer similarity score of 78.55%. It also\noutperforms other benchmark datasets, demonstrating the generalizability of the\nproposed approach.\n"
]
# Question the model is asked to answer using `documents` as context.
query = "What is covid"

In [10]:
%%time
context = " ".join(documents).replace("\n", " ")
prompt = f'Give me the overview of "{query}" from given context in one paragraph.\nContext: {context}'
ans, _ = llm.infer(prompt)

CPU times: user 29.8 s, sys: 401 ms, total: 30.2 s
Wall time: 30.2 s


In [11]:
ans

'COVID-19 has been declared a global health emergency by the World Health Organization and has affected many aspects of society, including the scientific community. The scale and speed of the pandemic have led to a rapid increase in COVID-19 related scientific literature, with over 81,000 publications released as of October 2020. This has created a need for new tools to help researchers stay up to date with the latest findings and make sense of the vast amount of information available.\n\nOne solution to this problem is the COVID-19 Search Engine (CO-SE), a system that uses natural language processing and machine learning to search a large corpus of publications and provide relevant results to users. The CO-SE consists of a retriever component trained on a TF-IDF vectorizer and a reader component that uses a Transformer-based model to find answers to user queries. The system has been tested on a dataset of 1000 queries and has achieved a high accuracy, outperforming previous models and

In [None]:
documents = [
    "  COVID-19 is an acute respiratory disease that has been classified as a\npandemic by the World Health Organization. Characterization of this disease is\nstill in its early stages. However, it is known to have high mortality rates,\nparticularly among individuals with preexisting medical conditions. Creating\nmodels to identify individuals who are at the greatest risk for severe\ncomplications due to COVID-19 will be useful for outreach campaigns to help\nmitigate the disease's worst effects. While information specific to COVID-19 is\nlimited, a model using complications due to other upper respiratory infections\ncan be used as a proxy to help identify those individuals who are at the\ngreatest risk. We present the results for three models predicting such\ncomplications, with each model increasing predictive effectiveness at the\nexpense of ease of implementation.\n",
    "  The ongoing COVID-19 pandemic has had far-reaching effects throughout\nsociety, and science is no exception. The scale, speed, and breadth of the\nscientific community's COVID-19 response has lead to the emergence of new\nresearch literature on a remarkable scale -- as of October 2020, over 81,000\nCOVID-19 related scientific papers have been released, at a rate of over 250\nper day. This has created a challenge to traditional methods of engagement with\nthe research literature; the volume of new research is far beyond the ability\nof any human to read, and the urgency of response has lead to an increasingly\nprominent role for pre-print servers and a diffusion of relevant research\nacross sources. These factors have created a need for new tools to change the\nway scientific literature is disseminated. COVIDScholar is a knowledge portal\ndesigned with the unique needs of the COVID-19 research community in mind,\nutilizing NLP to aid researchers in synthesizing the information spread across\nthousands of emergent research articles, patents, and clinical trials into\nactionable insights and new knowledge. The search interface for this corpus,\nhttps://covidscholar.org, now serves over 2000 unique users weekly. We present\nalso an analysis of trends in COVID-19 research over the course of 2020.\n",
    "  The COVID-19 epidemic is considered as the global health crisis of the whole\nsociety and the greatest challenge mankind faced since World War Two.\nUnfortunately, the fake news about COVID-19 is spreading as fast as the virus\nitself. The incorrect health measurements, anxiety, and hate speeches will have\nbad consequences on people's physical health, as well as their mental health in\nthe whole world. To help better combat the COVID-19 fake news, we propose a new\nfake news detection dataset MM-COVID(Multilingual and Multidimensional COVID-19\nFake News Data Repository). This dataset provides the multilingual fake news\nand the relevant social context. We collect 3981 pieces of fake news content\nand 7192 trustworthy information from English, Spanish, Portuguese, Hindi,\nFrench and Italian, 6 different languages. We present a detailed and\nexploratory analysis of MM-COVID from different perspectives and demonstrate\nthe utility of MM-COVID in several potential applications of COVID-19 fake news\nstudy on multilingual and social media.\n",
    "  We present COVID-SEE, a system for medical literature discovery based on the\nconcept of information exploration, which builds on several distinct text\nanalysis and natural language processing methods to structure and organise\ninformation in publications, and augments search by providing a visual overview\nsupporting exploration of a collection to identify key articles of interest. We\ndeveloped this system over COVID-19 literature to help medical professionals\nand researchers explore the literature evidence, and improve findability of\nrelevant information. COVID-SEE is available at http://covid-see.com.\n",
    "  Coronavirus disease (COVID-19) is an infectious disease, which is caused by\nthe SARS-CoV-2 virus. Due to the growing literature on COVID-19, it is hard to\nget precise, up-to-date information about the virus. Practitioners, front-line\nworkers, and researchers require expert-specific methods to stay current on\nscientific knowledge and research findings. However, there are a lot of\nresearch papers being written on the subject, which makes it hard to keep up\nwith the most recent research. This problem motivates us to propose the design\nof the COVID-19 Search Engine (CO-SE), which is an algorithmic system that\nfinds relevant documents for each query (asked by a user) and answers complex\nquestions by searching a large corpus of publications. The CO-SE has a\nretriever component trained on the TF-IDF vectorizer that retrieves the\nrelevant documents from the system. It also consists of a reader component that\nconsists of a Transformer-based model, which is used to read the paragraphs and\nfind the answers related to the query from the retrieved documents. The\nproposed model has outperformed previous models, obtaining an exact match ratio\nscore of 71.45% and a semantic answer similarity score of 78.55%. It also\noutperforms other benchmark datasets, demonstrating the generalizability of the\nproposed approach.\n"
]
query = "What is covid"

%%time
context = " ".join(documents).replace("\n", " ")
prompt = f'Give me the overview of "{query}" from given context in one paragraph.\nContext: {context}'
ans, _ = llm.infer(prompt)