In [1]:
#importing the main libraries for setting up code to interact with LLM
import os

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp

In [2]:
# Defining a Promt Template to interact with LLM
template = """Question: {question}
Answer: Let’s work this out in a step by step way to be sure we have the right answer."""

In [3]:
# Offload / batching settings handed to the model constructor.
n_gpu_layers = 1  # Tune to your model and the VRAM available on your GPU.
n_batch = 4096  # Keep within 1..n_ctx; bounded in practice by GPU VRAM.

# Stream generated tokens to stdout one at a time.
callback_manager = CallbackManager(handlers=[StreamingStdOutCallbackHandler()])

In [4]:
# Locate the GGUF weights. Make sure the path is correct for your system: the
# directory defaults to the original location and can be overridden with the
# LLM_MODEL_DIR environment variable, so the notebook can run on another
# machine without editing this cell.
model_name = 'llama-2-7b-chat.Q5_K_M.gguf'
model_dir = os.environ.get("LLM_MODEL_DIR", "/home/anderson/Documents/LLM_Model")
model_path = os.path.join(model_dir, model_name)  # plain str, same default value

# Instantiate the llama.cpp-backed LLM wrapper.
llm = LlamaCpp(
    # Weights and GPU offload.
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    # Context window and batch size.
    n_ctx=4096,
    n_batch=n_batch,
    # Sampling settings: temperature 0.0, top_p 1, up to 2000 generated tokens.
    temperature=0.0,
    top_p=1,
    max_tokens=2000,
    # Streaming: verbose has to be on for the callback manager to receive tokens.
    callback_manager=callback_manager,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/anderson/Documents/LLM_Model/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.atten

In [5]:
# The question put to the model, first without and later with retrieval.
# The wording is kept exactly as originally run so the recorded answers
# below still correspond to this prompt.
question = (
    'Identify the most importants point in the paper '
    '"NILM applications: Literature review of learning approaches, '
    'recent developments and challenges" by Angelis, published in 2022. '
    "If you don't know the answer, just say that you don't know. "
    "Keep the answer concise."
)

In [6]:
# Baseline run: send the raw question string straight to the model, without
# retrieval and without applying the `template` defined earlier.
# The answer text shows up through the stdout streaming callback attached to
# `llm`; the trailing semicolon keeps the notebook from also echoing the
# returned string.
print("<====================================== Outcome from model =======================================>")
llm.invoke(question);


The most important point in the paper "NILM applications: Literature review of learning approaches, recent developments and challenges" by Angelis, published in 2022 is:
* The variety of machine learning techniques used for NILM, including deep learning methods such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs), and traditional methods such as support vector machines (SVMs) and random forests.


llama_print_timings:        load time =     648.28 ms
llama_print_timings:      sample time =      10.90 ms /    99 runs   (    0.11 ms per token,  9085.07 tokens per second)
llama_print_timings: prompt eval time =     648.22 ms /    67 tokens (    9.67 ms per token,   103.36 tokens per second)
llama_print_timings:        eval time =   11589.45 ms /    98 runs   (  118.26 ms per token,     8.46 tokens per second)
llama_print_timings:       total time =   12337.74 ms /   165 tokens


In [7]:
# Rebuild the token-streaming callback manager (tokens are echoed to stdout).
# NOTE(review): `llm` was already constructed with the earlier manager and no
# visible later cell passes this new object anywhere — possibly redundant.
callback_manager = CallbackManager(handlers=[StreamingStdOutCallbackHandler()])

In [8]:
# Libraries for loading a PDF document and splitting its text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [9]:
# Load the paper to be indexed. The path defaults to the original location;
# set the NILM_PAPER_PDF environment variable to point at the file on another
# machine instead of editing this cell.
pdf_path = os.environ.get(
    "NILM_PAPER_PDF", "/home/anderson/Documents/Articles/angelis2022.pdf"
)
loader = PyPDFLoader(pdf_path)
data = loader.load()

In [10]:
# Cut the loaded document into smaller pieces for retrieval:
# chunk_size 2000 with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

In [11]:
#Performing Embedding
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma

In [12]:
# Embed every chunk and store the vectors in a Chroma vector store.
# `gpt4all_kwargs` is forwarded to the embedder so the setting takes effect
# (a dict that is only defined and never passed does nothing).
# `allow_download` is a real boolean: a string such as 'True' or 'False' is
# always truthy, so it cannot express "disabled".
gpt4all_kwargs = {'allow_download': True}
embeddings = GPT4AllEmbeddings(
    model_name="all-MiniLM-L6-v2.gguf2.f16.gguf",
    gpt4all_kwargs=gpt4all_kwargs,
)

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

In [13]:
# Sanity-check retrieval: fetch the stored chunks most similar to the question
# and display how many came back.
docs = vectorstore.similarity_search(query=question)
len(docs)

4

In [14]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

In [15]:
# Chain
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Reads the ``page_content`` attribute of each item in ``docs`` and returns
    those texts joined by a blank line, in the order given. An empty input
    yields an empty string.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnablePick

In [16]:
# RAG prompt: the model is told to answer from the supplied context only.
# `{context}` and `{question}` are the two placeholders filled in by the chain.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n')

In [17]:
# Build the retrieval-augmented QA pipeline on top of the vector store.
retriever = vectorstore.as_retriever()
qa_chain = (
    # Input mapping for the prompt's two placeholders:
    #   "context"  – the retriever's results piped through format_docs, which
    #                joins each document's page_content with blank lines;
    #   "question" – the value given to invoke(), handed on via RunnablePassthrough.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # Fill the "context"/"question" prompt template.
    | prompt
    # Generate with the llama.cpp-backed model created earlier.
    | llm
    # Parse the model output with StrOutputParser.
    | StrOutputParser()
)

In [18]:
# Final run: ask the same question through the retrieval-augmented chain.
# As in the baseline run, the trailing semicolon keeps the notebook from
# echoing the returned value; the answer text is streamed to stdout by the
# callback attached to `llm`.
print("<====================================== Outcome from model with RAG =============================>")
qa_chain.invoke(question);



Llama.generate: prefix-match hit


Answer: The most important points in the paper are:
1. Deep neural networks (DNNs) are gaining more attention in NILM research.
2. CNN architectures are widely used in NILM literature, representing almost 50% of deep learning approaches.
3. Attention-based networks and transformers have been recently adopted in NILM.
4. The emerging concept of NILM has a dominant role as a service in future smart energy grids.
5. Data transmission and storage issues dictate the utilization of relatively low sampling rate on smart meter data.
6. Future research should address one of the biggest weaknesses of NILM algorithms, which is generalization capability across different datasets/buildings.


llama_print_timings:        load time =     648.28 ms
llama_print_timings:      sample time =      18.24 ms /   161 runs   (    0.11 ms per token,  8828.69 tokens per second)
llama_print_timings: prompt eval time =    3821.65 ms /  1960 tokens (    1.95 ms per token,   512.87 tokens per second)
llama_print_timings:        eval time =   26094.53 ms /   160 runs   (  163.09 ms per token,     6.13 tokens per second)
llama_print_timings:       total time =   30113.26 ms /  2120 tokens
