# Install Packages

In [None]:
!pip install  \
  sentence-transformers \
  pinecone-client \
  datasets \
  einops \
  xformers 

## !pip install transformers
!pip install loralib langchain 
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.33.1 trl==0.4.7 accelerate==0.20.3


# Initialize

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
import os
import sys

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, GenerationConfig, LlamaForCausalLM, LlamaTokenizer
from sklearn.model_selection import train_test_split
import transformers
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import copy
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)


In [3]:
# Restrict this process to CUDA devices 0 and 1.
# NOTE(review): torch is imported before this line; the setting only helps if
# CUDA has not been initialised yet -- confirm nothing touches the GPU earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Data pre-processing 

In [4]:
# Location of the Excel workbook holding the papers and their queries.
PATH = "/kaggle/input/rag-data/meta_Rag.xlsx"

# Always bind `device`; previously it was left undefined on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Let read failures propagate. The earlier handler caught a CSV parser error
# (not what an Excel read fails with), printed a misleading message and left
# `dataset` unbound, so the real failure only surfaced as a NameError later.
dataset = pd.read_excel(PATH)

In [5]:
# Preview the first rows (head() defaults to five).
dataset.head()

Unnamed: 0,Main_Paper,MP,Query
0,Background The available evidence about the p...,MP: ObjectiveSeveral randomized controlled tri...,Generate a meta-analysis abstract on [Synbioti...
1,Introduction Several studies have demonstrated...,MP:Background: Depression is a severe disease ...,Generate a meta-analysis abstract on [Interven...
2,Background Glycemic control is vital to patie...,MP:Purpose Diabetes mellitus (DM) is increasi...,Generate a meta-analysis abstract on [Physical...
3,Objective This meta-analysis aimed to evaluat...,MP: Background Recent in vitro and animal expe...,Generate a meta-analysis abstract on [Pioglita...
4,Background Recurrent hemarthrosis is one of t...,MP:Background: The primary clinical manifestat...,Generate a meta-analysis abstract on [Effectiv...


In [9]:
# Splitter configured for chunks of at most 500 characters with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=500,
)

In [10]:
# `data` is not assigned by any cell in this notebook, so on a fresh kernel
# the split call raised NameError. Build the text explicitly instead.
# NOTE(review): assumes the retrieval corpus is the source-paper abstracts in
# the "MP" column -- confirm this matches how `data` was originally built.
data = "\n\n".join(dataset["MP"].dropna().astype(str))
docs = text_splitter.split_text(data)

In [12]:
from huggingface_hub import notebook_login

In [None]:
# Interactive Hugging Face Hub login; the later from_pretrained calls pass
# use_auth_token=True and rely on the token stored here.
notebook_login()

In [None]:
# Sentence-embedding model used for both the indexed chunks and the queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [22]:
# Smoke test: embed a short string so the embedding size can be inspected.
query_result = embeddings.embed_query("Hello World")

In [None]:

# Report how many values the embedding of the test string contains.
print("Length", len(query_result))

# Vector Database

In [24]:
# Credentials must come from the environment. A key committed to the notebook
# is readable by everyone who sees the file and must be treated as compromised
# (rotate any key that was previously embedded here).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this notebook."
    )
# The environment name is not a secret, so a default is acceptable.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [25]:

# Initialise the Pinecone client. The API key is found at app.pinecone.io and
# the environment name is shown next to the key in the console.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

# Name of the Pinecone index to write to -- replace with your own index name.
index_name = "langchainpinecone"


In [27]:
# Embed every chunk and store it in the named Pinecone index; the returned
# vector store is what the retriever is built from later.
docsearch = Pinecone.from_texts(
    list(docs),
    embeddings,
    index_name=index_name,
)

# Use fine-tuned Model from Hub

In [None]:
# Tokenizer of the fine-tuned checkpoint; authenticated with the stored Hub token.
tokenizer = AutoTokenizer.from_pretrained(
    "Bakugo123/LLama2_newPrompt",
    use_auth_token=True,
)

In [None]:
# Load the fine-tuned causal LM with 8-bit weights, float16 for the remaining
# tensors, and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    "Bakugo123/LLama2_newPrompt",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto',
    use_auth_token=True,
)

In [None]:
# Wrap the already-loaded model in a text-generation pipeline.
#
# torch_dtype / device_map are intentionally not passed: `model` is an
# instantiated object whose dtype (float16, 8-bit) and placement were fixed
# when it was loaded, and the bfloat16 value contradicted that load.
#
# temperature is set here so that it is forwarded to generation.
# NOTE(review): 0.7 mirrors the value the notebook supplies as model_kwargs
# when wrapping this pipeline for LangChain -- confirm it is the intended value.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=30,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)


In [38]:
# Expose the transformers pipeline through LangChain's LLM interface.
# NOTE(review): model_kwargs may not be forwarded to generation when a
# prebuilt pipeline is supplied -- confirm the temperature takes effect.
llm = HuggingFacePipeline(
    pipeline=pipe,
    model_kwargs={"temperature": 0.7},
)


# Instructions 

In [39]:
# System instruction given to the model for every request; the surrounding
# newlines of the literal are removed by .strip().
DEFAULT_SYSTEM_PROMPT = """
Given a collection of abstracts from papers used in various medical fields, generate a meta-analysis abstract summarizing the key findings of those abstracts and provide numerical values or statistical information for specific observations that are commonly reported in the provided abstracts. Some provided abstracts may have chunks, so maintain information similarities.
""".strip()

In [40]:
# Llama 2 chat delimiters. The instruction is wrapped in [INST] ... [/INST]
# and the system prompt in <<SYS>> ... <</SYS>>; the bare "<>" markers used
# before are not part of that format.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

In [41]:
# Wrap the default system prompt in the system delimiters. The right-hand side
# must read DEFAULT_SYSTEM_PROMPT: SYSTEM_PROMPT does not exist before this
# line, so the self-reference raised NameError on a fresh kernel (and would
# double-wrap the prompt whenever the cell was re-run).
SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

In [50]:
# User-turn template: the retrieved context followed by the question.
# {context} and {question} are filled in by the prompt template.
instruction = (
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

In [51]:
# Full prompt: opening tag, wrapped system prompt, user template, closing tag.
template = "".join([B_INST, SYSTEM_PROMPT, instruction, E_INST])

In [53]:
# LangChain prompt object exposing the two placeholders used in `template`.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Semantic Search Based on Query

In [54]:
# Retrieval-augmented QA chain: fetch the 2 nearest chunks from Pinecone, put
# them into the custom prompt ("stuff" chain type) and return the retrieved
# source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [55]:
# Run the chain on the query stored under label 0 of the "Query" column.
result = qa_chain(str(dataset["Query"][0]))

  warn_deprecated(


In [None]:
# Display the text stored under the chain output's 'result' key.
result['result']

# Generate and evaluate similarity

In [58]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions kept by the mask.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; it is given a trailing dimension and
            expanded to the shape of the token embeddings.
            (presumably 1 for real tokens and 0 for padding -- verify against caller)

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over
        dimension 1. The divisor is clamped to at least 1e-9 so a fully
        masked row yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total


# Embedding model and its tokenizer, fetched from the Hugging Face Hub.
tokenizer_emb = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model_emb = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# The two texts to compare: the generated answer first, then the 'Main_Paper'
# entry stored under label 0.
sentences = [
    result['result'],
    dataset['Main_Paper'][0],
]

# Tokenize both texts into one padded batch; truncation is enabled, so text
# beyond the tokenizer's length limit is cut off.
encoded_input = tokenizer_emb(
    sentences,
    return_tensors='pt',
    padding=True,
    truncation=True,
)

# Forward pass without gradient tracking.
with torch.no_grad():
    model_output = model_emb(**encoded_input)

# Mask-aware mean pooling, then L2-normalise each row.
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)

# print("Sentence embeddings:")
# print(sentence_embeddings)


In [None]:
from sentence_transformers import util

# Cosine similarity between the first embedding (generated answer) and the
# second (reference abstract).
cos_sin = util.cos_sim(
    sentence_embeddings[0],
    sentence_embeddings[1],
)
print("Cosine_sim:", cos_sin)