In [None]:
!pip -q install langchain
!pip -q install bitsandbytes accelerate xformers einops
!pip -q install datasets loralib sentencepiece
!pip -q install pypdf

!pip -q install sentence_transformers
!pip install chromadb

!pip install openai
!pip install tiktoken

In [None]:
import os
import sys
from getpass import getpass

import torch
import transformers
from huggingface_hub import notebook_login
from langchain import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

In [None]:
# Mount Google Drive inside the Colab runtime so the source documents are readable.
from google.colab import drive

drive.mount('/content/drive')

# Folder on the mounted drive that holds the files to index.
directory_path = '/content/drive/MyDrive/docs1'



In [None]:
# os.listdir returns entries in arbitrary order; sort them so the documents are
# loaded (and therefore chunked and indexed) in the same order on every run.
files = sorted(os.listdir(directory_path))

In [None]:
# Load every supported file found in `directory_path` into one flat list of
# LangChain documents. Files with any other extension are skipped.
document = []
for file in files:
  # `files` holds bare file names, so each one must be joined with the
  # directory before it is handed to a loader; otherwise the loader resolves
  # the name against the current working directory and cannot find the file.
  source_path = os.path.join(directory_path, file)

  if file.endswith(".pdf"):
    loader = PyPDFLoader(source_path)
  elif file.endswith(('.docx', '.doc')):
    # NOTE(review): legacy binary .doc files may not be readable by
    # Docx2txtLoader -- confirm, or convert them to .docx first.
    loader = Docx2txtLoader(source_path)
  elif file.endswith('.txt'):
    loader = TextLoader(source_path, encoding='UTF-8')
  else:
    continue

  document.extend(loader.load())

In [None]:
# Split documents into chunks of at most 500 characters with no overlap.
# RecursiveCharacterTextSplitter tries the separators in the order given, so
# they are listed from coarsest to finest: line breaks first, then commas, and
# single spaces only as the last resort. With the space listed first, line and
# comma boundaries were only tried on pieces that contain no space at all.
document_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
    separators=["\n", ",", " "],
)

In [None]:
# Run the splitter over every loaded document to obtain the chunks to index.
document_chunks = document_splitter.split_documents(document)

In [None]:
# Show how many chunks the splitter produced.
len(document_chunks)

In [None]:
# Embedding model created through HuggingFaceEmbeddings.
# Note: the following cell assigns `embeddings` again with OpenAIEmbeddings, so
# when the notebook is run top to bottom the vector store uses that later value.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

In [None]:
# Never hard-code the OpenAI key in the notebook: reuse OPENAI_API_KEY when it
# is already set in the environment, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Rebinds `embeddings`, replacing the HuggingFace model created in the previous cell.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed the chunks with `embeddings` and build a Chroma store that uses ./data
# as its persist directory.
vectordb = Chroma.from_documents(
    document_chunks,
    embedding=embeddings,
    persist_directory='./data',
)

In [None]:
# Ask Chroma to persist the store (the persist directory was set at creation).
vectordb.persist()

## Llama 2 chat model (local Hugging Face pipeline)

In [None]:
# Log in to the Hugging Face Hub; the Llama-2 loading cell below passes
# use_auth_token=True when fetching the tokenizer and model.
notebook_login()

In [None]:
## Llama: tokenizer and 4-bit chat model

LLAMA_MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_ID, use_auth_token=True)

model = AutoModelForCausalLM.from_pretrained(
    LLAMA_MODEL_ID,
    device_map='auto',
    torch_dtype=torch.float16,
    use_auth_token=True,
    load_in_4bit=True,  # alternative that was tried: load_in_8bit=True
)

In [None]:
# Wrap the already-loaded Llama model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Keep the dtype consistent with the one the model was loaded with
    # (torch.float16); this cell previously asked for bfloat16.
    torch_dtype=torch.float16,
    device_map='auto',
    max_new_tokens=512,
    min_new_tokens=-1,
    # NOTE(review): top_k is a sampling parameter and no do_sample flag is
    # passed here -- confirm whether sampling was intended.
    top_k=30,
)

In [None]:
# Expose the transformers pipeline to LangChain as an LLM.
# Note: the GPT cell further down assigns `llm` again, so on a top-to-bottom
# run the chain is built with that later value.
# NOTE(review): confirm that model_kwargs is actually forwarded to generation
# when a ready-made pipeline is passed in.
llm = HuggingFacePipeline(
    pipeline=pipe,
    model_kwargs={'temperature': 0},
)

## GPT-4 via the OpenAI API (rebinds `llm`, replacing the Llama pipeline above)

In [None]:
# OpenAI chat model; this rebinds `llm`, replacing the Llama wrapper created earlier.
llm = ChatOpenAI(
    model_name='gpt-4',
    temperature=0.7,
)

In [None]:
# Conversation buffer stored under the 'chat_history' key, returned as messages.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    return_messages=True,
)

In [None]:
# Retriever over the vector store, configured with k=20 results per query.
chunk_retriever = vectordb.as_retriever(search_kwargs={'k': 20})

# Conversational retrieval chain combining the LLM, the retriever and the memory.
pdf_qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=chunk_retriever,
    verbose=False,
    memory=memory,
)

In [None]:
# Ontology-population prompt sent to the retrieval chain (text kept verbatim).
ontology_question = "Given the following ontology, extract the related parts of the documents match with the ontology, ontology:has_Name(Agent, Name), has_Surname(Agent, Surname), has_First_Name(Agent, First_Name), has_Alternate_Name(Agent, Alternate_Name), recorded_At(Agent_name, Event), recorded_At(Agent_First_Name, Event), recorded_At(Agent_Surname, Event), recorded_At(Agent_Alternate_Name, Event). geerate texts to populate  based on the relations and document"

result = pdf_qa({"question": ontology_question})

In [None]:
# Display the answer text returned by the chain.
result['answer']

In [None]:
# Save the answer on the mounted Google Drive. The path has to be absolute
# (leading "/"): without it, "content/drive/..." is resolved against the
# current working directory instead of the drive mounted at /content/drive.
file_path = "/content/drive/MyDrive/outputn.txt"

# Write the answer as UTF-8 text; mode "w" replaces any existing file.
# The handle is named `output_file` so it does not reuse the name `file`,
# which the document-loading loop uses for file names.
with open(file_path, "w", encoding="utf-8") as output_file:
    output_file.write(result['answer'])
