In [None]:
!pip install transformers
!pip install faiss-cpu

## Download Confluence Data

In [None]:
!pip install atlassian-python-api

from atlassian import Confluence
import os

# Set up Confluence API connection
confluence = Confluence(
url='https://advendio.atlassian.net',
)
confluence
space_key = "SO"
pages = confluence.get_all_pages_from_space(space_key)
pages
# Create a directory to store the downloaded pages
if not os.path.exists('advendio_pages'):
    os.makedirs('advendio_pages')
# Download each page
for page in pages:
    page_id = page['id']
    page_title = page['title']
    page_filename = page_title.replace(' ', '_') + '.html'
    page_content = confluence.get_page_by_id(page_id, expand='body.storage')['body']['storage']['value']
    try:
        with open('advendio_pages/' + page_filename, 'w') as f:
            f.write(page_content)
    except:
        pass
    print('Downloaded:', page_filename)


In [None]:
from bs4 import BeautifulSoup
import os
import faiss

# Extract the plain text of every downloaded page into `documents`.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep each document's position in the list stable from one run to the next.
documents = []
for filename in sorted(os.listdir('advendio_pages')):
    path = os.path.join('advendio_pages', filename)
    if not os.path.isfile(path):
        # Skip sub-directories (e.g. notebook checkpoint folders).
        continue
    with open(path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    # Parsing happens after the file is closed; only the raw HTML is needed.
    soup = BeautifulSoup(html_content, "lxml")
    text_content = soup.get_text(separator=" ", strip=True)
    documents.append(text_content)
  

In [None]:
# Sanity check: number of page texts collected in `documents`.
len(documents)

## Use DPR Question and Context Encoders 


In [None]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import torch

# User query (fixed example value)
query = "brown fox"

# Load DPR question and context encoders
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

if not documents:
    raise ValueError("No documents to encode: `documents` is empty.")

# Encode the documents.
# The corpus is processed in small batches and without gradient tracking:
# a single forward pass over every page with autograd enabled keeps all
# intermediate activations alive and can exhaust memory.
batch_size = 8
embedding_batches = []
with torch.no_grad():
    for start in range(0, len(documents), batch_size):
        encoded_documents = context_tokenizer(
            documents[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        embedding_batches.append(context_encoder(**encoded_documents).pooler_output)

# One row per document, in the same order as `documents`.
document_embeddings = torch.cat(embedding_batches)





## Create FAISS Index

In [None]:
import numpy as np

# Convert the embedding tensor into a C-contiguous NumPy array for FAISS.
document_embeddings = np.ascontiguousarray(
    document_embeddings.detach().numpy()
)

# The index dimensionality is the width of one embedding row.
vector_dimension = document_embeddings.shape[1]
print(vector_dimension)

# Normalize every row in place, then store the rows in a flat L2 index.
faiss.normalize_L2(document_embeddings)
index = faiss.IndexFlatL2(vector_dimension)
index.add(document_embeddings)
print(index.ntotal)


## Document Retrieval

In [None]:


question = input("Enter your prompt: ")

# Encode the text the user just typed (not a previously defined variable).
# Truncation guards against input longer than the 512-token cap used for the documents.
encoded_query = question_tokenizer(question, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    # No gradients are needed for inference, so the output converts to NumPy directly.
    query_embedding = question_encoder(**encoded_query).pooler_output.numpy()
query_embedding = np.ascontiguousarray(query_embedding)

# D holds the distances and I the positions (into `documents`) of the 4 nearest pages.
D, I = index.search(query_embedding, 4)
print(I)

# # Compute cosine similarity between the query embedding and document embeddings
# cosine_similarities = torch.mm(query_embedding, document_embeddings.T).squeeze(0)

# # Rank the documents based on cosine similarity scores
# ranked_documents = cosine_similarities.argsort(descending=True)

# # Display the ranked documents
# for index in ranked_documents:
#     print(f"Document {index + 1} (Score: {cosine_similarities[index].item():.4f}): {documents[index]}")


In [None]:
# Display the text of the document at position 11.
# NOTE(review): the index is hard-coded — presumably taken from a previously
# printed search result; it raises IndexError with fewer than 12 documents.
documents[11]

## Generate Response

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load the seq2seq model and its tokenizer from the same checkpoint.
godel_checkpoint = "microsoft/GODEL-v1_1-large-seq2seq"
qa_tokenizer = AutoTokenizer.from_pretrained(godel_checkpoint)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(godel_checkpoint)

# Ground the reply on the top-ranked document from the retrieval step.
knowledge = "[KNOWLEDGE] " + documents[I[0][0]]
# The dialog history is a single turn here; turns are joined with " EOS ".
dialog = " EOS ".join([question])
instruction = "Instruction: given a dialog context, you need to response empathically."

# Built under its own name so the `query` variable defined for retrieval is
# not overwritten when this cell runs.
prompt = f"{instruction} [CONTEXT] {dialog} {knowledge}"
print(prompt)

# Cap the prompt length: a full page of text can be arbitrarily long.
# 512 mirrors the limit already used when encoding the documents.
# NOTE(review): confirm this limit suits the model.
input_ids = qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).input_ids

# Nucleus sampling (top_p=0.9), so the reply differs from run to run.
generated_ids = qa_model.generate(
    input_ids, max_length=128, min_length=8, top_p=0.9, do_sample=True
)
output = qa_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print('Response: {}'.format(output))