In [None]:
!pip install accelerate>=0.12.0 transformers[torch]==4.25.1 
!pip install -U "ray[serve]"  # installs Ray + dependencies for Ray Serve
!pip install -U slack-bolt
!pip install faiss-cpu
!pip install atlassian-python-api
!pip install numpy




In [None]:
import torch
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from fastapi import FastAPI, Request
from ray import serve
from slack_bolt.async_app import AsyncApp
from slack_bolt.adapter.fastapi.async_handler import AsyncSlackRequestHandler
from slack_bolt.adapter.starlette.handler import SlackRequestHandler
import requests
from slack_sdk.signature import SignatureVerifier

import logging
# Configure the logger.


In [None]:
from atlassian import Confluence
import os

# Set up Confluence API connection
import re

# Set up Confluence API connection.
# NOTE(review): no credentials are passed, so this presumably relies on the space being
# readable anonymously — confirm.
confluence = Confluence(
    url='https://advendio.atlassian.net',
)
space_key = "SO"
output_dir = 'advendio_pages'

# `get_all_pages_from_space` returns a single result page per call (bounded by `limit`),
# so keep requesting with an advancing `start` offset until the API returns nothing.
pages = []
while True:
    batch = confluence.get_all_pages_from_space(space_key, start=len(pages), limit=100)
    if not batch:
        break
    pages.extend(batch)

# Create a directory to store the downloaded pages
os.makedirs(output_dir, exist_ok=True)

# Download each page
for page in pages:
    page_id = page['id']
    page_title = page['title']
    # Titles may contain path separators, which would make open() target a
    # non-existent sub-directory; replace them along with the spaces.
    page_filename = re.sub(r'[\\/]', '_', page_title.replace(' ', '_')) + '.html'
    page_content = confluence.get_page_by_id(page_id, expand='body.storage')['body']['storage']['value']
    try:
        # Write UTF-8 explicitly: the indexing step reads these files back as UTF-8.
        with open(os.path.join(output_dir, page_filename), 'w', encoding='utf-8') as f:
            f.write(page_content)
    except OSError as exc:
        # Best effort: report the failure and move on instead of silently claiming success.
        print('Failed to save:', page_filename, '-', exc)
        continue
    print('Downloaded:', page_filename)


In [None]:
# List the contents of the advendio_pages directory, which the download cell writes into.
!ls advendio_pages

In [None]:
import numpy as np

import torch
from typing import List
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from bs4 import BeautifulSoup
import os
import faiss
# Load questions and contexts


# TODO: integrate this with the Slack ingestor
@serve.deployment()
class DocumentVectorDB:
    """In-memory dense retriever over the HTML pages stored on disk.

    Pages are converted to plain text, embedded with the DPR context encoder and
    stored in a flat FAISS L2 index. Questions are embedded with the DPR question
    encoder and matched against that index.
    """

    def __init__(self, question_encoder_model: str = "facebook/dpr-question_encoder-single-nq-base"):
        """Load the documents and both DPR encoders, then build the index.

        Args:
            question_encoder_model: Hugging Face model id used for the question
                encoder and its tokenizer.
        """
        # Maximum number of tokens handed to either encoder; longer inputs are truncated.
        self.token_limit = 512

        self.documents = self.format_documents()
        self.question_encoder = DPRQuestionEncoder.from_pretrained(
            question_encoder_model)
        self.question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            question_encoder_model)
        self.context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        count = self.index_documents(self.documents)
        print(count)

    def index_documents(self, documents: List[str], batch_size: int = 8) -> int:
        """Embed ``documents`` and (re)build the FAISS index from them.

        Args:
            documents: Plain-text documents to index.
            batch_size: Number of documents encoded per forward pass.

        Returns:
            The number of vectors stored in the index.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("no documents to index")

        # Index exactly what was passed in, and keep the lookup list aligned with the
        # index rows so that search results map back to the right text.
        self.documents = list(documents)

        embedding_batches = []
        # Inference only: skip autograd bookkeeping, and encode in small batches so the
        # whole corpus is never padded into one tensor.
        with torch.no_grad():
            for start in range(0, len(self.documents), batch_size):
                batch = self.documents[start:start + batch_size]
                encoded_batch = self.context_tokenizer(
                    batch, return_tensors="pt", padding=True, truncation=True, max_length=self.token_limit)
                embedding_batches.append(
                    self.context_encoder(**encoded_batch).pooler_output.cpu().numpy())

        document_embeddings = np.ascontiguousarray(
            np.concatenate(embedding_batches, axis=0), dtype=np.float32)

        # Create Faiss Index
        vector_dimension = document_embeddings.shape[1]
        self.index = faiss.IndexFlatL2(vector_dimension)
        # Stored vectors are unit length. For unit-length stored vectors, ordering by L2
        # distance equals ordering by inner product with the query, whatever the query norm.
        faiss.normalize_L2(document_embeddings)
        self.index.add(document_embeddings)
        return self.index.ntotal

    def insert_documents(self) -> List[str]:
        """Placeholder for persisting documents somewhere; not implemented yet."""
        pass

    def encode_questions(self, query: str) -> np.ndarray:
        """Embed ``query`` with the question encoder.

        Returns:
            A contiguous array built from the encoder's pooled output for ``query``.
        """
        # Truncate so an over-long question cannot exceed the encoder's input limit.
        encoded_query = self.question_tokenizer(
            query, return_tensors="pt", truncation=True, max_length=self.token_limit)
        with torch.no_grad():
            query_embedding = self.question_encoder(**encoded_query).pooler_output.cpu().numpy()
        return np.ascontiguousarray(query_embedding)

    def query_documents(self, query: str) -> str:
        """Return the text of the indexed document closest to ``query``."""
        # Encode the query
        query_embedding = self.encode_questions(query)
        # Only the best match is returned, so ask the index for a single neighbour.
        _, idx = self.index.search(query_embedding, 1)
        return self.documents[int(idx[0][0])]

    def format_documents(self, pages_dir: str = 'advendio_pages') -> List[str]:
        """Read every file in ``pages_dir`` and return its text with the HTML removed.

        Args:
            pages_dir: Directory holding the downloaded HTML pages.

        Returns:
            One plain-text string per file, in sorted filename order.
        """
        documents = []
        # Sorted so that index row numbers are stable between runs.
        for filename in sorted(os.listdir(pages_dir)):
            path = os.path.join(pages_dir, filename)
            if not os.path.isfile(path):
                # Skip sub-directories, which cannot be opened as files.
                continue
            with open(path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, "lxml")
            documents.append(soup.get_text(separator=" ", strip=True))
        return documents

@serve.deployment(route_prefix="/")
class ConversationBot:
    """HTTP entry point: retrieves context for a question and generates an answer."""

    def __init__(self, db: DocumentVectorDB, model: str = "databricks/dolly-v2-12b"):
        """Store the retriever handle and the name of the generation model.

        Args:
            db: Handle to the ``DocumentVectorDB`` deployment.
            model: Hugging Face model id passed to ``transformers.pipeline``.
        """
        self.model = model
        self.db = db
        # Built on first use and then reused; loading the model per request is very slow.
        self._generator = None

    async def prompt(self, input: str):
        """Build the generation prompt: the question followed by the retrieved context."""
        # NOTE(review): assumes awaiting `.remote(...)` yields the method's result directly;
        # confirm against the handle API of the installed Ray Serve version.
        context = await self.db.query_documents.remote(input)
        # Positional fields: the original mixed `{input}` with `{}` and raised KeyError.
        return "{} \n context: {}\n".format(input, context)

    def generate_text(self, input_text: str) -> str:
        """Run the text-generation pipeline on ``input_text`` and return the generated text.

        ``input_text`` is passed to the model as is. Context retrieval is asynchronous,
        so it is done by ``prompt`` (awaited in ``__call__``) rather than in this
        synchronous method.
        """
        if self._generator is None:
            self._generator = pipeline(model=self.model,
                                       torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
        return self._generator(input_text)[0]["generated_text"]

    async def __call__(self, http_request: Request) -> str:
        """Handle a request whose JSON body is the question string."""
        input_text: str = await http_request.json()
        # Await the coroutine here so the pipeline receives a string, not a coroutine object.
        full_prompt = await self.prompt(input_text)
        return self.generate_text(full_prompt)

# 2: Deploy the model.
serve.run(ConversationBot.bind(DocumentVectorDB.bind()))

# NOTE(review): despite their names, `english_text` holds the question sent to the bot
# and `french_text` holds the text of the bot's HTTP response.
english_text = "what is an ads event?"
# 3: Query the deployment and print the result.
response = requests.post("http://127.0.0.1:8000/", json=english_text)
# Fail loudly on an HTTP error instead of printing the error body as if it were an answer.
response.raise_for_status()
french_text = response.text

print(french_text)



In [None]:
# `DocumentVectorDB` is wrapped by `@serve.deployment`, so the name refers to a Ray Serve
# Deployment object rather than the class itself. Build the underlying class directly for a
# local check and call its query method (the class defines no `__call__`).
db = DocumentVectorDB.func_or_class()
db.query_documents('what is ads campaign?')

In [None]:
# dolly_chat_bot = ConversationBot("databricks/dolly-v2-12b")
# slack_agent = SlackAgent(dolly_chat_bot)

# # TODO: setup the token and secrets

# memory_buffer = RedisManager()
# dolly_chat_bot.bind(memory_buffer)
# slack_agent.bind(dolly_chat_bot)

In [None]:
# No cell shown in this notebook defines `generate_text`, so on a fresh kernel this cell
# would fail with NameError. Build the pipeline here only if it is missing, with the same
# settings ConversationBot uses for its generator.
try:
    generate_text
except NameError:
    generate_text = pipeline(model="databricks/dolly-v2-12b",
                             torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")

output = generate_text("""
Imagine a person John Lin is a pharmacy shopkeeper at the Willow Market and Pharmacy who loves to help people. He is always looking for ways to make the process of getting medication easier for his customers; John Lin is living with his wife, Mei Lin, who is a college professor, and son, Eddy Lin, who is a student studying music theory; John Lin loves his family very much; John Lin has known the old couple next-door, Sam Moore and Jennifer Moore, for a few years; John Lin thinks Sam Moore is a kind and nice man; John Lin knows his neighbor, Yuriko Yamamoto, well; John Lin knows of his neighbors, Tamara Taylor and Carmen Ortiz, but has not met them before; John Lin and Tom Moreno are colleagues at The Willows Market and Pharmacy; John Lin and Tom Moreno are friends and like to discuss local politics together; John Lin knows the Moreno family somewhat well — the husband Tom Moreno and the wife Jane Moreno.

Generate conversation between John Lin and his wife in the morning:
""")
# Print only the generated text, as the other prompt cells do, not the raw result list.
print(output[0]['generated_text'])

In [None]:
# Persona description followed by an instruction to plan the character's day.
# NOTE(review): `generate_text` is not defined in any cell visible here; this cell needs it
# to exist in the kernel already.
prompt = """
Imagine a person John Lin is a pharmacy shopkeeper at the Willow Market and Pharmacy who loves to help people. He is always looking for ways to make the process of getting medication easier for his customers; John Lin is living with his wife, Mei Lin, who is a college professor, and son, Eddy Lin, who is a student studying music theory; John Lin loves his family very much; John Lin has known the old couple next-door, Sam Moore and Jennifer Moore, for a few years; John Lin thinks Sam Moore is a kind and nice man; John Lin knows his neighbor, Yuriko Yamamoto, well; John Lin knows of his neighbors, Tamara Taylor and Carmen Ortiz, but has not met them before; John Lin and Tom Moreno are colleagues at The Willows Market and Pharmacy; John Lin and Tom Moreno are friends and like to discuss local politics together; John Lin knows the Moreno family somewhat well — the husband Tom Moreno and the wife Jane Moreno.

Plan a day for John Lin
"""
output = generate_text(prompt)
# The result is indexed as a list of dicts; print the text of the first entry.
print(output[0]['generated_text'])

In [None]:
# Zero-shot sentiment labelling: an instruction, one review, and an empty "Sentiment:" slot.
# NOTE(review): `generate_text` is not defined in any cell visible here; this cell needs it
# to exist in the kernel already.
prompt = """Please label the sentiment towards the movie of the given movie review. The sentiment label should be "positive" or "negative". 
Text: i'll bet the video game is a lot more fun than the film. 
Sentiment:

"""
output = generate_text(prompt)
# The result is indexed as a list of dicts; print the text of the first entry.
print(output[0]['generated_text'])

In [None]:
# Few-shot prompt: two solved examples followed by the task to complete. The examples use
# the same "Output:" label as the final, unanswered item (the original spelled it "Ouput:"
# in the examples), so the pattern shown to the model is consistent.
prompt = """
Only output the last one

Definition: Determine the speaker of the dialogue, "agent" or "customer".
Input: I have successfully booked your tickets.
Output: agent

Definition: Determine which category the question asks for, "Quantity" or "Location".
Input: What's the oldest building in US?
Output: Location

Definition: Classify the sentiment of the given movie review, "positive" or "negative".
Input: i'll bet the video game is a lot more fun than the film.
Output:


"""

output = generate_text(prompt)
# The result is indexed as a list of dicts; print the text of the first entry.
print(output[0]['generated_text'])

In [None]:
# Ask the model which of the listed models should handle an image question; the raw return
# value is shown as the cell output.
# NOTE(review): the prompt says "three models" but lists only two — confirm which is intended.
generate_text("""I have three models, 1. visual transformer, 2. Text generation model
,  if i encounter this input from user: what is in the image? followed by an image file which model should I use?

output your prediction of yes or no for option 1 and 2

"""
)

In [None]:
# Free-form question with no instruction wrapper around it.
# NOTE(review): the prompt ends with a tab character before the closing quotes — confirm
# that it is intentional.
prompt = """
What are some kinds of embroidery stitches for writing letters?	"""

output = generate_text(prompt)
# The result is indexed as a list of dicts; print the text of the first entry.
print(output[0]['generated_text'])

In [None]:
!pip install evaluate

In [None]:
from evaluate import load
perplexity = load("perplexity", module_type="metric")
# The other cells index the result of `generate_text` as a list of dicts, while the
# perplexity metric takes plain strings, so extract the generated text first.
generated = generate_text("how does digital ads work in general?")
predictions = [generated[0]['generated_text']]
results = perplexity.compute(predictions=predictions, model_id='databricks/dolly-v2-12b')


In [None]:
# Display the value returned by perplexity.compute in the previous cell.
results