In [1]:
# import modules

import json
import os
import textwrap
import zipfile
from getpass import getpass

import google.generativeai as genai
import langchain
import pandas as pd
import wget
from elasticsearch import Elasticsearch, helpers
from pinecone import Pinecone, PodSpec
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the Pinecone API key from the PINECONE_API_KEY environment variable,
# prompting interactively as a fallback, so the secret is never stored in the
# notebook. NOTE(review): the key that used to be committed here should be
# treated as leaked and rotated.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
)
# Name of the index that the retrieval cells below query.
index_name = 'retrieval-augmentation-generation'

In [3]:
# Bind a handle to the index, then show its statistics as the cell's output.
index = pc.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1792,
 'index_fullness': 0.02781,
 'namespaces': {'': {'vector_count': 2781}},
 'total_vector_count': 2781}

In [4]:
# Load the sentence-embedding model and smoke-test it on a single sentence.
# (The previous target name `embed_self.genapi` referenced an undefined
# `embed_self` object and raised NameError on a fresh kernel.)
embed_model = SentenceTransformer('aspire/acge_text_embedding')
result = embed_model.encode("What is the meaning of life?")
print(result)

[ 0.59684855 -0.74949414  0.4362398  ...  0.03004938  0.36892733
 -0.10507792]


In [5]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Encoding whose tokenizer is used below to measure text length in tokens.
ENCODING_NAME = 'cl100k_base'
tokenizer = tiktoken.get_encoding(ENCODING_NAME)

def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    ``disallowed_special=()`` is passed so that text containing special-token
    strings is encoded as ordinary text instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Splitter whose chunk size and overlap are measured by `tiktoken_len`
# (i.e. in tokens), trying the separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=512,
    chunk_overlap=20,
)

In [6]:
# Read the Google API key from the GOOGLE_API_KEY environment variable,
# prompting interactively as a fallback, so the secret is never stored in the
# notebook. NOTE(review): the key that used to be committed here should be
# treated as leaked and rotated.
genai.configure(
    api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
)

# Define the generative model used for text generation.
MODEL = "gemini-pro"
genapi = genai.GenerativeModel(MODEL)

In [9]:
class Chatbot:
    """Retrieval-augmented question answering over a Pinecone index with Gemini.

    ``generate_answer`` is the main entry point. It expands the user query into
    related queries, writes a generated passage for each one, embeds every
    passage and searches the index with it, fuses the rankings, fetches the
    metadata of the top documents and asks the generative model to summarise
    them for the user.
    """

    def __init__(self, pinecone_api_key=None, google_api_key=None):
        """Create the Pinecone, embedding, tokenizer and Gemini clients.

        Args:
            pinecone_api_key: Pinecone API key. When omitted, the
                ``PINECONE_API_KEY`` environment variable is used, and the user
                is prompted if that is unset too.
            google_api_key: Google Generative AI API key. When omitted, the
                ``GOOGLE_API_KEY`` environment variable is used, and the user
                is prompted if that is unset too.
        """
        # Secrets are resolved at runtime instead of being stored in the source.
        pinecone_api_key = (
            pinecone_api_key
            or os.environ.get("PINECONE_API_KEY")
            or getpass("Pinecone API key: ")
        )
        google_api_key = (
            google_api_key
            or os.environ.get("GOOGLE_API_KEY")
            or getpass("Google API key: ")
        )

        self.pc = Pinecone(api_key=pinecone_api_key)
        self.index_name = 'retrieval-augmentation-generation'
        # Use this instance's own client and index name rather than notebook
        # globals, so the class works on a fresh kernel.
        self.index = self.pc.Index(self.index_name)
        self.embed_model = SentenceTransformer('aspire/acge_text_embedding')
        self.tokenizer = tiktoken.get_encoding('cl100k_base')
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=20,
            length_function=self._token_length,
            separators=["\n\n", "\n", " ", ""],
        )
        genai.configure(api_key=google_api_key)
        self.genapi = genai.GenerativeModel("gemini-pro")

    def _token_length(self, text):
        """Return the number of tokens ``self.tokenizer`` produces for ``text``."""
        return len(self.tokenizer.encode(text, disallowed_special=()))

    def generate_related_query(self, qeury, num_results=4):
        """Ask the generative model for search queries related to ``qeury``.

        Args:
            qeury: The original search query (parameter name kept as-is for
                backward compatibility with keyword callers).
            num_results: Number of related queries requested in the prompt.

        Returns:
            The raw text of the model response; the caller splits it on
            newlines to obtain individual queries.
        """
        prompt = textwrap.dedent(
            f"""
            You are a helpful assistant that can generate multiple search queries based on a given search query. 
            Please enerate multiple search queries related to: {qeury},
            OUTPUT: ({num_results} queries):
            """
        )
        response = self.genapi.generate_content(prompt)
        return response.text

    def generate_fake_answer(self, qeury):
        """Ask the generative model to write a passage answering ``qeury``.

        The passage is later embedded and used as the search vector in place of
        the bare question (see ``generate_answer``).

        Returns:
            The generated passage with surrounding whitespace stripped.
        """
        query_prompt = textwrap.dedent(
            f"""
            You are a helpful assistant of 資訊工程學系 in 國立中央大學(NCU). Please write a passage to answer the qustion. Your answer should be in sentences only and with the same language as the query.
            Question: {qeury}
            Passage:
        """
        )
        response = self.genapi.generate_content(query_prompt)
        return response.text.strip()

    def vector_search(self, query, top_k=100):
        """Embed ``query`` and return the ``top_k`` nearest documents.

        Returns:
            A dict mapping document id to similarity score, in the order the
            index returned the matches.
        """
        embed_query = self.embed_model.encode(query)
        matches = self.index.query(vector=embed_query.tolist(), top_k=top_k)
        return {ret["id"]: ret["score"] for ret in matches.matches}

    def reciprocal_rank_fusion(self, search_results, k=60):
        """Fuse several ranked result lists into one ranking.

        Args:
            search_results: Dict mapping each query to a dict of
                ``{doc_id: score}`` whose iteration order is the ranking.
            k: Constant added to the zero-based rank in the denominator.

        Returns:
            A dict ``{doc_id: fused_score}`` sorted by descending fused score.
            Only the position of a document in each list is used; the original
            similarity scores are ignored.
        """
        fused_results = {}
        for results in search_results.values():
            for rank, doc_id in enumerate(results):
                fused_results[doc_id] = fused_results.get(doc_id, 0) + 1 / (rank + k)

        return dict(sorted(fused_results.items(), key=lambda item: item[1], reverse=True))

    def generate_hit_results(self, results, num_results=3):
        """Fetch metadata for the top ``num_results`` documents and format it.

        Args:
            results: Dict of ``{doc_id: score}`` already sorted best-first.
            num_results: How many documents to include.

        Returns:
            A list of formatted strings, one per document, each containing the
            ``title``, ``source`` and ``text`` metadata fields. Empty when
            ``results`` is empty.
        """
        searching_ids = list(results.keys())[:num_results]
        if not searching_ids:
            # Nothing to fetch; avoid calling the index with an empty id list.
            return []

        result = self.index.fetch(searching_ids)
        retrival_data = [result["vectors"][doc_id].metadata for doc_id in searching_ids]

        hit_results = []
        for i, data in enumerate(retrival_data):
            r = ""
            r += f"'''No.{i+1} {data['title']}: {data['source']}\n"
            r += f"No.{i+1} data: {data['text']}'''\n"
            hit_results.append(r)
        return hit_results

    def generate_output(self, query, hit_result):
        """Ask the generative model to answer ``query`` from the retrieved passages.

        Args:
            query: The user's original question.
            hit_result: List of formatted passages from ``generate_hit_results``.

        Returns:
            The response object returned by ``generate_content``; callers read
            its ``.text`` attribute.
        """
        prompt = textwrap.dedent("""QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'
    You are a great assistant. There are a total of several pieces of searched information here. Please extract the relevant parts of each piece of information based on the user's question and organize it into complete and understandable content and reply to the user. Please extract the information one by one with the given order. You should use the language of input to answer this question. Make sure there are no omission, and provide the source URL of all pieces of information.
    OUTPUT: (Information extracted with all {length} pieces of information in order.)
    """).format(query=query, relevant_passage=hit_result, length=len(hit_result))
        return self.genapi.generate_content(prompt)

    def generate_answer(self, query):
        """Run the full retrieval-and-generation pipeline for ``query``.

        Returns:
            The response object produced by ``generate_output``.
        """
        # Split the model's reply into individual related queries. The previous
        # code unpacked a list that wrapped the split result, so the whole list
        # was passed to generate_fake_answer as one "query".
        related_queries = self.generate_related_query(query).split("\n")
        queries = [q.strip() for q in [query, *related_queries] if q.strip()]

        fake_answers = [self.generate_fake_answer(q) for q in queries]
        # Search with each generated passage; results are keyed by passage text.
        gathered_results = {
            passage: self.vector_search(passage) for passage in fake_answers
        }
        results = self.reciprocal_rank_fusion(gathered_results)
        hit_result = self.generate_hit_results(results)
        return self.generate_output(query, hit_result)

    def choose_action(self, query):
        """Placeholder; the original definition had no body and did not compile.

        Raises:
            NotImplementedError: always, until the behaviour is implemented.
        """
        raise NotImplementedError("choose_action is not implemented yet")

In [10]:
# Instantiate the chatbot; the constructor creates the Pinecone client, loads
# the embedding model and tokenizer, and configures the Gemini model.
chatbot = Chatbot()

In [12]:
# Run one end-to-end query and print the generated answer between two rules.
query = "系上的修業規定?"
answer = chatbot.generate_answer(query)

divider = "------------------------------------------------------------"
print(divider)
print(" | Search: ", query, " | ")
print("根據您的問題，我們找到了以下資訊：")
print(answer.text)
print(divider)

------------------------------------------------------------
 | Search:  系上的修業規定?  | 
根據您的問題，我們找到了以下資訊：
1. **系上的修業規定?**
[修業規定](https://www.csie.ncu.edu.tw/information/course)
修業規定請參考註冊組網頁


2. **系訂必修 60 學分\n選俢45 學分**
[課程設計](https://www.csie.ncu.edu.tw/information/course)
專業師資
畢業生就業流向
入學資訊
招生資訊
課程設計
最新修業辦法請參考 教務處網頁
課程設計理念
本系課程規劃分三部份，藉由共同必修、數學及基礎科學課程、工程專業課程等三方面的課程規劃給於學生完整的學習系統。其中共同必修由校共同必修科目以及核心通識組成：包含校共同必修國文、英文、歷史以及通識課程。
共同必修 23 學分： 校共同必修(9學分)及通識(含核心必修、 選修)(14 學分)。
系訂必修 60 學分
選俢45 學分


3. **修業規定請參考註冊組網頁**
[課程設計](https://www.csie.ncu.edu.tw/information/course)
修業規定
修業規定請參考註冊組網頁
學分學程修讀規定
請參考課務組網頁
------------------------------------------------------------


: 