# Assignment 2: Retrieval-Augmented Generation

In [1]:
# install packages
!pip install -qU google-generativeai pandas wget elasticsearch pinecone-client langchain tiktoken sentence-transformers

In [1]:
# import modules

from getpass import getpass
from elasticsearch import Elasticsearch, helpers
import wget
import zipfile
import pandas as pd
import json
import google.generativeai as genai
from pinecone import Pinecone, PodSpec
import langchain
from sentence_transformers import SentenceTransformer
import textwrap

  from .autonotebook import tqdm as notebook_tqdm


## Connect to Pinecone
You need to sign up a Pinecone Project first.

In [2]:
pc = Pinecone(
    api_key="60ec97ef-7862-4555-aa8c-8f8b3f7989e2",
)

Create Index


In [3]:
index_name = 'retrieval-augmentation-generation'

In [129]:
pc.delete_index(index_name)
pc.create_index(
    index_name,
    dimension=1792,  # dimensionality of your embedding model
    metric="cosine", # euclidean, cosine, or dotproduct
    spec=PodSpec(
      environment="gcp-starter"
    )
)

In [4]:
index = pc.Index(index_name)
# view index stats
index.describe_index_stats() 

{'dimension': 1792,
 'index_fullness': 0.02781,
 'namespaces': {'': {'vector_count': 2781}},
 'total_vector_count': 2781}

## Download and set the embedding model
You can find another embedding model from [MTEB Leaderboard Chinese](https://huggingface.co/spaces/mteb/leaderboard)

In [5]:
embed_model = SentenceTransformer('aspire/acge_text_embedding')
result = embed_model.encode("What is the meaning of life?")
print(result)

[ 0.59684855 -0.74949414  0.4362398  ...  0.03004938  0.36892733
 -0.10507792]


In [8]:
result.shape

(1792,)

## Load your document
Upload your data or you can get sample data from here [result.json](https://drive.google.com/file/d/1851ouBLReFwO7-T4W8NIpIn2BYGOkESm/view?usp=sharing)

In [5]:

with open('result.json', 'r') as file:
  data = json.load(file)

for item in data:
  url_title = []
  url = []
  for link in item['links']:
    url_title.append(link[0])
    url.append(link[1])
  item['links_title'] = url_title
  item['links_url'] = url
  
  img_alt = []
  img_url = []
  for img in item['imgs']:
    img_alt.append(img[0] if img[0] else "")
    img_url.append(img[1])
  item['images_alt'] = img_alt
  item['images_url'] = img_url

print(data[0])

{'url': 'https://www.csie.ncu.edu.tw', 'title': '國立中央大學資訊工程學系', 'depth': 0, 'content': '國立中央大學資訊工程學系\n首頁系所介紹系所簡介成員簡介研究領域實驗室簡介招生資訊大學部碩士班博士班外籍生在職班課程內容大學部碩士班博士班檔案下載畢業生就業流向\n得獎訊息\n徵才訊息\n招生快訊\n演講公告\n活動快訊\n課程訊息\n系辦公告\n得獎訊息\n徵才訊息\n招生快訊\n演講公告\n活動快訊\n課程訊息\n系辦公告\n得獎訊息\n                                    更多                                \n1122學期書卷獎(1121學期成績優異)得獎名單\n2024-03-06\n【獎狀領取】1121學期書卷獎(1112學期成績優異)得獎名單\n2024-03-06\n賀！楊鎮華講座教授榮獲Future Earth Taipei 2023年度貢獻獎\n2024-01-03\n賀！本系學生參加112年度全國大專電腦軟體設計競賽榮獲第四名及佳作\n2023-11-30\n賀！吳曉光教授、孫敏德教授指導實驗室團隊參加ECICE 2023榮獲最佳論文獎\n2023-11-30\n賀！本系學生參加ICPC 2023國際大學生程式設計競賽亞洲區桃園站榮獲銀牌獎和銅牌獎\n2023-11-21\n賀！林家瑜教授指導實驗室團隊參加第28屆大專校院資訊應用服務創新競賽榮獲第三名及最佳人氣獎\n2023-11-10\n賀！張嘉惠教授指導實驗室團隊參加ROCLING 2023榮獲最佳論文獎\n2023-11-01\n賀！王家慶教授研究室團隊參加2023客語語音辨認競賽榮獲學生組客語漢字組第一名\n2023-11-01\n【獎狀領取】1112學期書卷獎(1111學期成績優異)得獎名單\n2023-10-13\n徵才訊息\n                                    更多                                \n員榮醫療體系【員榮醫院獎助學金】\n2024-03-14\n國立中央大學資訊工程學系 誠徵助理教授(含)以上之專任教師\n2024-01-30\n職涯發展中心活動報名取消機制\n

## Indexing
Split your text into chunks of a specified max length

In [6]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [9]:
from tqdm.auto import tqdm
from uuid import uuid4

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'title': record['title'],
        'source': record['url'],
        "depth": record['depth'],
        "links_title": record['links_title'],
        "links_url": record['links_url'],
        "images_alt": record['images_alt'],
        "images_url": record['images_url']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['content'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches

    id = str(uuid4())
    record["id"] = id # assign the id to the record
    ids = [f"{id}#chunk{d['chunk']}" for d in record_metadatas]
    embeds = embed_model.encode(record_texts)
    try:
        index.upsert(vectors=zip(ids, embeds, record_metadatas))
    except Exception as e:
        print(e)
        print(f"Upsert failed: {metadata['title']}, {metadata['source']}")
        continue
    record_texts = []
    record_metadatas = []

 24%|██▍       | 158/653 [01:02<03:18,  2.50it/s]

(400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '115', 'x-pinecone-request-latency-ms': '199', 'x-pinecone-request-id': '3217441989890665105', 'date': 'Thu, 21 Mar 2024 19:16:45 GMT', 'x-envoy-upstream-service-time': '23', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":3,"message":"Metadata size is 61943 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}

Upsert failed: 國立中央大學招生資訊網-大學「申請入學」招生管道, https://admission.ncu.edu.tw/zh-TW/content/71/12


100%|██████████| 653/653 [03:42<00:00,  2.94it/s]


In [125]:
index.describe_index_stats()

{'dimension': 1792,
 'index_fullness': 0.02781,
 'namespaces': {'': {'vector_count': 2781}},
 'total_vector_count': 2781}

Save the new data with id as database (because free plan can't perform index listing)

In [19]:
with open("result_db.json", "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

Restore the database by loading the database json

In [6]:
with open("result_db.json", "r", encoding="utf-8") as file:
    db = json.load(file)

## Encode query with OpenAI embedding model
To perform kNN search, we need to encode queries with the ***same embedding model*** used to encode the documents at index time.

In [15]:
# Define your question
query = '系上有哪些獎學金?'
embed_query = embed_model.encode(query)
print(embed_query.shape)
print(type(embed_query))

(1792,)
<class 'numpy.ndarray'>


Then search the query from your pinecone index.

In [9]:
matches = index.query(
  vector=embed_query.tolist(),
  top_k=10,
  include_values=False,
)
print(len(matches.matches))

10


Preprocess your data by your way.

In [None]:
retrival_data = []
for item in matches.matches:
  result = index.fetch([item.id])
  retrival_data.append(result['vectors'][item.id].metadata)

top3_hit_result = ""
for i, data in enumerate(retrival_data):
  top3_hit_result += f"No.{i+1} data: {data['text']}\n"
  top3_hit_result += f"No.{i+1} url source: {data['source']}\n"
print(top3_hit_result)

## Use Chat Completions API for retrieval augmented generation
Using a LLM model together with a retrieval model is known as retrieval augmented generation (RAG). We're using Pinecone to do what it does best, retrieve relevant documents. Then we use the LLM to do what it does best, tasks like generating summaries and answering questions, using the retrieved documents as context.

The model will generate a response to the question, using the top kNN hit as context. In this example, we're using the google [gemini-pro](https://ai.google.dev/docs?_gl=1*1h1fkzn*_up*MQ..&gclid=CjwKCAiA0PuuBhBsEiwAS7fsNcytPlXijnHKH8rOIAPXZqLjnWID-cnLQP7fdgMcPsKTVm2TaEUyNBoCfd4QAvD_BwE) model.


In [6]:
# Get google API key
genai.configure(api_key="AIzaSyC71miq1uuOH1BYm5PiaoqAvDKHPbp712A")

# Define model
MODEL = "gemini-pro"
model = genai.GenerativeModel(MODEL)

In [7]:
query = "介紹一下中央大學的資訊工程系"
query_prompt = textwrap.dedent(
    f"""User query: {query}
    You are a great keyword extraction model. Please extract the keywords from the given user query. Please provide keywords separated by a comma. Please
    try to extract as least as you can.
    """
)
query_response = model.generate_content(query_prompt)
print(query_response.text)

cleaned_query = query_response.text
embedding = embed_model.encode(cleaned_query)

splited_keywords = cleaned_query.split(',')
matches = index.query(
    vector=embedding.tolist(),
    top_k=3,
    filter={
        "content": {"$nin": splited_keywords}
    }
)
print(matches.matches)

中央大學, 資訊工程系
[{'id': '38bb478c-ea45-4a43-bc99-73181ac37a0e#chunk0',
 'score': 0.890612423,
 'values': []}, {'id': 'c62587ed-0c84-4822-8cc4-3cc3720b6bd2#chunk0',
 'score': 0.884719,
 'values': []}, {'id': '16e57fb0-b707-4f9c-b1f5-d96fcdd92edc#chunk0',
 'score': 0.879389107,
 'values': []}]


In [8]:
retrival_data = []
for item in matches.matches:
  id_prefix, chunk_idx = item.id.split("#")
  chunk_idx = int(chunk_idx.split("chunk")[1])
  searching_ids = []
  for add in [0]:
    if chunk_idx + add < 0:
      continue
    else:
      searching_ids.append(f"{id_prefix}#chunk{chunk_idx + add}")
  result = index.fetch(searching_ids)
  metadata = result["vectors"][searching_ids[0]].metadata
  content = ""
  for id in searching_ids:
    try:
      content += result["vectors"][id].metadata["text"]  
    except:
      continue
  metadata["content"] = content
  retrival_data.append(metadata)

top3_hit_result = ""
for i, data in enumerate(retrival_data):
  top3_hit_result += f"'''No.{i+1} {data['title']}: {data['source']}\n"
  top3_hit_result += f"No.{i+1} data: {data['content']}'''\n"
print(top3_hit_result)

'''No.1 學生專區 - 國立中央大學資訊工程學系: https://www.csie.ncu.edu.tw/information
No.1 data: 學生專區 - 國立中央大學資訊工程學系
首頁系所介紹系所簡介成員簡介研究領域實驗室簡介招生資訊大學部碩士班博士班外籍生在職班課程內容大學部碩士班博士班檔案下載畢業生就業流向
課程設計
專業師資
畢業生就業流向
入學資訊
招生資訊
學生專區
認識中大資工'''
'''No.2 學生專區 - 國立中央大學資訊工程學系: https://www.csie.ncu.edu.tw/information/research
No.2 data: 學生專區 - 國立中央大學資訊工程學系
首頁系所介紹系所簡介成員簡介研究領域實驗室簡介招生資訊大學部碩士班博士班外籍生在職班課程內容大學部碩士班博士班檔案下載畢業生就業流向
課程設計
專業師資
畢業生就業流向
入學資訊
招生資訊
專業師資
專業師資(四大學群教師)
人工智慧、資料科學與多媒體研究群
研究方向：
身分認證、人工智慧、深度學習、神經網路
智慧型計算、圖論、最佳化演算法
機器學習、電腦視覺、機器人視覺
視覺偵測、辨識、定位、檢測
影像處理、壓縮、生成、轉換
虛擬實境、擴增實境、人機互動
多媒體資料庫、邊緣計算
醫學影像處理與復健科技
資訊分析技術與應用
資料探勘及倉儲技術的構建與應用'''
'''No.3 網站地圖 - 國立中央大學資訊工程學系: https://www.csie.ncu.edu.tw/sitemap
No.3 data: 網站地圖 - 國立中央大學資訊工程學系
首頁系所介紹系所簡介成員簡介研究領域實驗室簡介招生資訊大學部碩士班博士班外籍生在職班課程內容大學部碩士班博士班檔案下載畢業生就業流向
網站地圖
系所資訊
系所簡介
成員簡介
研究領域
課程內容
公告資訊
最新消息
課程訊息
招生快訊
活動演講
徵才訊息
招生資訊
大學部
碩士班
博士班
外籍生
在職班
線上服務
教室租借系統
聯絡我們
檔案下載
電子報
計畫相關說明
學生資訊
學生專區
高中生專區
系學會
相關連結
系友會
版權宣告
IEET工程及科學教育認證
舊版系網站
中央資工FB
系學會
教室借用
資電院院刊
SNMG
        Copyright © 2024 Department

In [9]:
prompt = textwrap.dedent("""QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'
  You are a great assistant. There are a total of 3 pieces of searched information here. Please extract the relevant parts of each piece of information based on the user's question and organize it into complete and understandable content and reply to the user. Please extract the information one by one with the given order. You should use the language of input to answer this question. Make sure there are no omission, and provide the source URL of all 3 pieces of information. Follow this format:
  
  {{Title of the information}}: {{URL of the information}}
  {{Content of the information}}
  
  You must not exclude any pieces of the searched information. You must keep the url the same with the original url.
  
  """).format(query=query, relevant_passage=top3_hit_result)

answer = model.generate_content(prompt)

print("------------------------------------------------------------")
print(answer.text)
print("------------------------------------------------------------")

------------------------------------------------------------
1. No.1 學生專區 - 國立中央大學資訊工程學系: https://www.csie.ncu.edu.tw/information
系所簡介

國立中央大學資訊工程學系成立於民國 59 年，為台灣最早成立的資訊工程學系之一，至今已超過半世紀的歷史。本系自創系以來，一直秉持「培育創新科技人才，引領社會數位轉型」的使命，致力於培育具備理論基礎、實務經驗、創新思維及社會責任的資訊工程人才。

2. No.2 學生專區 - 國立中央大學資訊工程學系: https://www.csie.ncu.edu.tw/information/research
專業師資

人工智慧、資料科學與多媒體研究群

研究方向：

* 身分認證、人工智慧、深度學習、神經網路
* 智慧型計算、圖論、最佳化演算法
* 機器學習、電腦視覺、機器人視覺
* 視覺偵測、辨識、定位、檢測
* 影像處理、壓縮、生成、轉換
* 虛擬實境、擴增實境、人機互動
* 多媒體資料庫、邊緣計算
* 醫學影像處理與復健科技
* 資訊分析技術與應用
* 資料探勘及倉儲技術的構建與應用

3. No.3 網站地圖 - 國立中央大學資訊工程學系: https://www.csie.ncu.edu.tw/sitemap
招生資訊

* 大學部
* 碩士班
* 博士班
* 外籍生
* 在職班
------------------------------------------------------------
