## Installation

In [1]:
!pip install -q -U openai
!pip install -q -U faiss-cpu
!pip install -q -U datasets
!pip install -q -U git+https://github.com/Cafelatte1/hugging-rag

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.
tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m2

In [2]:
# for user to use .env file
!pip install python-dotenv
from dotenv import load_dotenv
load_dotenv()
# # for user to set API key manually
# os.environ['OPENAI_API_KEY'] = "abc123"

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


True

## Setup

In [3]:
# Seed shared across the notebook; it is also passed as the `seed` generation
# parameter in the generation cell.
GLOBAL_SEED = 42

import os
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

# NOTE(review): `rnd`, `np_rnd` and `torch` are imported, but no seeding call
# using GLOBAL_SEED is visible in this notebook - confirm whether that is intended.
import random as rnd
import pandas as pd
import numpy as np
from numpy import random as np_rnd
from tqdm import tqdm
import gc
import time

import torch
from torch import nn
import torch.nn.functional as F

# Dataset loading, text chunking and the HuggingRAG building blocks used below.
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from HuggingRAG.vector_data import VectorDataContainer
from HuggingRAG.chatgpt_vector_embedding import ChatGPTVectorEmbedding
from HuggingRAG.faiss_vector_store import FaissVectorStore
from HuggingRAG.vector_ranker import VectorRanker
from HuggingRAG.chatgpt_api import ChatGPTAPI

# Silence every warning for the rest of the session (keeps the cell outputs clean,
# but also hides deprecation notices).
import warnings
warnings.filterwarnings(action='ignore')

## Loading raw data

In [4]:
# Pull CNN/DailyMail 3.0.0 and work only with its validation split as a DataFrame.
validation_frame = load_dataset("cnn_dailymail", "3.0.0")["validation"].to_pandas()
# Sampling: the first 100 rows form the searchable corpus (`df`);
# the next 10 rows are kept aside as query documents (`df_test`).
df = validation_frame.iloc[:100].reset_index(drop=True)
df_test = validation_frame.iloc[100:110].reset_index(drop=True)

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [5]:
# Preview the first five rows of the query sample.
df_test.head(n=5)

Unnamed: 0,article,highlights,id
0,"(CNN)This week, Google CFO Patrick Pichette ma...",Google CFO Patrick Pichette's memo announcing ...,3f30ddc2b0b2dfaa6455dc53c06858354dd552be
1,"(CNN)A.J. Pero, a longtime drummer for the met...","A.J. Pero, the longtime Twisted Sister drummer...",6561228f2de49a532933531cddd5b14a0876491a
2,(CNN)The world of Mexican wrestling wrestling ...,Pro wrestler Hijo del Perro Aguayo collapses i...,61a34a0926cdd21fcf0c5a12191d7a4cd167b047
3,(CNN)New Zealand police have revealed a threat...,New Zealand police reveal threat to poison inf...,3d7c727b0dc57e1afcfad5219a4b5442a0ab7a9d
4,(CNN)Sigma Alpha Epsilon is under fire for a v...,Sigma Alpha Epsilon is being tossed out by the...,00716be72be8cf48cc23ac3b4b8924e569628be2


In [6]:
# Preview the first five rows of the corpus sample.
df.head(n=5)

Unnamed: 0,article,highlights,id
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,a4942dd663020ca54575471657a0af38d82897d6
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,4157bc4da185971e2742f349d69a037343bc0d95
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,60736693e3b1b32d14337a317190c6606e879a85
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",2e6613d531843515bf5401286cc3e45c4df530d2


## Create Vector Data

In [7]:
def _collapse_whitespace(text):
    """Replace every run of whitespace in `text` with a single space."""
    return " ".join(text.split())


# Splitter configuration: chunk_size=200 with chunk_overlap=20.
chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

vector_data = VectorDataContainer(
    text_preprocessor=_collapse_whitespace,
    text_splitter=chunk_splitter,
)

# Register both text columns of each corpus document under its article id.
feature_columns = {name: df[name] for name in ("article", "highlights")}
vector_data.get_vector_data(doc_id=df["id"].to_list(), doc_features=feature_columns)

chunk_count = len(vector_data.get_chunks())
print("Number of chunks:", chunk_count)

100%|██████████| 100/100 [00:00<00:00, 948.50it/s]

Number of chunks: 2050





## Create Vector Embedding

In [8]:
# Embedding configuration: swap `model_id` for whichever embedding model you want.
# The max length of text-embedding-3-small is 8191; this notebook uses 512.
model_id, max_len = "text-embedding-3-small", 512

In [9]:
# Build the embedding client, then embed every chunk in batches of 32.
vector_embedding = ChatGPTVectorEmbedding(model_id, max_len)
all_chunks = vector_data.get_chunks()
embedding = vector_embedding.get_vector_embedding(all_chunks, batch_size=32)

100%|██████████| 65/65 [00:49<00:00,  1.31it/s]


## Create Vector Store & Ranker

In [10]:
# Build the vector store and the ranker it uses.

# Brute-force search with faiss.
# The 'exponential_weighted' ranking type calculates similarity on all chunks and
# averages the similarities with exponential weights.
ranking_type = "exponential_weighted"
vector_ranker = VectorRanker(ranking_type=ranking_type)
vector_store = FaissVectorStore(vector_data, vector_ranker)
vector_store.get_vector_store(embedding, use_gpu=False)

# # Approximate search with scann (additionally requires installing the scann library).
# # NOTE(review): `VectorStore` and `get_vectorstore` are not imported or used anywhere
# # else in this notebook - this snippet looks stale; confirm against the library.
# scann_params = {
#     "build": {
#         "num_leaves": 100,
#         "num_leaves_to_search": 10,
#         "training_sample_size": 100_000,
#     },
# }
# vector_store = VectorStore(vector_data, store_model_type="scann")
# vector_store.get_vectorstore(embedding, building_params=scann_params)

In [11]:
# Example of document search: use a held-out article as the query and show the
# best-matching article from the indexed corpus.
search_query = df_test["article"].iloc[0]
print("=== Input text ===")
# One sentence per line for readability.
print(search_query.replace(". ", ".\n"))
doc_embedding = vector_embedding.get_vector_embedding(search_query)
output = vector_store.search(doc_embedding)
print("\n\n")
# Take the first row of `score_by_docs` as the best match (the recorded scores
# are listed in descending order).
best_doc_id = output["score_by_docs"]["doc_id"].iloc[0]
# The header names the column that is actually printed: the article text.
print(f"=== Article of output document (id: {best_doc_id}) ===")
best_article = df.loc[df["id"] == best_doc_id, "article"].iloc[0]
print(best_article.replace(". ", ".\n"))
# In the recorded run the query is about Google's CFO and the retrieved document
# is an opinion piece about Silicon Valley, i.e. the two are topically related.

=== Input text ===
(CNN)This week, Google CFO Patrick Pichette made headlines when his resignation memo announcing his retirement surfaced in the media.
But the uproar wasn't that Pichette was quitting so much as why.
"After nearly seven years as CFO," he began, "I will be retiring from Google to spend more time with my family." What he wanted now was to enjoy life at home and abroad with his wife, to "grab our backpacks and hit the road -- celebrate our last 25 years together by turning the page and enjoy a perfectly fine midlife crisis full of bliss and beauty." The letter, which he said he wrote in part because, "so many people struggle to strike the right balance between work and personal life," has been held up as a manifesto for the "work/life balance" ideal that's become something of the new American dream.
The media has described it as "powerful" and "unusually reflective." Google co-founder and CEO Larry Page said, "Well worth reading, it will warm your heart." But if Pichette

100%|██████████| 1/1 [00:00<00:00,  4.53it/s]




=== Highlights of output document (id: b4fdd95dfcf0a6898ea0152c3837f266e76aae35) ===
(CNN)We have no problem taking Wall Street executives to task for decisions that leave American families financially devastated, yet we give Silicon Valley billionaires a pass when they do the same thing.
America needs to realize that instead of creating jobs, Silicon Valley is erasing them, leaving millennials financially stranded before their careers can get off the ground.
Silicon Valley is tossing millennials aside like yesterday's laptop.
The commonly held belief is that with hard work and a good education, a young person in America can get a good job.
But despite falling unemployment, college grads age 22 to 27 are stuck in low-paying jobs that don't even require a college degree.
The percentage of young people languishing in low-skill, low-paying jobs is 44%, a 20-year high.
Only 36% of college grads have jobs that pay at least $45,000, a sharp decline from the 1990s, after adjusting for infl




## Generation with Retrieved Documents

In [12]:
# Chat model configuration: swap `model_id` for whichever chat model you want.
# NOTE(review): the previous comment quoted 8191 as the max length of gpt-3.5-turbo,
# which is the figure this notebook gives for the embedding model - confirm the
# chat model's actual limit before raising `max_len`.
# NOTE: `model_id` and `max_len` overwrite the embedding settings assigned earlier.
model_id = "gpt-3.5-turbo"
max_len = 1024
llm = ChatGPTAPI(model_id, max_len, vector_data, vector_embedding, vector_store)

In [13]:
# Generation options handed to `llm.generate`; the seed reuses GLOBAL_SEED.
generation_params = dict(max_tokens=300, temperature=0.8, seed=GLOBAL_SEED)

prompt = llm.create_prompt_template(lang="eng")
# The search query is the full text of one held-out article.
search_query = df_test["article"].iloc[0]
question = "Summarize on article of [Document 1]"
output = llm.generate(
    prompt,
    search_query=search_query,
    question=question,
    generation_params=generation_params,
    feature_length_strategy="balanced",
    max_feature_length=768,
    feature_length_threshold=95,
    reformat_output=True,
)

100%|██████████| 1/1 [00:00<00:00,  4.59it/s]


In [14]:
# Show the text stored under the "response" key of the generation output.
response_text = output["response"]
print(response_text)

Please refer to the searched documents to provide an appropriate response to the request.
The searched documents are in the format [Document N] within the ``` delimiter.
If you do not know the request, please respond with 'I don't know.'.

Searched documents
```
[Document 1]
article: (CNN)We have no problem taking Wall Street executives to task for decisions that leave American families financially devastated, yet we give Silicon Valley billionaires a pass when they do the same thing. America needs to realize that instead of creating jobs, Silicon Valley is erasing them, leaving millennials financially stranded before their careers can get off the ground. Silicon Valley is tossing millennials aside like yesterday's laptop. The commonly held belief is that with hard work and a good education, a young person in America can get a good job. But despite falling unemployment, college grads age 22 to 27 are stuck in low-paying jobs that don't even require a college degree. The percentage of y

In [15]:
# Top-ranked documents; each score is averaged according to the ranking type you chose.
top_docs = output["retrieval_docs"]["score_by_docs"].head(10)
print(top_docs)

                                     doc_id    scores
0  b4fdd95dfcf0a6898ea0152c3837f266e76aae35  0.673265
1  d880a8ef6c527db2e1d9378399e4312a2267101e  0.632507
2  efa996e2bc36eac33348ebd3768a9f7ed02c5e49  0.631985
3  fbc5ac3a3a7bb6c4d628cfbeef92b67bb18562f9  0.614255
4  aa9a164b2f592570b64063546dda4eed22133a0e  0.610614
5  dc0f01376a2e0cfe8dff118a43eae341ac24cf41  0.605731
6  ce7f51dc71d8367bc6d5265c290531351355a337  0.604500
7  7cbac82527ee6f0e72f2003fd0fe8d479d4becfa  0.603436
8  14c1b26ea2b78196c2d1cf3a5882aedb2f6558aa  0.602611
9  d83332b67ad10eebe60beba30160869dc942f8ff  0.602436


In [16]:
# Chunk-level scores for the chunks of the top-ranked documents (first ten rows).
top_chunks = output["retrieval_docs"]["score_by_chunks"].head(10)
print(top_chunks)

                                     doc_id  chunk_id    scores
0  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        18  0.727263
1  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        21  0.716876
2  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        31  0.715666
3  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         0  0.713705
4  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        22  0.711989
5  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        29  0.699294
6  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         2  0.694611
7  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        13  0.694580
8  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         1  0.693305
9  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        15  0.688116
