## Installation

In [1]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U faiss-cpu
!pip install -q -U datasets
!pip install -q -U git+https://github.com/Cafelatte1/hugging-rag

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━

## Setup

In [2]:
# Single seed shared by every random number generator used in this notebook.
GLOBAL_SEED = 42

import os
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here does not
# change string hashing in the already-running kernel; only child processes inherit it.
os.environ['PYTHONHASHSEED'] = str(GLOBAL_SEED)
import sys

import random as rnd
import pandas as pd
import numpy as np
from numpy import random as np_rnd
from tqdm import tqdm
import gc
import time

import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoConfig, BitsAndBytesConfig

from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from HuggingRAG.vector_data import VectorDataContainer
from HuggingRAG.huggingface_vector_embedding import HuggingFaceVectorEmbedding
from HuggingRAG.faiss_vector_store import FaissVectorStore
from HuggingRAG.vector_ranker import VectorRanker
from HuggingRAG.huggingface_api import HuggingFaceAPI

import warnings
warnings.filterwarnings(action='ignore')

# Actually apply the seed: the generation step samples (do_sample=True), so without
# seeding the Python, NumPy and PyTorch generators the output changes on every run.
rnd.seed(GLOBAL_SEED)
np_rnd.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)

## Loading raw data

In [3]:
# Fetch CNN/DailyMail 3.0.0 and convert its validation split to a DataFrame.
cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0")
validation_df = cnn_dailymail["validation"].to_pandas()
# sampling: rows 0-99 are the documents to index, rows 100-109 are kept aside as queries
df = validation_df.iloc[:100].reset_index(drop=True)
df_test = validation_df.iloc[100:110].reset_index(drop=True)

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [4]:
# Preview the first five rows of the query sample (rows 100-109 of the validation split)
df_test.head(n=5)

Unnamed: 0,article,highlights,id
0,"(CNN)This week, Google CFO Patrick Pichette ma...",Google CFO Patrick Pichette's memo announcing ...,3f30ddc2b0b2dfaa6455dc53c06858354dd552be
1,"(CNN)A.J. Pero, a longtime drummer for the met...","A.J. Pero, the longtime Twisted Sister drummer...",6561228f2de49a532933531cddd5b14a0876491a
2,(CNN)The world of Mexican wrestling wrestling ...,Pro wrestler Hijo del Perro Aguayo collapses i...,61a34a0926cdd21fcf0c5a12191d7a4cd167b047
3,(CNN)New Zealand police have revealed a threat...,New Zealand police reveal threat to poison inf...,3d7c727b0dc57e1afcfad5219a4b5442a0ab7a9d
4,(CNN)Sigma Alpha Epsilon is under fire for a v...,Sigma Alpha Epsilon is being tossed out by the...,00716be72be8cf48cc23ac3b4b8924e569628be2


In [5]:
# Preview the first five rows of the document sample (rows 0-99 of the validation split)
df.head(n=5)

Unnamed: 0,article,highlights,id
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,a4942dd663020ca54575471657a0af38d82897d6
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,4157bc4da185971e2742f349d69a037343bc0d95
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,60736693e3b1b32d14337a317190c6606e879a85
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,8cdf9cc3ed0276b7a7944cc18ba459355b5984ad
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",2e6613d531843515bf5401286cc3e45c4df530d2


## Create Vector Data

In [6]:
def _collapse_whitespace(text):
    """Return ``text`` with every run of whitespace replaced by a single space."""
    return " ".join(text.split())


# Container that preprocesses each text and splits it with a 200-size / 20-overlap splitter.
vector_data = VectorDataContainer(
    text_preprocessor=_collapse_whitespace,
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20),
)

# Both text columns are registered as features, keyed by the article id.
doc_features = {
    "article": df["article"],
    "highlights": df["highlights"],
}
vector_data.get_vector_data(doc_id=df["id"].to_list(), doc_features=doc_features)
print("Number of chunks:", len(vector_data.get_chunks()))

100%|██████████| 100/100 [00:00<00:00, 1102.28it/s]

Number of chunks: 2050





## Create Vector Embedding

In [7]:
# Embedding checkpoint: swap in any other Hugging Face model id here
model_id = "sentence-transformers/all-MiniLM-L6-v2"
# Maximum length that is later passed to HuggingFaceVectorEmbedding
max_len = 512
# Print the model configuration to check the maximum length the model supports
model_config = AutoConfig.from_pretrained(model_id)
print(model_config)

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.38.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [8]:
# Instantiate the embedding wrapper on the CUDA device
vector_embedding = HuggingFaceVectorEmbedding(model_id, max_len, device="cuda")
# Embed every chunk held by the vector data container, using batch_size=64
chunks = vector_data.get_chunks()
embedding = vector_embedding.get_vector_embedding(chunks, batch_size=64)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

100%|██████████| 33/33 [00:18<00:00,  1.76it/s]


## Create Vector Store & Ranker

In [9]:
# Create the ranker and the vector store, then build the store from the chunk embeddings.

# Brute-force searching with faiss
# The 'exponential_weighted' type calculates similarity on all chunks and averages the similarities with exponential weights
vector_ranker = VectorRanker(ranking_type="exponential_weighted")
vector_store = FaissVectorStore(vector_data, vector_ranker)
vector_store.get_vector_store(embedding, use_gpu=False)

# Alternative kept for reference: approximate searching with scann
# (additionally requires installing the scann library).
# NOTE(review): the snippet below uses `VectorStore` and `get_vectorstore`, neither of
# which is imported or used anywhere else in this notebook - confirm against the
# current HuggingRAG API before enabling it.
# scann_params = {
#     "build": {
#         "num_leaves": 100,
#         "num_leaves_to_search": 10,
#         "training_sample_size": 100_000,
#     },
# }
# vector_store = VectorStore(vector_data, store_model_type="scann")
# vector_store.get_vectorstore(embedding, building_params=scann_params)

In [10]:
# Example of document searching: use one held-out article as the query and print the
# indexed article that the vector store ranks first for it.
search_query = df_test["article"].iloc[0]
print("=== Input text ===")
print(search_query.replace(". ", ".\n"))
doc_embedding = vector_embedding.get_vector_embedding(search_query)
output = vector_store.search(doc_embedding)
print("\n\n")
# Take the first row of the per-document scores as the best match
# (presumably sorted by descending score - verify against VectorRanker).
top_doc_id = output["score_by_docs"]["doc_id"].iloc[0]
# The text printed below is the "article" column, so the header is labelled accordingly.
print(f"=== Article of output document (id: {top_doc_id}) ===")
print(df.loc[df["id"] == top_doc_id, "article"].iloc[0].replace(". ", ".\n"))
# In the saved run, the query about Google's CFO retiring retrieves an article about
# Silicon Valley and employment.

=== Input text ===
(CNN)This week, Google CFO Patrick Pichette made headlines when his resignation memo announcing his retirement surfaced in the media.
But the uproar wasn't that Pichette was quitting so much as why.
"After nearly seven years as CFO," he began, "I will be retiring from Google to spend more time with my family." What he wanted now was to enjoy life at home and abroad with his wife, to "grab our backpacks and hit the road -- celebrate our last 25 years together by turning the page and enjoy a perfectly fine midlife crisis full of bliss and beauty." The letter, which he said he wrote in part because, "so many people struggle to strike the right balance between work and personal life," has been held up as a manifesto for the "work/life balance" ideal that's become something of the new American dream.
The media has described it as "powerful" and "unusually reflective." Google co-founder and CEO Larry Page said, "Well worth reading, it will warm your heart." But if Pichette

100%|██████████| 1/1 [00:00<00:00,  5.44it/s]





=== Highlights of output document (id: b4fdd95dfcf0a6898ea0152c3837f266e76aae35) ===
(CNN)We have no problem taking Wall Street executives to task for decisions that leave American families financially devastated, yet we give Silicon Valley billionaires a pass when they do the same thing.
America needs to realize that instead of creating jobs, Silicon Valley is erasing them, leaving millennials financially stranded before their careers can get off the ground.
Silicon Valley is tossing millennials aside like yesterday's laptop.
The commonly held belief is that with hard work and a good education, a young person in America can get a good job.
But despite falling unemployment, college grads age 22 to 27 are stuck in low-paying jobs that don't even require a college degree.
The percentage of young people languishing in low-skill, low-paying jobs is 44%, a 20-year high.
Only 36% of college grads have jobs that pay at least $45,000, a sharp decline from the 1990s, after adjusting for infl

## Generation with Retrieval Documents

In [11]:
# Generator checkpoint: swap in any other Hugging Face model id here.
# NOTE: this rebinds model_id / max_len, which previously held the embedding model's
# values - re-run the embedding config cell before re-running the embedding cell.
model_id = "openai-community/gpt2-xl"
# Maximum length that is later passed to HuggingFaceAPI
max_len = 1024
# Print the model configuration to check the maximum length the model supports
model_config = AutoConfig.from_pretrained(model_id)
print(model_config)

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

GPT2Config {
  "_name_or_path": "openai-community/gpt2-xl",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.38.0.dev0",
  "use_cache": true,
  "vocab_size": 50257
}



In [12]:
# Quantization settings used when loading the generator model
quantization_kwargs = {
    # load the weights in 4-bit precision
    "load_in_4bit": True,
    # 4-bit data type used for the quantized weights
    "bnb_4bit_quant_type": "nf4",
    # nested quantization: the quantization constants are quantized as well
    "bnb_4bit_use_double_quant": True,
    # data type used for computation
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
quantization_params = BitsAndBytesConfig(**quantization_kwargs)

# Wire the generator to the vector data, embedding model and vector store built above
llm = HuggingFaceAPI(
    model_id,
    max_len,
    vector_data,
    vector_embedding,
    vector_store,
    quantization_params=quantization_params,
    device="cuda",
)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
# Decoding settings handed to llm.generate
generation_params = {
    "max_new_tokens": 300,
    "num_beams": 3,
    "do_sample": True,
    "temperature": 0.8,
    "top_k": 50,
    "top_p": 0.95,
    "length_penalty": 1.0,
    "repetition_penalty": 1.2,
    "no_repeat_ngram_size": 3,
}
# early stopping is enabled only when more than one beam is used
generation_params["early_stopping"] = generation_params["num_beams"] > 1

# the query is the text of an article
search_query = df_test["article"].iloc[0]
question = "Summarize on article of [Document 1]"
prompt = llm.create_prompt_template(lang="eng")
output = llm.generate(
    prompt,
    search_query=search_query,
    question=question,
    generation_params=generation_params,
    feature_length_strategy="balanced",
    max_feature_length=768,
    feature_length_threshold=95,
)

100%|██████████| 1/1 [00:00<00:00,  5.72it/s]


In [14]:
# Show the text stored under the "response" key of the generation output
response_text = output["response"]
print(response_text)

Instructions: Please refer to the searched documents to provide an appropriate response to the request.
The searched documents are in the format [Document N] within the ``` delimiter.
If you do not know the request, please respond with 'I don't know.'

Searched documents
```
[Document 1]
article: (CNN)We have no problem taking Wall Street executives to task for decisions that leave American families financially devastated, yet we give Silicon Valley billionaires a pass when they do the same thing. America needs to realize that instead of creating jobs, Silicon Valley is erasing them, leaving millennials financially stranded before their careers can get off the ground. Silicon Valley is tossing millennials aside like yesterday's laptop. The commonly held belief is that with hard work and a good education, a young person in America can get a good job. But despite falling unemployment, college grads age 22 to 27 are stuck in low-paying jobs that don't even require a college degree. The pe

In [15]:
# Top-ranked documents; their scores are averaged using the ranking type that was given
doc_scores = output["retrieval_docs"]["score_by_docs"]
print(doc_scores.head(10))

                                     doc_id    scores
0  b4fdd95dfcf0a6898ea0152c3837f266e76aae35  0.629838
1  dc0f01376a2e0cfe8dff118a43eae341ac24cf41  0.614461
2  50f309d789f0a7a27ca49fb80976b6b34fd084c5  0.606136
3  d880a8ef6c527db2e1d9378399e4312a2267101e  0.599973
4  f4a21564e3c65809f110b832138f2a3cf5b019cc  0.593352
5  49045b9f988c35b8310c114ffcf4599a6b84d3b3  0.592631
6  ce7f51dc71d8367bc6d5265c290531351355a337  0.591708
7  7cbac82527ee6f0e72f2003fd0fe8d479d4becfa  0.586563
8  fbc5ac3a3a7bb6c4d628cfbeef92b67bb18562f9  0.581821
9  d3a8c8f3aa8aeb9fbc24151a223e3a4b17447644  0.581668


In [16]:
# Chunk-level scores for the chunks of the top-ranked documents (first ten rows)
chunk_scores = output["retrieval_docs"]["score_by_chunks"]
print(chunk_scores.head(10))

                                     doc_id  chunk_id    scores
0  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         7  0.690630
1  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        18  0.684058
2  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         0  0.684013
3  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        30  0.674659
4  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        22  0.672502
5  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         6  0.663150
6  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        10  0.656962
7  b4fdd95dfcf0a6898ea0152c3837f266e76aae35         2  0.653012
8  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        23  0.649016
9  b4fdd95dfcf0a6898ea0152c3837f266e76aae35        21  0.648498
