In [4]:
# Uncomment to install the dataset dependencies:
# %pip install -qU datasets apache_beam mwparserfromhell

## Building the Knowledge Base

In [5]:
from datasets import load_dataset

# Load only the first 10,000 rows of the train split of the
# "20220301.simple" Wikipedia configuration.
data = load_dataset(
    "wikipedia",
    "20220301.simple",
    split='train[:10000]',
)

In [6]:
# Build the tokenizer that is used to measure text length in tokens.
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# Actually verify that the hard-coded encoding is the one tiktoken maps to the
# model: a bare lookup in the middle of a cell is neither displayed nor checked.
assert tiktoken.encoding_for_model(model_name='gpt-4').name == tokenizer.name, (
    "tokenizer encoding does not match the encoding tiktoken reports for gpt-4"
)

In [7]:
# Length function: measures a text in tokens rather than in characters.

def tiktoken_len(text):
    """Return how many tokens the module-level `tokenizer` yields for `text`.

    `disallowed_special=()` is passed so that, per tiktoken's documentation,
    special-token strings appearing in the text are encoded as ordinary text
    instead of raising an error.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# Sanity check: display the token count of a sample sentence.
tiktoken_len("Hello I am a chunk of text and using the tiktoken_len function we can find the length of this chunk of text in tokens")

26

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter whose lengths are measured by tiktoken_len, so chunk_size and
# chunk_overlap are counted in tokens. Separators are listed from coarsest
# (paragraph break) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [9]:
# Split the text of record 5 and keep its first three chunks for inspection.
chunks = text_splitter.split_text(
    data[5]['text']
)[:3]
chunks

['Spain is divided in 17 parts called autonomous communities. Autonomous means that each of these autonomous communities has its own executive, legislative judicial powers. These are similar to, but not the same as, states in the United States of America, for example.\n\nSpain has fifty smaller parts called provinces. In 1978 these parts came together, making the autonomous communities. \nBefore then, some of these provinces were together but were broken. The groups that were together once before are called "historic communities": Catalonia, Basque Country, Galicia and Andalusia.\n\nThe Spanish language is the sole official language in every autonomous community but six, where Spanish is co-official with other languages, as follows:\n Catalonia: Catalan and Occitan\n Valencian Community: Catalan (also called Valencian there)\n Balearic Islands: Catalan\n Galicia: Galician\n Basque Country: Basque\n Navarre: Basque (only in the north and near the border with the Basque County)\n\nList o

In [10]:
# Token lengths of the first two chunks, displayed as a tuple.
tuple(map(tiktoken_len, (chunks[0], chunks[1])))

(222, 243)

## Now that the text has been split into chunks, we need to create the embeddings

In [11]:
import os
from dotenv import load_dotenv
# Load settings via python-dotenv; later cells read their API keys and the
# proxy URL from os.environ.
load_dotenv()

True

In [12]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model to use
model_name = "text-embedding-ada-002"

# Embedding client; the API base URL and the API key both come from the
# environment (a missing variable raises KeyError here).
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_base=os.environ["OPENAI_REVERSE_PROXY"],
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [13]:
# Two short sample strings used to try out the embedding client.
texts = [
    'this is the first chunk of text', 'then another second chunk of text is here',
]

In [14]:
# Embed the sample texts, then display (number of vectors, vector dimension).
res = embed.embed_documents(texts)
(len(res), len(res[0]))

(2, 1536)

## Let's now create our vector database

In [15]:
# Name of the Pinecone index that later cells create and connect to.
index_name = "langchain3"

In [16]:
import pinecone


# Initialise the Pinecone client; the API key is read from the environment.
pinecone.init(api_key=os.environ['PINECONE_API_KEY'], environment='asia-southeast1-gcp-free')

# Create the Pinecone index only if it does not exist yet.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        # Match the embedding width measured from the sample vectors in `res`
        # (1536 for text-embedding-ada-002).
        dimension=len(res[0]),
        metric='cosine',
    )

In [24]:
# Then we connect to the new index: `index` is the handle used for upserts
# and stats queries in the cells that follow.

index = pinecone.Index(index_name=index_name)

## Indexing

In [30]:
# We could index through the LangChain vector store object, but going through
# the Pinecone Python client directly is much faster. Chunks are embedded and
# upserted in batches of at least `batch_limit` items.
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed one batch of chunks and upsert it into the Pinecone index.

    Args:
        batch_texts: list of chunk strings to embed.
        batch_metadatas: list of metadata dicts, one per chunk, in the same
            order as `batch_texts`.

    Uses the module-level `embed` client and `index` handle. Every vector gets
    a fresh random UUID as its id.
    """
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    # Hand the client a concrete list instead of a one-shot zip iterator, so
    # the payload can be sized and iterated more than once.
    # NOTE(review): newer Pinecone clients appear to require a list here - confirm.
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in tqdm(data):
    # Metadata shared by every chunk of this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }

    # Split the record text into chunks
    record_texts = text_splitter.split_text(record['text'])

    # One metadata dict per chunk: chunk position and chunk text plus the
    # shared record fields
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]

    # Add this record's chunks to the pending batch
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # Flush once the pending batch has reached batch_limit. A batch can end up
    # larger than batch_limit because whole records are appended at a time.
    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas)
        texts = []
        metadatas = []

# Flush whatever is left after the last record. `texts` and `metadatas` are
# deliberately not cleared here, so they remain available for inspection.
if texts:
    _upsert_batch(texts, metadatas)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# Show what is left in `texts` after the indexing loop (the final batch is
# not cleared after its upsert).
texts

In [48]:
# Display the index statistics (dimension, fullness, vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.3,
 'namespaces': {'': {'vector_count': 28422}},
 'total_vector_count': 28422}

## Creating a Vector Store and Querying

In [49]:
# Now that the index is built, we can switch back to LangChain by wrapping
# the same index in a LangChain vector store.
from langchain.vectorstores import Pinecone

# Metadata key under which each chunk's text was stored during indexing
text_field = "text"

# Re-open the index handle for LangChain
index = pinecone.Index(index_name=index_name)

# Arguments: index handle, query-embedding function, metadata text key
vectorstore = Pinecone(index, embed.embed_query, text_field)




In [51]:
# Question used for the retrieval and question-answering demos
query = "who was benito mussolini?"

# Fetch the three stored chunks most similar to the query.
vectorstore.similarity_search(query, k=3)

[Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist. He was also the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party.\n\nBiography\n\nEarly life\nBenito Mussolini was named after Benito Juarez, a Mexican opponent of the political power of the Roman Catholic Church, by his anticlerical (a person who opposes the political interference of the Roman Catholic Church in secular affairs) father. Mussolini\'s father was a blacksmith. Before being involved in politics, Mussolini was a newspaper editor (where he learned all his propaganda skills) and elementary school teacher.\n\nAt first, Mussolini was a socialist, but when he wanted Italy to join the First World War, he was thrown out of the socialist party. He \'invented\' a new ideology, Fascism, much out of Nationalist\xa0and Conservative views.\n\nRise to power and becoming dictator\nIn 1922, he took power b

## Generative Question-Answering


In [52]:
# In generative question-answering (GQA) the query is a question for an LLM to
# answer, but the LLM must base its answer on the information returned from
# the vector store.
#
# To do this we first need a chat model, which a RetrievalQA chain then wraps.

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Completion LLM; the API base URL and key come from the environment, and
# temperature is fixed at 0.0.
llm = ChatOpenAI(
    openai_api_base=os.environ['OPENAI_REVERSE_PROXY'],
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model_name='gpt-3.5-turbo',
    temperature=0.0,
)

In [53]:
# Retrieval QA chain: the vector store supplies the context and the "stuff"
# chain type is used to pass it to the LLM.
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [54]:
# Run the retrieval QA chain on the query and display its answer.
qa.run(query)

'Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy. Mussolini established a dictatorship and implemented policies that aimed to create a new Roman Empire. He allied with Adolf Hitler and led Italy into World War II as part of the Axis Powers. Mussolini was eventually deposed and captured by Italian partisans in 1945, and he was executed by firing squad.'

In [41]:
from langchain.chains import RetrievalQAWithSourcesChain

# Same retrieval set-up as the plain QA chain, but built with the chain class
# that also reports sources.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [42]:
# Calling the chain displays a dict with 'question', 'answer' and 'sources' keys.
qa_with_sources(query)

{'question': 'who wa benito mussolini?',
 'answer': "Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of fascism in Italy. Mussolini's form of fascism, known as Italian Fascism, had some differences from Hitler's Nazism and focused on the idea of creating a new Roman Empire. He led Italy into World War II as part of the Axis Powers but was eventually deposed and captured by the Allies. Mussolini was executed by partisans in 1945. After his death, several Neo-Fascist movements emerged in Italy, and his granddaughter Alessandra Mussolini has expressed similar views to fascism. \n",
 'sources': '\n- https://simple.wikipedia.org/wiki/Benito%20Mussolini\n- https://simple.wikipedia.org/wiki/Italy'}