# Castena - Chatbot for multilingual podcasts

* Document retrieval via langchain

In [None]:
# Notebook dependencies, installed quietly (-q) through IPython's `!` shell escape.
!pip install langchain huggingface_hub tiktoken -q
!pip install chromadb -q
!pip install PyPDF2 pypdf sentence_transformers -q
!pip install -U together -q
!pip install -U FlagEmbedding -q
# googletrans is pinned to the 3.1.0a0 pre-release; the other packages are unpinned.
!pip install googletrans==3.1.0a0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.8/437.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# -- Import libraries
from   typing                      import Any, Dict, List, Mapping, Optional
from   pydantic                    import Extra, Field, root_validator
from   langchain.callbacks.manager import CallbackManagerForLLMRun
from   langchain.memory            import ConversationBufferWindowMemory
from   langchain.llms.base         import LLM
from   langchain.llms.utils        import enforce_stop_tokens
from   langchain.chains.llm        import LLMChain
from   langchain.utils             import get_from_dict_or_env
from   googletrans                 import Translator
from   langchain.vectorstores      import Chroma
from   langchain.text_splitter     import RecursiveCharacterTextSplitter, CharacterTextSplitter
from   langchain.chains            import RetrievalQA, ReduceDocumentsChain, MapReduceDocumentsChain
from   langchain.document_loaders  import TextLoader, DirectoryLoader
from   langchain.embeddings        import HuggingFaceEmbeddings
from   langchain.prompts           import PromptTemplate
from   langchain.schema            import prompt
from   langchain.chains.mapreduce  import MapReduceChain
from   langchain.chains.combine_documents.stuff import StuffDocumentsChain
import pandas                      as pd
import logging
import together
import textwrap
import os

# -- Constants
# Delimiters wrapped around the instruction and the system prompt by get_prompt().
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# Read the system prompt as ONE string: get_prompt() concatenates it with the
# delimiters above, which raises a TypeError when it is a list of lines
# (which is what readlines() returns).
with open("./main/prompts/default_system_prompt.txt", "r") as f:
    DEFAULT_SYSTEM_PROMPT = f.read()

# Setup API Key

In [None]:
# SECURITY: never store API keys in the notebook. The key that used to be
# hard-coded here is readable by anyone with access to this file and should be
# revoked.
# Reuse TOGETHER_API_KEY from the environment when it is already set; otherwise
# ask for it interactively without echoing it to the output.
if not os.environ.get("TOGETHER_API_KEY"):
    from getpass import getpass

    os.environ["TOGETHER_API_KEY"] = getpass("Together API key: ")

# Pre-process data

In [None]:
# Single shared googletrans client, configured to use translate.googleapis.com.
translator = Translator(service_urls=['translate.googleapis.com'])


def translate_text(text, target_lang='en'):
    """Translate ``text`` into ``target_lang`` and return the translated string.

    Args:
        text: Value to translate; it is converted with ``str()`` before sending.
        target_lang: Destination language code, passed to googletrans as ``dest``.

    Returns:
        The ``text`` attribute of the googletrans translation result.
    """
    # Reuse the module-level client so its ``service_urls`` setting is honoured
    # and a new client is not constructed on every call.
    translated = translator.translate(str(text), dest=target_lang)
    return translated.text

In [None]:
# -- Optional step: turn the original Spanish transcript (headerless
# -- "time|speaker|transcript" rows) into translated, speaker-attributed turns.
transcription_df = pd.read_table(
    './data/original_spanish_transcriptions/worldcast_roberto_vaquero_transcription.txt',
    sep='|',
    header=None,
)
transcription_df = transcription_df.rename(columns={0: 'time', 1: 'speaker', 2: 'transcript'})

transcription_df['time'] = pd.to_timedelta(transcription_df['time'])
# True on every row whose speaker differs from the previous row's speaker.
transcription_df['speaker_change'] = transcription_df['speaker'].ne(transcription_df['speaker'].shift())

# The running count of speaker changes identifies each uninterrupted turn, so
# grouping on it merges consecutive rows of the same speaker.
result = transcription_df.groupby(['speaker', transcription_df['speaker_change'].cumsum()]).agg(
    {
        'time': ['min', 'max'],
        'transcript': lambda pieces: '.'.join(pieces),
    }
)
# Flatten the two-level column index produced by the aggregation.
result.columns = result.columns.droplevel()
result.columns = ['min_time', 'max_time', 'transcript']
result = result.reset_index()

# Render the timedeltas as text and drop the "0 days " prefix.
result['min_time'] = result['min_time'].apply(lambda delta: str(delta).replace('0 days ', ''))
result['max_time'] = result['max_time'].apply(lambda delta: str(delta).replace('0 days ', ''))

# Build one Spanish sentence per turn and translate it with translate_text.
result['literal_transcript'] = (
    'Desde el instante ' + result['min_time']
    + ' hasta ' + result['max_time']
    + ' ' + result['speaker']
    + ' dice: \"' + result['transcript'] + '\"'
).apply(translate_text)
result = result.sort_values(by='min_time')
# To export the result as a .txt file, join the rows (uncomment the next line):
# '\n\n'.join(result['literal_transcript'])

# Setting up Together API


In [None]:
# Set your API key (taken from the TOGETHER_API_KEY environment variable)
together.api_key = os.environ["TOGETHER_API_KEY"]
# List available models and descriptions
models = together.Models.list()
# Start the Llama-2 7B chat model used by the rest of the notebook
together.Models.start("togethercomputer/llama-2-7b-chat")

{'success': True,
 'value': 'abf10168c5fbd33e6c11168d6f168c4ffea6476c7ec025da91c157f25d64d8cb'}

In [None]:
class TogetherLLM(LLM):
    """LangChain ``LLM`` wrapper around the Together completion endpoint.

    Each call sends the prompt to ``together.Complete.create`` and returns the
    text of the first choice after removing repeated lines.
    """

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    # Default is None (not os.environ[...]) so that defining the class never
    # raises KeyError; validate_environment resolves the key at instantiation.
    together_api_key: Optional[str] = None
    """Together API key; read from ``TOGETHER_API_KEY`` when not passed."""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the API key from ``values`` or the environment.

        Args:
            values: Field values of the model being validated.

        Returns:
            ``values`` with ``together_api_key`` set to the resolved key.
        """
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def clean_duplicates(self, transcription: str) -> str:
        """Drop repeated lines from ``transcription``, keeping first occurrences.

        The text is stripped and split on newlines; every distinct line is kept
        once, in its original order (blank lines count as a line too, so only
        the first one survives), and the kept lines are joined with a blank
        line between them.

        Args:
            transcription: Text to de-duplicate.

        Returns:
            The de-duplicated text, lines separated by ``"\\n\\n"``.
        """
        lines = transcription.strip().split('\n')
        # dict.fromkeys keeps the first occurrence of each line, in order.
        return '\n\n'.join(dict.fromkeys(lines))

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint.

        Args:
            prompt: Prompt text sent to the model.
            stop: Optional stop sequences; when given, the completion is cut at
                the first occurrence of any of them.
            run_manager: Accepted for compatibility with LangChain's calling
                convention; not used here.
            **kwargs: Extra keyword arguments; ignored.

        Returns:
            The first choice's text with duplicate lines removed.
        """
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        if stop:
            # Previously `stop` was swallowed by **kwargs and never applied.
            text = enforce_stop_tokens(text, stop)
        return self.clean_duplicates(text)


# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- llama-2-7b-chat LLM
- BGE Embeddings (newest version)


## Setting up LangChain


## Load multiple and process documents

In [None]:
# Load the translated transcript text file
loader = TextLoader('./data/translated_transcriptions/worldcast_roberto_vaquero_transcription.txt')
documents = loader.load()

In [None]:
# Sanity check: how many documents were loaded
len(documents)

1

In [None]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1500, chunk_overlap=100)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# Number of chunks produced
len(texts)

92

## Load HF BGE Embeddings

In [None]:
# BGE embedding model, loaded through LangChain's HuggingFaceEmbeddings wrapper.
model_name = "BAAI/bge-base-en-v1.5"
# Normalized embeddings (set True to compute cosine similarity).
encode_kwargs = dict(normalize_embeddings=True)

model_norm = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=dict(device='cuda'),  # the model is placed on the GPU
    encode_kwargs=encode_kwargs,
)

## create the DB

 T4 GPU

__Why Chroma instead of FAISS?__

Answer:

In terms of performance, there is no direct benchmark comparison available between **Chroma** and **FAISS**. This is because **FAISS** is not regularly used as a stand-alone vector database, so it is difficult to compare it directly with **Chroma**.

**FAISS** is designed for efficient similarity search, which can be crucial for applications involving large-scale semantic search. However, for a production environment, it may need to be built into a custom container or larger system to support CRUD operations, high availability, horizontal scalability, concurrent access, etc. It is built around an Index object. This object encapsulates the set of database vectors and optionally preprocesses them to make the search efficient. There are many types of indexes, but the simplest version performs a brute force Euclidean (L2) distance search.

On the other hand, **Chroma** is designed to run on your machine and was built to handle modern AI workloads, making it suitable for embedding-intensive applications.

Therefore, the choice between **Chroma** and **FAISS** depends on your specific use case. If you're looking for a standalone vector database that's easy to set up and use for local development, **Chroma** may be a good choice. If you need a tool for efficient similarity search and dense vector clustering, and you are ready to build additional functionality around it, **FAISS** might be suitable.

In [None]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
# Use the BGE embedding model (model_norm) for the vector store
embedding = model_norm

# Build the Chroma vector store from the split transcript chunks
vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 2.98 s, sys: 138 ms, total: 3.11 s
Wall time: 2.95 s


## Make a retriever

In [None]:
# Retriever over the vector store; k=4 is presumably the number of chunks
# returned per query (confirm against the Chroma retriever docs)
retriever = vectordb.as_retriever(search_kwargs={"k": 4})

## Make a chain

In [None]:
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Wrap an instruction and a system prompt in the chat delimiters.

    Args:
        instruction: Instruction text placed after the system block.
        new_system_prompt: System prompt placed between ``B_SYS`` and
            ``E_SYS``; defaults to ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        ``B_INST``, the delimited system prompt, ``instruction`` and
        ``E_INST`` concatenated in that order.
    """
    return B_INST + B_SYS + new_system_prompt + E_SYS + instruction + E_INST

In [None]:
sys_prompt = DEFAULT_SYSTEM_PROMPT
# Template with the {context} and {question} placeholders filled in by the
# chain. Uses real newline escapes ("\n"); the previous "/n" was a typo that
# reached the model as literal "/n" text.
instruction = """CONTEXT:\n\n {context}\n

Question: {question}"""
# Preview the fully formatted prompt template
get_prompt(instruction, sys_prompt)

"[INST]<<SYS>>\nYou are an assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.\n\nBe concise and omit disclaimers.\n\nEmphasize the importance of accuracy and refrain from making guesses or assumptions.\n\nIf a question is incoherent, politely explain the issue without directly answering it.\n\nDo not add emojis to the response.\n\nAlways conclude your response with the following text at the end: 'Is there anything else I can assist you with?'\n\n<</SYS>>\n\nCONTEXT:/n/n {context}/n\n\nQuestion: {question}[/INST]"

In [None]:
# LLM used for question answering: the 7B chat model with temperature 0.0
# and up to 1024 generated tokens
llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

In [None]:
# Build the prompt string and wrap it in a PromptTemplate whose input
# variables are the {context} and {question} placeholders of `instruction`
prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [None]:
# -- NOTE: to add conversation memory, a 'history' field must be added to the prompt, e.g.
# HISTORY:/n/n {history}/n
# and the two commented-out lines below used instead of the last one:
#memory = ConversationBufferWindowMemory(k=1, memory_key="history", input_key="question")
#chain_type_kwargs = {"prompt": llama_prompt, "memory": memory}
# Current setup: custom prompt only, no memory
chain_type_kwargs = {"prompt": llama_prompt}

In [None]:
# Create the chain to answer questions: it combines the LLM, the retriever and
# the custom prompt, and is asked to return the source documents as well
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)

In [None]:
## Cite sources
def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    Every newline-separated line is wrapped on its own with ``textwrap.fill``
    and the wrapped lines are joined back together with newlines.

    Args:
        text: Text to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Return the chain's answer translated to Spanish and wrapped for display.

    Args:
        llm_response: Mapping returned by the QA chain; only its ``'result'``
            entry is read.

    Returns:
        The translated answer, wrapped by ``wrap_text_preserve_newlines``.
    """
    answer_es = translate_text(llm_response['result'], target_lang='es')
    return wrap_text_preserve_newlines(answer_es)

In [None]:
# Full example: translate the Spanish question to English, run the QA chain,
# then print the answer translated back to Spanish
query = "¿Cuál es la opinión del entrevistado sobre la juventud?"
translated_query = translate_text(query, target_lang='en')
llm_response = qa_chain(translated_query)
print(process_llm_response(llm_response))

El entrevistado destaca la importancia de ser útil, tener criterio propio, ser crítico y cuidar de uno mismo y
del propio cuerpo. Creen que estos son los aspectos fundamentales para una persona, y que son imprescindibles
para el desarrollo de los jóvenes. El entrevistado también menciona que tienen un enfoque realista y que sus
propuestas pueden ser difíciles de implementar en el corto o mediano plazo, pero son idealistas y vale la pena
esforzarse por lograrlas. No expresan explícitamente su opinión sobre la juventud, pero su énfasis en el
autocuidado y el desarrollo personal sugiere que valoran el bienestar y la agencia de los jóvenes.


In [None]:
# Stop the model that was started for the question-answering section
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True}

# Summary of transcription: map reduce technique

In [None]:
# Read the whole transcript to summarize into a single string
with open("/content/transcriptions.es.en.txt") as f:
    docs = f.read()

# NOTE(review): an earlier cell stops this model via together.Models.stop;
# it may need to be started again before running this section — confirm.
llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

# Map
# Read each template as one string: PromptTemplate expects a str, and the
# previous readlines(f) call raised a TypeError because the size hint must be
# an integer, not the file object.
with open("./main/prompts/map_template.txt", "r") as f:
    map_template = f.read()
map_prompt = PromptTemplate(template=map_template, input_variables=["docs"])
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
with open("./main/prompts/reduce_template.txt", "r") as f:
    reduce_template = f.read()
reduce_prompt = PromptTemplate(template=reduce_template, input_variables=["doc_summaries"])

# Chain that applies the reduce prompt to the collected summaries
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is the final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    verbose=True,
    # The maximum number of tokens to group documents into.
    token_max=1024
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Do not return the results of the map steps in the output
    return_intermediate_steps=False,
    verbose=True
)
# Split the transcript on blank lines ("\n\n") into overlapping chunks
# (chunk_size=2000, chunk_overlap=200, sizes measured with len).
# This rebinds `text_splitter`, which the retrieval section also assigns.
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = True,
)
# Wrap the chunks of the transcript string as documents for the chain
split_docs = text_splitter.create_documents([docs])



In [None]:
# Run the map-reduce summarization over the split transcript and print the result
text_summary = map_reduce_chain.run(split_docs)
print(text_summary)



[1m> Entering new MapReduceDocumentsChain chain...[0m

[1m> Finished chain.[0m
The main themes in the conversation between Roberto Vaquero and the interviewer include:



1. The importance of addressing material conditions in building a successful communist society.

2. The need for a practical approach to building a communist society, rather than relying solely on intellectual debates or "woke" movements.

3. Criticism of excessive focus on intellectual debates and "woke" movements.

4. The importance of doing things for the benefit of what you believe in and respecting oneself and others.

5. The need to move beyond just imagining or debating and take action.

6. Vaquero's experiences in the military and how they have shaped his political beliefs.

7. Frustration with the repetition of mistakes and lack of critical thinking within communist groups.

8. Vaquero's active presence on social media platforms, including YouTube and Twitch.

These themes highlight Vaquero's commitment

In [None]:
# Stop the model once the summary has been produced
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True}