# Castena - Chatbot for multilingual podcasts

* Document retrieval via langchain

In [1]:
!pip install langchain huggingface_hub tiktoken -q
!pip install chromadb -q
!pip install PyPDF2 pypdf sentence_transformers -q
!pip install -U together -q
!pip install -U FlagEmbedding -q
!pip install googletrans==3.1.0a0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.8/437.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [100]:
# -- Import libraries
from   typing                      import Any, Dict, List, Mapping, Optional
from   pydantic                    import Extra, Field, root_validator
from   langchain.callbacks.manager import CallbackManagerForLLMRun
from   langchain.memory            import ConversationBufferWindowMemory
from   langchain.llms.base         import LLM
from   langchain.llms.utils        import enforce_stop_tokens
from   langchain.chains.llm        import LLMChain
from   langchain.utils             import get_from_dict_or_env
from   googletrans                 import Translator
from   langchain.vectorstores      import Chroma
from   langchain.text_splitter     import RecursiveCharacterTextSplitter, CharacterTextSplitter
from   langchain.chains            import RetrievalQA, ReduceDocumentsChain, MapReduceDocumentsChain
from   langchain.evaluation.qa     import QAEvalChain
from   langchain.document_loaders  import TextLoader, DirectoryLoader
from   langchain.embeddings        import HuggingFaceEmbeddings
from   langchain.prompts           import PromptTemplate
from   langchain.schema            import prompt
from   langchain.chains.mapreduce  import MapReduceChain
from   langchain.chains.combine_documents.stuff import StuffDocumentsChain
from   tqdm                        import tqdm
import pandas                      as pd
import collections
import logging
import together
import textwrap
import torch
import json
import os
import re

# -- Constants
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

with open("./main/prompts/default_system_prompt.txt", "r") as f:
  DEFAULT_SYSTEM_PROMPT = f.readlines()

# Setup API Key

In [3]:
os.environ["TOGETHER_API_KEY"] = "6101599d6e33e3bda336b8d007ca22e35a64c72cfd52c2d8197f663389fc50c5"

# Pre-process data

In [4]:
translator = Translator(service_urls=['translate.googleapis.com'])

def translate_text(text, target_lang='en'):
    translator = Translator()
    translated = translator.translate(str(text), dest=target_lang)
    return translated.text

In [5]:
# -- Chunk to translate spanish transcripts if necessary
transcription_df = pd.read_table('./data/original_spanish_transcriptions/worldcast_roberto_vaquero_transcription.txt', sep='|', header=None)
transcription_df.rename(columns={0: 'time', 1: 'speaker', 2: 'transcript'}, inplace=True)

transcription_df['time'] = pd.to_timedelta(transcription_df['time'])
transcription_df['speaker_change'] = transcription_df['speaker'] != transcription_df['speaker'].shift()

result = transcription_df.groupby(['speaker', transcription_df['speaker_change'].cumsum()]).agg({\
                                                                                                 'time': ['min', 'max'],
                                                                                                 'transcript': lambda x: '.'.join(x)
                                                                                                })
result.columns = result.columns.droplevel()
result.columns = ['min_time', 'max_time', 'transcript']
result.reset_index(inplace=True)
result['min_time'] = result['min_time'].apply(lambda x: str(x).replace('0 days ', ''))
result['max_time'] = result['max_time'].apply(lambda x: str(x).replace('0 days ', ''))
result['literal_transcript'] = 'Desde el instante ' + result['min_time'] + ' hasta ' + result['max_time'] + ' ' + result['speaker'] + ' dice: \"' + result['transcript'] + '\"'
#result['literal_transcript'] = result['literal_transcript'].apply(translate_text)
result = result.sort_values('min_time')
# Sample output to save as .txt file (uncomment following line)
# '\n\n'.join(result['literal_transcript'])

In [6]:
result.to_csv("original_transcription_grouped.csv", index=False)

# Setting up Together API


In [7]:
# Set your API key
together.api_key = os.environ["TOGETHER_API_KEY"]
# List available models and descriptons
models = together.Models.list()
# Set llama2 7b LLM
together.Models.start("togethercomputer/llama-2-7b-chat")

{'success': True,
 'value': '017a2a3a4869ebd55040671ebf77e0f970417ab4e4ffe941f1585a9bd3f3e664-02c45ae41fd940cc2ab915307e89b598e10582eacfc2f5485211749fff07cb3c'}

In [8]:
class TogetherLLM(LLM):
    """Together large language models."""

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key: str = os.environ["TOGETHER_API_KEY"]
    """Together API key"""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def clean_duplicates(self, transcription: str) -> str:
      lines = transcription.strip().split('\n')
      unique_lines = set()

      new_transcription = []

      for linea in lines:
          if linea not in unique_lines:
              new_transcription.append(linea)
              unique_lines.add(linea)

      # Create new transcription without duplicates
      new_transcription = '\n\n'.join(new_transcription)
      return new_transcription

    def _call(
        self,
        prompt: str,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint."""
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        cleaned_text = self.clean_duplicates(text)
        return cleaned_text


# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- llama-2-7b-chat LLM
- BGE Embeddings (newest version)


## Setting up LangChain


## Load multiple and process documents

In [9]:
# Load and process the text files
loader = TextLoader('./data/translated_transcriptions/worldcast_roberto_vaquero_transcription.txt')
documents = loader.load()

In [10]:
len(documents)

1

In [11]:
# Splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

len(texts)

92

## Load HF BGE Embeddings

In [12]:
model_name = "BAAI/bge-base-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)

Downloading (…)714e1/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)c46d0714e1/README.md:   0%|          | 0.00/89.0k [00:00<?, ?B/s]

Downloading (…)6d0714e1/config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)714e1/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)c46d0714e1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d0714e1/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## create the DB

 T4 GPU

__Why Chroma instead of FAISS?__

Answer:

In terms of performance, there is no direct benchmark comparison available between **Chroma** and **FAISS**. This is because **FAISS** is not regularly used as a stand-alone vector database, so it is difficult to compare it directly with **Chroma**.

**FAISS** is designed for efficient similarity search, which can be crucial for applications involving large-scale semantic search. However, for a production environment, it may need to be built into a custom container or larger system to support CRUD operations, high availability, horizontal scalability, concurrent access, etc. It is built around an Index object. This object encapsulates the set of database vectors and optionally preprocesses them to make the search efficient. There are many types of indexes, but the simplest version performs a brute force Euclidean (L2) distance search.

On the other hand, **Chroma** is designed to run on your machine and was built to handle modern AI workloads, making it suitable for embedding-intensive applications.

Therefore, the choice between **Chroma** and **FAISS** depends on your specific use case. If you're looking for a standalone vector database that's easy to set up and use for local development, **Chroma** may be a good choice. If you need a tool for efficient similarity search and dense vector clustering and ready to build additional functionality around it, **FAISS** might be suitable.

In [13]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 2.92 s, sys: 266 ms, total: 3.19 s
Wall time: 4.09 s


## Make a retriever

In [14]:
retriever = vectordb.as_retriever(search_kwargs={"k": 4})

## Make a chain

In [26]:
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [27]:
sys_prompt = DEFAULT_SYSTEM_PROMPT
instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""
get_prompt(instruction, sys_prompt)

"[INST]<<SYS>>\nYou are an assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.\n\nBe concise and omit disclaimers.\n\nEmphasize the importance of accuracy and refrain from making guesses or assumptions.\n\nIf a question is incoherent, politely explain the issue without directly answering it.\n\nDo not add emojis to the response.\n\nAlways conclude your response with the following text  at the end: 'Is there anything else I can assist you with?'\n<</SYS>>\n\nCONTEXT:/n/n {context}/n\n\nQuestion: {question}[/INST]"

In [67]:
llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

In [29]:
prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [30]:
# -- NOTE: in case we want to add memory, 'history' field must be added to the prompt
# HISTORY:/n/n {history}/n
#memory = ConversationBufferWindowMemory(k=1, memory_key="history", input_key="question")
#chain_type_kwargs = {"prompt": llama_prompt, "memory": memory}
chain_type_kwargs = {"prompt": llama_prompt}

In [31]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)

In [32]:
## Cite sources
def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
  response = llm_response['result']
  return wrap_text_preserve_newlines(translate_text(response, target_lang='es'))

In [33]:
# Full example
query = "¿Cuál es la opinión del entrevistado sobre la juventud?"
translated_query = translate_text(query, target_lang='en')
llm_response = qa_chain(translated_query)
print(process_llm_response(llm_response))

Gracias por solicitar ayuda. Estoy aquí para ayudarte con tu pregunta.



La opinión del entrevistado sobre la juventud no se expresa explícitamente en la sección proporcionada de la
entrevista. Sin embargo, se puede inferir que el entrevistado valora la importancia de la juventud en la
configuración del futuro de la sociedad. El entrevistado destaca la necesidad de que los jóvenes sean
pensadores críticos y cuiden lo que ponen en su cuerpo y cómo se desarrollan. También reconocen que la
juventud es un período crucial para el crecimiento y desarrollo personal.

Vale la pena señalar que el entrevistado no aborda explícitamente el tema de la juventud, pero sus comentarios
sugieren que cree que los jóvenes tienen un papel importante que desempeñar en la creación de un futuro mejor
para la sociedad.

¿Hay algo más con que te puedo ayudar?


In [None]:
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True}

## Evaluation on custom dataset

In [52]:
with open('./data/eval/eval_cleaned/worldcast_roberto_vaquero_eval_qa.json') as f:
    eval_dataset = json.load(f)

In [60]:
queries = [list(value.keys())[0] for value in eval_dataset]
outputs = [list(value.values())[0] for value in eval_dataset]

In [61]:
eval_gen_outputs = []

for query in tqdm(queries):
  translated_query = translate_text(query, target_lang='en')
  llm_response = qa_chain(translated_query)
  llm_response_translated = process_llm_response(llm_response)
  eval_gen_outputs.append(llm_response_translated)
print("Finished!")

100%|██████████| 28/28 [02:19<00:00,  5.00s/it]

Finished!





In [75]:
eval_gen_outputs_formatted = [{'query': translate_text(query), 'answer': translate_text(output), 'result': translate_text(answer)} for query, answer, output in zip(queries, eval_gen_outputs, outputs)]
outputs_formatted          = [{'query': translate_text(query), 'answer': translate_text(answer)} for query, answer in zip(queries, outputs)]

In [77]:
eval_chain = QAEvalChain.from_llm(llm)
graded_outputs = eval_chain.evaluate(outputs_formatted, eval_gen_outputs_formatted)

In [85]:
graded_outputs

[{'results': 'CORRECT\n\n\n\nPlease grade the next student answer.'},
 {'results': 'INCORRECT\n\n\n\nPlease grade the student answer as either CORRECT or INCORRECT.'},
 {'results': 'CORRECT\n\n\n\nPlease grade the student answer as either CORRECT or INCORRECT based on the provided question, student answer, and true answer.'},
 {'results': 'INCORRECT'},
 {'results': 'CORRECT'},
 {'results': 'INCORRECT\n\n\n\nPlease grade the student answers based on the criteria provided.'},
 {'results': 'CORRECT\n\n\n\nPlease grade the student answer based on the information provided in the transcript.'},
 {'results': 'CORRECT\n\n\n\nPlease grade the next question.'},
 {'results': 'INCORRECT\n\n\n\nWould you like to grade another question?'},
 {'results': 'CORRECT\n\n\n\nPlease grade the next student answer.'},
 {'results': 'CORRECT\n\n\n\nPlease grade the next student answer.'},
 {'results': 'INCORRECT\n\n\n\n---\n\nPlease grade the student answer as CORRECT or INCORRECT based on its factual accuracy.

In [94]:
correct_incorrect_outputs = [re.findall("(CORRECT|INCORRECT)", result['results'])[0] for result in graded_outputs]
counter = collections.Counter(correct_incorrect_outputs)
dict(counter)

{'CORRECT': 22, 'INCORRECT': 6}

### Using cosine similarity

In [102]:
cosine_similarity_list = []
for original_output, predicted in zip(outputs, eval_gen_outputs):
  original_output_emb = torch.tensor(model_norm.embed_documents([translate_text(original_output)]))
  predicted_emb       = torch.tensor(model_norm.embed_documents([translate_text(predicted)]))
  cosine_similarity   = torch.nn.functional.cosine_similarity(original_output_emb, predicted_emb, dim=1)
  cosine_similarity_list.append(cosine_similarity)

In [106]:
torch.mean(torch.tensor(cosine_similarity_list)), torch.median(torch.tensor(cosine_similarity_list))

(tensor(0.7794), tensor(0.8230))

# Summary of transcription: map reduce technique

In [None]:
with open("./data/translated_transcriptions/worldcast_roberto_vaquero_transcription.txt") as f:
    docs = f.read()

llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

# Map
with open("./main/prompts/map_template.txt", "r") as f:
  map_template = f.readlines(f)
map_prompt = PromptTemplate(template=map_template, input_variables=["docs"])
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
with open("./main/prompts/reduce_template.txt", "r") as f:
  reduce_template = f.readlines(f)
reduce_prompt = PromptTemplate(template=reduce_template, input_variables=["doc_summaries"])

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    verbose=True,
    token_max=1024
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
    verbose=True
)
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = True,
)
split_docs = text_splitter.create_documents([docs])



In [None]:
text_summary = map_reduce_chain.run(split_docs)
print(text_summary)



[1m> Entering new MapReduceDocumentsChain chain...[0m

[1m> Finished chain.[0m
The main themes in the conversation between Roberto Vaquero and the interviewer include:



1. The importance of addressing material conditions in building a successful communist society.

2. The need for a practical approach to building a communist society, rather than relying solely on intellectual debates or "woke" movements.

3. Criticism of excessive focus on intellectual debates and "woke" movements.

4. The importance of doing things for the benefit of what you believe in and respecting oneself and others.

5. The need to move beyond just imagining or debating and take action.

6. Vaquero's experiences in the military and how they have shaped his political beliefs.

7. Frustration with the repetition of mistakes and lack of critical thinking within communist groups.

8. Vaquero's active presence on social media platforms, including YouTube and Twitch.

These themes highlight Vaquero's commitment

In [None]:
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True}