# Castena - Chatbot for multilingual podcasts

In [1]:
!pip install langchain huggingface_hub tiktoken -q
!pip install chromadb -q
!pip install PyPDF2 pypdf sentence_transformers -q
!pip install --upgrade together -q
!pip install -U FlagEmbedding -q
!pip install googletrans==3.1.0a0 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.8/437.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# -- Import libraries
from   typing                      import Any, Dict, List, Mapping, Optional
from   pydantic                    import Extra, Field, root_validator
from   langchain.callbacks.manager import CallbackManagerForLLMRun
from   langchain.llms.base         import LLM
from   langchain.llms.utils        import enforce_stop_tokens
from   langchain.chains.llm        import LLMChain
from   langchain.utils             import get_from_dict_or_env
from   googletrans                 import Translator
from   langchain.vectorstores      import Chroma
from   langchain.text_splitter     import RecursiveCharacterTextSplitter, CharacterTextSplitter
from   langchain.chains            import RetrievalQA, ReduceDocumentsChain, MapReduceDocumentsChain
from   langchain.document_loaders  import TextLoader, DirectoryLoader
from   langchain.embeddings        import HuggingFaceEmbeddings
from   langchain.prompts           import PromptTemplate
from   langchain.schema            import prompt
from   langchain.chains.mapreduce  import MapReduceChain
from   langchain.chains.combine_documents.stuff import StuffDocumentsChain
import pandas                      as pd
import logging
import together
import textwrap
import os

# Setup API Key

In [3]:
os.environ["TOGETHER_API_KEY"] = "6101599d6e33e3bda336b8d007ca22e35a64c72cfd52c2d8197f663389fc50c5"

# Pre-process data

In [4]:
translator = Translator(service_urls=['translate.googleapis.com'])

def translate_text(text, target_lang='en'):
    translator = Translator()
    translated = translator.translate(str(text), dest=target_lang)
    return translated.text

In [None]:
# -- Chunk to translate spanish transcripts if necessary (DO NOT RUN IT)
transcription_df = pd.read_table('/content/transcriptions.txt', sep='|', header=None)
transcription_df.rename(columns={0: 'time', 1: 'speaker', 2: 'transcript'}, inplace=True)

transcription_df['time'] = pd.to_timedelta(transcription_df['time'])
transcription_df['speaker_change'] = transcription_df['speaker'] != transcription_df['speaker'].shift()

result = transcription_df.groupby(['speaker', transcription_df['speaker_change'].cumsum()]).agg({\
                                                                                                 'time': ['min', 'max'],
                                                                                                 'transcript': lambda x: '.'.join(x)
                                                                                                })
result.columns = result.columns.droplevel()
result.columns = ['min_time', 'max_time', 'transcript']
result.reset_index(inplace=True)
result['min_time'] = result['min_time'].apply(lambda x: str(x).replace('0 days ', ''))
result['max_time'] = result['max_time'].apply(lambda x: str(x).replace('0 days ', ''))
result['literal_transcript'] = 'Desde el instante ' + result['min_time'] + ' hasta ' + result['max_time'] + ' ' + result['speaker'] + ' dice: \"' + result['transcript'] + '\"'
result['literal_transcript'] = result['literal_transcript'].apply(translate_text)
result = result.sort_values('min_time')
'\n\n'.join(result['literal_transcript'])

# Setting up Together API


In [41]:
# set your API key
together.api_key = os.environ["TOGETHER_API_KEY"]
# list available models and descriptons
models = together.Models.list()
# set llama LLM
together.Models.start("togethercomputer/llama-2-7b-chat")

{'success': True,
 'value': '55d6730907e593ff7d4d01af9597066c91b0955b4cda48437ed860ea469c0b26'}

In [6]:
class TogetherLLM(LLM):
    """Together large language models."""

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key: str = os.environ["TOGETHER_API_KEY"]
    """Together API key"""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt: str,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint."""
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        return text


# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- llama-2-7b-chat LLM
- BGE Embeddings (newest version)


## Setting up LangChain


## Load multiple and process documents

In [8]:
# Load and process the text files
loader = TextLoader('/content/transcriptions.es.en.txt')
documents = loader.load()

In [9]:
len(documents)

1

In [10]:
# Splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7000, chunk_overlap=1000)
texts = text_splitter.split_documents(documents)

len(texts)

20

In [11]:
texts[1]

Document(page_content='From the minute 00:04:09.690000 to 00:04:46.690000 Roberto Vaquero says: "what can anyone who sees us imagine... And I believe that those years were thrown away. and I was putting myself at risk for things that in In reality, they weren\'t even my own interests... And I came out of there very renegade... And I don\'t defend either the left or the right... I think that\'s a dichotomy that is already out of date. And what there is is people dividing themselves so much. those who are more moderate on the left or right, as well as their radical versions that when things happen in the end they always end up joining the big guy... And we have broken with that... So some call us fascists, others call us communists, we They call for everything, everyone...But it doesn\'t matter to us...We want to build something different...and we do want to be a little bit of that fresh air that you talk about."\n\nFrom the minute 00:04:49.690000 to 00:06:06.690000 Interviewer says: "Be

## Load HF BGE Embeddings

In [12]:
model_name = "BAAI/bge-base-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)

Downloading (…)714e1/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)c46d0714e1/README.md:   0%|          | 0.00/89.0k [00:00<?, ?B/s]

Downloading (…)6d0714e1/config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)714e1/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)c46d0714e1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d0714e1/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## create the DB

 T4 GPU

In [13]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 1.33 s, sys: 256 ms, total: 1.59 s
Wall time: 2.3 s


## Make a retriever

In [14]:
retriever = vectordb.as_retriever(search_kwargs={"k": 6})

## Make a chain

In [15]:
## Default LLaMA-2 prompt style
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful, and honest assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.

Please provide responses in complete paragraphs.

Emphasize the importance of accuracy and refrain from making guesses or assumptions.

If a question is inappropriate or incoherent, politely explain the issue without directly answering it.

Do not add emojis to the response.

Always conclude your response with the following text: 'Is there anything else I can assist you with?'

Avoid promoting or endorsing any content that may be considered harmful or inappropriate.

Thank you for your cooperation in maintaining a respectful and informative environment.
"""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [16]:
sys_prompt = """\
You are a helpful, respectful, and honest assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.

Please provide responses in complete paragraphs.

Emphasize the importance of accuracy and refrain from making guesses or assumptions.

If a question is inappropriate or incoherent, politely explain the issue without directly answering it.

Do not add emojis to the response.

Always conclude your response with the following text: 'Is there anything else I can assist you with?'

Avoid promoting or endorsing any content that may be considered harmful or inappropriate.

Thank you for your cooperation in maintaining a respectful and informative environment.
"""

instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""
get_prompt(instruction, sys_prompt)

"[INST]<<SYS>>\nYou are a helpful, respectful, and honest assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.\n\nPlease provide responses in complete paragraphs.\n\nEmphasize the importance of accuracy and refrain from making guesses or assumptions.\n\nIf a question is inappropriate or incoherent, politely explain the issue without directly answering it.\n\nDo not add emojis to the response.\n\nAlways conclude your response with the following text: 'Is there anything else I can assist you with?'\n\nAvoid promoting or endorsing any content that may be considered harmful or inappropriate.\n\nThank you for your cooperation in maintaining a respectful and informative environment.\n\n<</SYS>>\n\nCONTEXT:/n/n {context}/n\n\nQuestion: {question}[/INST]"

In [40]:
llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

In [18]:
prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [19]:
chain_type_kwargs = {"prompt": llama_prompt}


In [20]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)

In [32]:
## Cite sources
def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
  response = wrap_text_preserve_newlines(llm_response['result'])
  return translate_text(response, target_lang='es')

In [None]:
# full example
query = "¿En qué momentos del vídeo se habla del pueblo kurdo?"
llm_response = qa_chain(translate_text(query, target_lang='en'))
print(process_llm_response(llm_response))

In [38]:
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True, 'wasAlreadyDisabled': True}

# Summary of transcription: map reduce technique

In [None]:
with open("/content/transcriptions.es.en.txt") as f:
    docs = f.read()

llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.1,
    max_tokens = 1024
)

# Map
map_template = """The following is a list of a conversation between two speakers
{docs}
Please summarise the conversation in detail
Summary:"""
map_prompt = PromptTemplate(template=map_template, input_variables=["docs"])
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
reduce_template = """The following is set of summaries:
{doc_summaries}
Take these and distill it into a final, consolidated summary of the main themes in detail.
Summary:"""
reduce_prompt = PromptTemplate(template=reduce_template, input_variables=["doc_summaries"])

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    verbose=True,
    token_max=1024
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
    verbose=True
)
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 8000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = True,
)
split_docs = text_splitter.create_documents([docs])

In [None]:
text_summary = map_reduce_chain.run(split_docs)
print(text_summary)



[1m> Entering new MapReduceDocumentsChain chain...[0m

[1m> Finished chain.[0m

The main themes in this conversation are:

1. The need for a comprehensive approach to immigration that addresses the root causes of the problem and ensures the safety and well-being of both immigrants and Spanish society.
2. The party's economic plan and its focus on popular sovereignty.
3. The importance of practical action rather than intellectual debates in politics.
4. Criticism of the "red geek" movement and the tendency to copy and adapt past revolutionary models without considering the current context.
5. The need for individuals to be prepared for the challenges ahead and to have the skills and abilities necessary to face them.

These themes are central to the conversation and highlight the issues and concerns of the speakers.


In [None]:
together.Models.stop("togethercomputer/llama-2-7b-chat")