# Castena - Chatbot for multilingual podcasts

* Document retrieval via langchain

In [1]:
!pip install langchain huggingface_hub tiktoken -q
!pip install chromadb -q
!pip install PyPDF2 pypdf sentence_transformers -q
!pip install -U together -q
!pip install -U FlagEmbedding -q
!pip install googletrans==3.1.0a0 -q
!pip install spacy
!python -m spacy download es_core_news_sm
!pip install -U sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.1/448.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip freeze > requirements.txt
!pip list --format=freeze > requirements.txt

In [2]:
# -- Import libraries
from   typing                      import Any, Dict, List, Mapping, Optional
from   pydantic                    import Extra, Field, root_validator
from   langchain.callbacks.manager import CallbackManagerForLLMRun
from   langchain.memory            import ConversationBufferWindowMemory
from   langchain.llms.base         import LLM
from   langchain.llms.utils        import enforce_stop_tokens
from   langchain.chains.llm        import LLMChain
from   langchain.utils             import get_from_dict_or_env
from   googletrans                 import Translator
from   langchain.vectorstores      import Chroma
from   langchain.text_splitter     import RecursiveCharacterTextSplitter, CharacterTextSplitter
from   langchain.chains            import RetrievalQA, ReduceDocumentsChain, MapReduceDocumentsChain
from   langchain.evaluation.qa     import QAEvalChain
from   langchain.document_loaders  import TextLoader, DirectoryLoader
from   langchain.embeddings        import HuggingFaceEmbeddings
from   langchain.prompts           import PromptTemplate
from   langchain.schema            import prompt
from   langchain.chains.mapreduce  import MapReduceChain
from   langchain.chains.combine_documents.stuff import StuffDocumentsChain
from   sentence_transformers       import SentenceTransformer, util
from   tqdm                        import tqdm
import pandas                      as pd
import collections
import logging
import together
import textwrap
import spacy
import torch
import json
import os
import re

tqdm.pandas()

# -- Constants
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

#with open("./main/prompts/default_system_prompt.txt", "r") as f:
#  DEFAULT_SYSTEM_PROMPT = f.readlines()

with open("/content/default_system_prompt.txt", "r") as f:
  DEFAULT_SYSTEM_PROMPT = f.read()

# Setup API Key

In [3]:
os.environ["TOGETHER_API_KEY"] = "6101599d6e33e3bda336b8d007ca22e35a64c72cfd52c2d8197f663389fc50c5"

# Pre-process data

In [4]:
# -- Define auxiliar functions to clean/process data
translator = Translator(service_urls=['translate.googleapis.com'])

# Carga el modelo de SpaCy para el idioma en el que está el texto de origen
nlp = spacy.load('es_core_news_sm')

def translate_text(text, target_lang='en'):
    # Traducir el texto sin los nombres propios
    translator = Translator()
    # Tokenizar el texto y encontrar nombres propios
    doc = nlp(text)
    named_entities = [ent.text for ent in doc if ent.pos_ == 'PROPN' and ent.dep_ in ['NNP', 'NN']]
    named_entities_list = []
    # Reemplazar los nombres propios con marcadores temporales
    for entity in named_entities:
        text = text.replace(entity, f'__{entity}__')
        named_entities_list.append(entity)

    translated_text = translator.translate(text, dest=target_lang).text
    final_translated_text = []

    i = 0

    for text in translated_text.split(' '):
      if '__' in text and len(named_entities_list):
        final_translated_text.append(named_entities_list[i])
        i+=1
      else:
        final_translated_text.append(text)
    return ' '.join(final_translated_text)

def capitalize_proper_nouns(text):
    # Cargar el modelo de spaCy para español
    nlp = spacy.load("es_core_news_sm")

    # Analizar el texto con spaCy
    doc = nlp(text)

    # Inicializar una lista para almacenar el texto modificado
    modified_text = []

    # Recorrer el texto y capitalizar la primera letra de cada nombre propio
    for token in doc:
        if token.pos_ != "PROPN":
            # Si es un nombre propio y ya está en mayúscula, conservarlo
            modified_text.append(token.text)
        else:
            # De lo contrario, capitalizar la primera letra
            modified_text.append(token.text.capitalize())

    # Unir las palabras nuevamente en un solo texto
    modified_text = " ".join(modified_text)
    # Correct punctuation signs
    modified_text = modified_text.replace(" .. ", "...")\
                                 .replace(" , ", ",")\
                                 .replace(" .... ", "...")\
                                 .replace(" ... ", "...")\
                                 .replace(",¿", " ¿")\
                                 .replace(",.", ",")\
                                 .replace("?.", "? ")

    return modified_text

In [5]:
# -- Chunk to translate spanish transcripts if necessary
transcription_df = pd.read_table('./worldcast_roberto_vaquero_transcription.txt', sep='|', header=None)
transcription_df.rename(columns={0: 'time', 1: 'speaker', 2: 'transcript'}, inplace=True)

transcription_df['time'] = pd.to_timedelta(transcription_df['time'])
transcription_df['speaker_change'] = transcription_df['speaker'] != transcription_df['speaker'].shift()

result = transcription_df.groupby(['speaker', transcription_df['speaker_change'].cumsum()]).agg({\
                                                                                                 'time': ['min', 'max'],
                                                                                                 'transcript': lambda x: '.'.join(x)
                                                                                                })
result.columns = result.columns.droplevel()
result.columns = ['min_time', 'max_time', 'transcript']
result.reset_index(inplace=True)
result['min_time'] = result['min_time'].apply(lambda x: str(x).replace('0 days ', ''))
result['max_time'] = result['max_time'].apply(lambda x: str(x).replace('0 days ', ''))

# -- Preprocess transcript
result['transcript'] = result['transcript'].apply(capitalize_proper_nouns)

result['literal_transcript'] = 'Desde el instante ' + result['min_time'] + ' hasta ' + result['max_time'] + ' ' + result['speaker'] + ' dice: \"' + result['transcript'] + '\"'
result['literal_transcript'] = result['literal_transcript'].progress_apply(translate_text)
result = result.sort_values('min_time')
# Sample output to save as .txt file (uncomment following line)
# '\n\n'.join(result['literal_transcript'])

100%|██████████| 84/84 [00:23<00:00,  3.59it/s]


In [6]:
translated_transcription = '\n\n'.join(result['literal_transcript'])
with open('translated_worldcast_roberto_vaquero_transcription.txt', 'w') as f:
  f.write(translated_transcription)

# Setting up Together API


In [7]:
# Set your API key
together.api_key = os.environ["TOGETHER_API_KEY"]
# List available models and descriptons
models = together.Models.list()
# Set llama2 7b LLM
together.Models.start("togethercomputer/llama-2-7b-chat")

{'success': True,
 'value': '0a4984001d4c19433f871710d71c135872cc24ec68b78c76057abafd14046c1d-9690c6c174bce15307eb3e19a832afdc741cca75b147708756f48e708f1f806e',
 'wasAlreadyEnabled': True}

In [8]:
class TogetherLLM(LLM):
    """Together large language models."""

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key: str = os.environ["TOGETHER_API_KEY"]
    """Together API key"""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set."""
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def clean_duplicates(self, transcription: str) -> str:
      lines = transcription.strip().split('\n')
      unique_lines = set()

      new_transcription = []

      for linea in lines:
          if linea not in unique_lines:
              new_transcription.append(linea)
              unique_lines.add(linea)

      # Create new transcription without duplicates
      new_transcription = '\n\n'.join(new_transcription)
      return new_transcription

    def _call(
        self,
        prompt: str,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint."""
        together.api_key = self.together_api_key
        print(prompt)
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        cleaned_text = self.clean_duplicates(text)
        return cleaned_text


# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- llama-2-7b-chat LLM
- BGE Embeddings (newest version)


## Setting up LangChain


## Load multiple and process documents

In [9]:
# Load and process the text files
loader = TextLoader('./translated_worldcast_roberto_vaquero_transcription.txt')
documents = loader.load()

In [10]:
len(documents)

1

In [11]:
# Splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

len(texts)

92

## Load HF BGE Embeddings

In [12]:
model_name = "BAAI/bge-base-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)

Downloading (…)db36e/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)88b99db36e/README.md:   0%|          | 0.00/90.2k [00:00<?, ?B/s]

Downloading (…)b99db36e/config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)db36e/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)88b99db36e/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)99db36e/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## create the DB

 T4 GPU

__Why Chroma instead of FAISS?__

Answer:

In terms of performance, there is no direct benchmark comparison available between **Chroma** and **FAISS**. This is because **FAISS** is not regularly used as a stand-alone vector database, so it is difficult to compare it directly with **Chroma**.

**FAISS** is designed for efficient similarity search, which can be crucial for applications involving large-scale semantic search. However, for a production environment, it may need to be built into a custom container or larger system to support CRUD operations, high availability, horizontal scalability, concurrent access, etc. It is built around an Index object. This object encapsulates the set of database vectors and optionally preprocesses them to make the search efficient. There are many types of indexes, but the simplest version performs a brute force Euclidean (L2) distance search.

On the other hand, **Chroma** is designed to run on your machine and was built to handle modern AI workloads, making it suitable for embedding-intensive applications.

Therefore, the choice between **Chroma** and **FAISS** depends on your specific use case. If you're looking for a standalone vector database that's easy to set up and use for local development, **Chroma** may be a good choice. If you need a tool for efficient similarity search and dense vector clustering and ready to build additional functionality around it, **FAISS** might be suitable.

In [13]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'
## Here is the nmew embeddings being used
embedding = model_norm

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 6.01 s, sys: 1.27 s, total: 7.28 s
Wall time: 15.2 s


## Make a retriever

In [14]:
retriever = vectordb.as_retriever(search_type="similarity_score_threshold",
                                  search_kwargs={"k": 5, "score_threshold": 0.5})

## Make a chain

In [15]:
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [16]:
sys_prompt = DEFAULT_SYSTEM_PROMPT
instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""
get_prompt(instruction, sys_prompt)

'[INST]<<SYS>>\nYou are an assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.\n\nBe concise and omit disclaimers or default messages.\n\nDo not give your personal opinion.\n\nAvoid making guesses or assumptions.\n\nIf you do not know the answer of the question, politely explain the issue.\n\nIf the question has nothing to do with the context, answer the question without mentioning anything about the context.\n\nDo not add emojis to the response.\n\n<</SYS>>\n\nCONTEXT:/n/n {context}/n\n\nQuestion: {question}[/INST]'

In [17]:
llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

In [18]:
prompt_template = get_prompt(instruction, sys_prompt)

llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [19]:
# -- NOTE: in case we want to add memory, 'history' field must be added to the prompt
# HISTORY:/n/n {history}/n
#memory = ConversationBufferWindowMemory(k=1, memory_key="history", input_key="question")
#chain_type_kwargs = {"prompt": llama_prompt, "memory": memory}
chain_type_kwargs = {"prompt": llama_prompt}

In [20]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)

In [21]:
## Cite sources
def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
  response = llm_response['result']
  return wrap_text_preserve_newlines(translate_text(response, target_lang='es'))

In [22]:
# Full example
query = "¿Cuál es la opinión de Roberto Vaquero sobre VOX?"
translated_query = translate_text(query, target_lang='en')
llm_response = qa_chain(translated_query)
print(process_llm_response(llm_response))

[INST]<<SYS>>
You are an assistant. Your mission is to provide accurate answers to questions regarding the transcription of a YouTube interview.

Be concise and omit disclaimers or default messages.

Do not give your personal opinion.

Avoid making guesses or assumptions.

If you do not know the answer of the question, politely explain the issue.

If the question has nothing to do with the context, answer the question without mentioning anything about the context.

Do not add emojis to the response.

<</SYS>>

CONTEXT:/n/n From the moment 00:38:39.120000 to 00:41:56.490000 Roberto Vaquero says: "And I believe that it is something that unites many people...And yet, look, they criticize me a lot, for example, for the forms, right? I'm a very guy...Well, for example, feminist people would say about toxic masculinity, about a Big Guy, that if he has to say things, he says them, with strong, serious gestures... I mean, I'm a person who doesn't fit in a little with how they do politics...Peo

In [None]:
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True}

## Evaluation on custom dataset

In [23]:
with open('./worldcast_roberto_vaquero_eval_qa.json') as f:
    eval_dataset = json.load(f)

In [24]:
queries = [list(value.keys())[0] for value in eval_dataset]
outputs = [list(value.values())[0] for value in eval_dataset]

In [None]:
eval_gen_outputs = []

for query in tqdm(queries):
  translated_query = translate_text(query, target_lang='en')
  llm_response = qa_chain(translated_query)
  llm_response_translated = process_llm_response(llm_response)
  eval_gen_outputs.append(llm_response_translated)
print("Finished!")

In [26]:
eval_gen_outputs_formatted = [{'query': translate_text(query), 'answer': translate_text(output), 'result': translate_text(answer)} for query, answer, output in tqdm(zip(queries, eval_gen_outputs, outputs))]
outputs_formatted          = [{'query': translate_text(query), 'answer': translate_text(answer)} for query, answer in tqdm(zip(queries, outputs))]

28it [00:21,  1.28it/s]
28it [00:09,  2.99it/s]


In [None]:
eval_chain = QAEvalChain.from_llm(llm)
graded_outputs = eval_chain.evaluate(outputs_formatted, eval_gen_outputs_formatted)

In [28]:
queries[1]

'¿A qué se refiere Roberto Vaquero en el instante 00:00:25 al 00:01:06 con "se está fomentando la debilidad"'

In [29]:
outputs[1]

'Según Roberto Vaquero, "se está fomentando la debilidad" hace referencia a que hay gente que siente vergüenza por ser español, blanco o heterosexual; lo cual lo considera ridículo.'

In [30]:
print(eval_gen_outputs[1])

Roberto Vaquero se refiere a la idea de que la situación actual en Francia y otras partes del mundo está
siendo alimentada por una debilidad o vulnerabilidad de la sociedad. Cree que esta debilidad está permitiendo
el surgimiento de ideologías peligrosas o dañinas, y que conducirá a tiempos difíciles en el futuro.


In [31]:
# -- Update: 2023 - 10 - 15
correct_incorrect_outputs = [re.findall("(CORRECT|INCORRECT)", result['results'])[0] for result in graded_outputs]
counter = collections.Counter(correct_incorrect_outputs)
dict(counter)

{'CORRECT': 20, 'INCORRECT': 8}

In [None]:
# -- Update: 2023 - 10 - 10
correct_incorrect_outputs = [re.findall("(CORRECT|INCORRECT)", result['results'])[0] for result in graded_outputs]
counter = collections.Counter(correct_incorrect_outputs)
dict(counter)

{'INCORRECT': 10, 'CORRECT': 18}

In [None]:
correct_incorrect_outputs = [re.findall("(CORRECT|INCORRECT)", result['results'])[0] for result in graded_outputs]
counter = collections.Counter(correct_incorrect_outputs)
dict(counter)

{'CORRECT': 22, 'INCORRECT': 6}

### Sentence Similarity

In [32]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

similarity_scores_list = []
for original_output, gen_output in tqdm(zip(outputs, eval_gen_outputs)):
  query_embedding = model.encode(original_output)
  passage_embedding = model.encode([gen_output])
  similarity_scores_list.append(float(util.dot_score(query_embedding, passage_embedding)))

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

28it [00:02, 10.33it/s]


In [33]:
# -- Update: 2023 - 10 - 15
torch.mean(torch.tensor(similarity_scores_list)), torch.median(torch.tensor(similarity_scores_list))

(tensor(0.6977), tensor(0.7144))

In [None]:
# -- Update: 2023 - 10 - 10
torch.mean(torch.tensor(similarity_scores_list)), torch.median(torch.tensor(similarity_scores_list))

(tensor(0.7225), tensor(0.7406))

### Using cosine similarity

In [34]:
cosine_similarity_list = []
for original_output, predicted in zip(outputs, eval_gen_outputs):
  original_output_emb = torch.tensor(model_norm.embed_documents([translate_text(original_output)]))
  predicted_emb       = torch.tensor(model_norm.embed_documents([translate_text(predicted)]))
  cosine_similarity   = torch.nn.functional.cosine_similarity(original_output_emb, predicted_emb, dim=1)
  cosine_similarity_list.append(cosine_similarity)

In [35]:
# -- Update: 2023 - 10 - 15
torch.mean(torch.tensor(cosine_similarity_list)), torch.median(torch.tensor(cosine_similarity_list))

(tensor(0.7806), tensor(0.8041))

In [None]:
# -- Update: 2023 - 10 - 10
torch.mean(torch.tensor(cosine_similarity_list)), torch.median(torch.tensor(cosine_similarity_list))

(tensor(0.7943), tensor(0.8198))

In [None]:
torch.mean(torch.tensor(cosine_similarity_list)), torch.median(torch.tensor(cosine_similarity_list))

(tensor(0.7794), tensor(0.8230))

# Summary of transcription: map reduce technique

In [None]:
with open("/content/translated_worldcast_roberto_vaquero_transcription.txt") as f:
    docs = f.read()

llm = TogetherLLM(
    model= "togethercomputer/llama-2-7b-chat",
    temperature = 0.0,
    max_tokens = 1024
)

# Map
with open("/content/map_template.txt", "r") as f:
  map_template = f.read()
map_prompt = PromptTemplate(template=map_template, input_variables=["docs"])
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce
with open("/content/reduce_template.txt", "r") as f:
  reduce_template = f.read()
reduce_prompt = PromptTemplate(template=reduce_template, input_variables=["doc_summaries"])

# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    verbose=True,
    token_max=1024
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
    verbose=True
)
text_splitter = CharacterTextSplitter(
    separator = "\n\n",
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = True,
)
split_docs = text_splitter.create_documents([docs])



In [None]:
text_summary = map_reduce_chain.run(split_docs)
text_summary_translated = translate_text(text_summary, 'es')
print(text_summary_translated.replace(". ", ".\n"))



[1m> Entering new MapReduceDocumentsChain chain...[0m


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5284 > 1024). Running this sequence through the model will result in indexing errors



[1m> Finished chain.[0m
Roberto Vaquero es un exmilitar y figura política que comparte su reflexión sobre la situación política y social actual en España y el mundo.
Expresa preocupación por la erosión de los valores tradicionales y la promoción de la ideología de género en la educación, la inmigración y otras áreas.
Vaquero cree que es importante defender el país y la cultura, pero también escuchar y considerar diferentes perspectivas.
Habla de sus experiencias personales, incluido su tiempo en prisión, y cómo esto ha dado forma a sus puntos de vista sobre la política y la vida.
Vaquero también habla del concepto de "corrección política" y su impacto en la sociedad, y de cómo cree que debe haber un mensaje y una alternativa a esta forma de pensar.
Vaquero enfatiza la importancia del desarrollo personal, el pensamiento crítico y valores como el honor y la palabra.
Destaca la necesidad de procesos más inclusivos y participativos para crear una nueva constitución en España que realmen

In [36]:
together.Models.stop("togethercomputer/llama-2-7b-chat")

{'success': True}