In [11]:
from dotenv import load_dotenv
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ragatouille import RAGPretrainedModel

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key needed by the default
# embedding backend used further down — confirm.
load_dotenv()

In [3]:
import csv
from typing import Dict, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class CSVMetaLoader(BaseLoader):
    """Load a CSV file into a list of documents, one document per row.

    Every row is rendered as ``column: value`` lines (one line per column) and
    stored in the document's ``page_content``.

    Metadata is opt-in:

    * Columns named in ``metadata_columns_dtypes`` are copied into the
      document's metadata. Values declared as ``"int"`` or ``"float"`` are
      converted to numbers; any other declared type keeps the raw CSV string.
    * If ``source_column`` is given, the value of that column is stored under
      the ``"source"`` metadata key.

    With neither option set, documents carry empty metadata.

    Output Example:
        .. code-block:: txt

            column1: value1
            column2: value2
            column3: value3
    """

    # Converters for the dtype names understood in `metadata_columns_dtypes`.
    _NUMERIC_CASTS = {"int": int, "float": float}

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        metadata_columns_dtypes: Optional[Dict[str, str]] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """

        Args:
            file_path: The path to the CSV file.
            source_column: The name of the column in the CSV file to use as the
              source. When set, its value is stored as ``metadata["source"]``.
              Optional. Defaults to None (no ``"source"`` key is added).
            metadata_columns_dtypes: Name of column as keys and data type as values.
              Only ``"int"`` and ``"float"`` trigger a conversion.
              Optional. Defaults to None.
            csv_args: A dictionary of arguments to pass to the csv.DictReader.
              Optional. Defaults to None.
            encoding: The encoding of the CSV file. Optional. Defaults to None.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.metadata_columns_dtypes = metadata_columns_dtypes
        self.encoding = encoding
        self.csv_args = csv_args or {}

    @staticmethod
    def _clean(value: object) -> object:
        """Return a header or cell ready for ``page_content``, whitespace-stripped."""
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, list):
            # Surplus cells of an over-long row arrive as a list (DictReader restkey).
            return ",".join(str(item).strip() for item in value)
        # None (missing cell on a short row / default restkey) or an already
        # converted number (csv.QUOTE_NONNUMERIC): nothing to strip.
        return value

    def _cast(self, column: str, raw: object, dtype: str, line_num: int) -> object:
        """Convert one metadata cell to its declared dtype.

        Raises:
            ValueError: If the cell cannot be parsed as the declared numeric type.
        """
        caster = self._NUMERIC_CASTS.get(dtype)
        if caster is None:
            # Unknown / non-numeric dtype names keep the raw CSV value.
            return raw
        try:
            # Explicit constructors instead of eval(): cell text comes from the
            # file and must never be executed as Python code.
            return caster(raw)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Cannot convert value {raw!r} in column '{column}' "
                f"(line {line_num}) to {dtype}."
            ) from exc

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One ``Document`` per CSV row, in file order.

        Raises:
            ValueError: If ``source_column`` is set but missing from the CSV, or
                if a metadata cell cannot be converted to its declared
                ``"int"``/``"float"`` type.
        """
        dtypes = self.metadata_columns_dtypes or {}

        docs = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
            for row in csv_reader:
                content = "\n".join(
                    f"{self._clean(k)}: {self._clean(v)}" for k, v in row.items()
                )

                metadata = {}
                if self.source_column is not None:
                    try:
                        metadata["source"] = row[self.source_column]
                    except KeyError as exc:
                        raise ValueError(
                            f"Source column '{self.source_column}' not found in CSV file."
                        ) from exc

                # Walk the row (not the dtype mapping) so metadata keys keep CSV
                # column order and undeclared columns are simply skipped.
                for k, v in row.items():
                    if k in dtypes:
                        metadata[k] = self._cast(k, v, dtypes[k], csv_reader.line_num)

                docs.append(Document(page_content=content, metadata=metadata))

        return docs

In [4]:
# Place your CSV in the data folder and load it here.
# `vote_average` is additionally kept as float metadata on every document.
loader = CSVMetaLoader(
    "../data/movies_title_overview_vote.csv",
    metadata_columns_dtypes={"vote_average": "float"},
)

In [25]:
# Free-text queries for the retrieval experiments; example_query_2 is the one
# sent to both the dense and the ColBERT search cells in this notebook.
example_query_1, example_query_2 = (
    "Psychological thriller based on a real life experience",
    "I want to watch a movie about outer space exploration",
)

## Dense Embedding

In [13]:
# Build the dense vector index straight from the CSV loader.
# NOTE(review): with chunk_size=5000 and no overlap each row presumably stays a
# single chunk — confirm against the longest overview in the dataset.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=0,
)
index_creator = VectorstoreIndexCreator(text_splitter=text_splitter)
docsearch = index_creator.from_loaders([loader])

  warn_deprecated(


In [26]:
# Ten nearest documents to the space-exploration query in the dense index.
docsearch.vectorstore.similarity_search(example_query_2, k=10)

[Document(page_content="title: Galaxy Quest\noverview: The stars of a 1970s sci-fi show - now scraping a living through re-runs and sci-fi conventions - are beamed aboard an alien spacecraft. Believing the cast's heroic on-screen dramas are historical documents of real-life adventures, the band of aliens turn to the ailing celebrities for help in their quest to overcome the oppressive regime in their solar system.\nvote_average: 6.9", metadata={'vote_average': 6.9}),
 Document(page_content='title: Interstellar\noverview: Interstellar chronicles the adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage.\nvote_average: 8.1', metadata={'vote_average': 8.1}),
 Document(page_content="title: 2001: A Space Odyssey\noverview: Humanity finds a mysterious object buried beneath the lunar surface and sets off to find its origins with the help of HAL 9000, the 

## ColBERT

In [None]:
# Load the pretrained "colbert-ir/colbertv2.0" checkpoint through RAGatouille.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

In [5]:
# Materialise every CSV row as a Document; the ColBERT indexing cell reads this list.
# NOTE(review): `test` is an uninformative name for the loaded documents —
# consider renaming it together with its use in the indexing cell.
test = loader.load()

In [7]:
# Index only the text of each row; document metadata is not passed to ColBERT here.
RAG.index(
    collection=[row_doc.page_content for row_doc in test],
    index_name="movies",
    max_document_length=5000,
    split_documents=True,
)



[Jan 16, 11:30:04] #> Creating directory .ragatouille/colbert/indexes/movies 


#> Starting...
[Jan 16, 11:30:07] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Jan 16, 11:30:07] [0] 		 #> Encoding 4803 passages..


100%|██████████| 50/50 [04:06<00:00,  4.93s/it]
100%|██████████| 26/26 [02:09<00:00,  4.99s/it]


[Jan 16, 11:36:24] [0] 		 avg_doclen_est = 70.14949035644531 	 len(local_sample) = 4,803
[Jan 16, 11:36:25] [0] 		 Creating 8,192 partitions.
[Jan 16, 11:36:25] [0] 		 *Estimated* 336,928 embeddings.
[Jan 16, 11:36:25] [0] 		 #> Saving the indexing plan to .ragatouille/colbert/indexes/movies/plan.json ..
Clustering 320082 points in 128D to 8192 clusters, redo 1 times, 20 iterations
  Preprocessing in 0.01 s
  Iteration 19 (36.42 s, search 36.21 s): objective=93451.4 imbalance=1.369 nsplit=0       
[0.043, 0.041, 0.044, 0.039, 0.039, 0.042, 0.04, 0.039, 0.036, 0.039, 0.038, 0.041, 0.038, 0.041, 0.041, 0.04, 0.036, 0.037, 0.038, 0.038, 0.039, 0.039, 0.039, 0.039, 0.04, 0.038, 0.043, 0.04, 0.041, 0.042, 0.042, 0.041, 0.044, 0.039, 0.039, 0.037, 0.041, 0.04, 0.04, 0.047, 0.041, 0.043, 0.041, 0.039, 0.04, 0.038, 0.038, 0.044, 0.041, 0.037, 0.039, 0.039, 0.04, 0.041, 0.039, 0.04, 0.044, 0.039, 0.041, 0.039, 0.036, 0.041, 0.041, 0.043, 0.044, 0.042, 0.043, 0.04, 0.035, 0.039, 0.044, 0.041, 0.

0it [00:00, ?it/s]

[Jan 16, 11:37:02] [0] 		 #> Encoding 4803 passages..



  0%|          | 0/50 [00:00<?, ?it/s][A
  2%|▏         | 1/50 [00:05<04:08,  5.08s/it][A
  4%|▍         | 2/50 [00:10<04:05,  5.11s/it][A
  6%|▌         | 3/50 [00:15<04:00,  5.11s/it][A
  8%|▊         | 4/50 [00:20<03:54,  5.10s/it][A
 10%|█         | 5/50 [00:25<03:50,  5.11s/it][A
 12%|█▏        | 6/50 [00:30<03:45,  5.13s/it][A
 14%|█▍        | 7/50 [00:35<03:42,  5.18s/it][A
 16%|█▌        | 8/50 [00:40<03:35,  5.12s/it][A
 18%|█▊        | 9/50 [00:46<03:28,  5.09s/it][A
 20%|██        | 10/50 [00:51<03:22,  5.07s/it][A
 22%|██▏       | 11/50 [00:56<03:16,  5.04s/it][A
 24%|██▍       | 12/50 [01:00<03:10,  5.02s/it][A
 26%|██▌       | 13/50 [01:06<03:08,  5.11s/it][A
 28%|██▊       | 14/50 [01:11<03:03,  5.09s/it][A
 30%|███       | 15/50 [01:16<03:00,  5.17s/it][A
 32%|███▏      | 16/50 [01:22<02:57,  5.22s/it][A
 34%|███▍      | 17/50 [01:27<02:50,  5.16s/it][A
 36%|███▌      | 18/50 [01:32<02:43,  5.11s/it][A
 38%|███▊      | 19/50 [01:37<02:37,  5.09s/it]

[Jan 16, 11:43:33] #> Optimizing IVF to store map from centroids to list of pids..
[Jan 16, 11:43:33] #> Building the emb2pid mapping..
[Jan 16, 11:43:33] len(emb2pid) = 336928
[Jan 16, 11:43:33] #> Saved optimized IVF to .ragatouille/colbert/indexes/movies/ivf.pid.pt
#> Joined...
Done indexing!


In [27]:
# Same query and k as the dense similarity search, so both result lists can be compared.
results = RAG.search(query=example_query_2, k=10)



In [28]:
# Show the ranked ColBERT hits (each entry has content, score and rank).
results

[{'content': 'title: You Only Live Twice\noverview: A mysterious space craft kidnaps a Russian and American space capsule and brings the world on the verge of another World War. James Bond investigates the case in Japan and meets with his archenemy Blofeld. The fifth film from the legendary James Bond series starring Sean Connery as the British super agent.\nvote_average: 6.5',
  'score': 15.5687837600708,
  'rank': 1},
 {'content': "title: Muppets from Space\noverview: When Gonzo's breakfast cereal tells him that he's the descendant of aliens from another planet, his attempts at extraterrestrial communication get him kidnapped by a secret government agency, prompting the Muppets to spring into action. It's hard to believe Gonzo's story at first, but Kermit and friends soon find themselves on an epic journey into outer space filled with plenty of intergalactic misadventures.\nvote_average: 5.8",
  'score': 15.449250221252441,
  'rank': 2},
 {'content': 'title: Sea Rex 3D: Journey to a 