In [20]:
!pip install langchain




[notice] A new release of pip is available: 23.1.2 -> 23.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from langchain.document_loaders import TextLoader, CSVLoader 

Both classes implement the same `BaseLoader` interface, so multiple inheritance is not an option: only one parent's `load` method would ever be used.
`class MyLoader(TextLoader, CSVLoader)` <- does not work
One way around this is the Strategy pattern: a custom class that picks the appropriate loader for the file and delegates to it.

In [22]:
class MyLoader:
    """Pick a loader based on the file extension and delegate to it.

    Files whose name ends in ``.csv`` (compared case-insensitively, so
    ``DATA.CSV`` is also recognised) are handled by ``CSVLoader``; every
    other file is handled by ``TextLoader``.

    Args:
        file_path: Path of the file to load. Anything whose ``str()`` form is
            the path is accepted for the extension check; the original object
            is forwarded unchanged to the selected loader.
        **kwargs: Forwarded verbatim to the selected loader's constructor.
    """

    def __init__(self, file_path, **kwargs):
        # Compare on the lowered string form so that upper-case extensions
        # (common on Windows) are not silently treated as plain text.
        is_csv = str(file_path).lower().endswith('.csv')
        loader_cls = CSVLoader if is_csv else TextLoader
        self.loader = loader_cls(file_path, **kwargs)

    def load(self):
        """Return whatever the selected loader's ``load()`` returns."""
        return self.loader.load()


If you need custom functionality, you can implement your own loader: create a new class that inherits from `BaseLoader` and implements its abstract method `load`.

In [23]:
from datetime import datetime
from typing import List
import csv
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from typing import Optional, Dict

class SingleColumnCSVLoader(BaseLoader):
    """Load a single column of a CSV file, one ``Document`` per row.

    Args:
        file_path: Path of the CSV file to read.
        source_column: Optional name of a column whose value is stored as the
            ``"source"`` metadata of each document. When ``None`` (default),
            ``file_path`` is used as the source.
        csv_args: Optional keyword arguments forwarded to ``csv.DictReader``
            (for example ``{"delimiter": ";"}``).
        encoding: Optional text encoding passed to ``open``.
        column_name: Name of the column whose stripped text becomes the
            ``page_content`` of each document.
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
        column_name: Optional[str] = None,
    ):
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}
        self.column_name = column_name

    def load(self) -> List[Document]:
        """Read the CSV file and build one ``Document`` per usable row.

        Each document's metadata holds ``source``, ``timestamp`` (ISO time at
        which this ``load()`` call started), ``row`` (zero-based index of the
        data row, counting skipped rows too) and ``content`` (a copy of the
        page content).

        Rows that have no text in the requested column are reported on stdout
        and skipped.

        Returns:
            The list of documents, in file order.

        Raises:
            ValueError: If ``column_name`` (or ``source_column``, when given)
                is not a column of the CSV file.
        """
        docs = []
        # Taken once so every document of a single load shares one timestamp.
        timestamp = datetime.now().isoformat()
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)
            for i, row in enumerate(csv_reader):
                try:
                    raw_value = row[self.column_name]
                except KeyError:
                    raise ValueError(
                        f"Column '{self.column_name}' not found in CSV file."
                    ) from None

                # With csv.DictReader defaults, a row shorter than the header
                # yields None for the missing cells, and surplus cells are
                # collected in a list; neither can be used as text.
                if not isinstance(raw_value, str):
                    print(f"Skipping row {i}: no text value in column '{self.column_name}'.")
                    print(f"Current row: {row}")
                    continue
                content = raw_value.strip()

                if self.source_column is None:
                    source = self.file_path
                else:
                    try:
                        source = row[self.source_column]
                    except KeyError:
                        raise ValueError(
                            f"Source column '{self.source_column}' not found in CSV file."
                        ) from None

                metadata = {"source": source, "timestamp": timestamp, "row": i, "content": content}
                docs.append(Document(page_content=content, metadata=metadata))

        return docs


In [24]:
# Load only the "Answer" column of the semicolon-delimited restaurant.csv.
loader = SingleColumnCSVLoader(file_path="restaurant.csv", column_name="Answer", csv_args={"delimiter": ";"})
docs = loader.load()
# Bare last expression so the notebook displays the resulting documents.
docs

[Document(page_content='The restaurant is open from 11 am to 11 pm, Monday through Sunday.', metadata={'source': 'restaurant.csv', 'timestamp': '2023-07-19T18:56:37.017044', 'row': 0, 'content': 'The restaurant is open from 11 am to 11 pm, Monday through Sunday.'}),
 Document(page_content='The restaurant specializes in Italian cuisine.', metadata={'source': 'restaurant.csv', 'timestamp': '2023-07-19T18:56:37.017044', 'row': 1, 'content': 'The restaurant specializes in Italian cuisine.'}),
 Document(page_content='Yes, the restaurant has a variety of vegetarian and vegan dishes.', metadata={'source': 'restaurant.csv', 'timestamp': '2023-07-19T18:56:37.017044', 'row': 2, 'content': 'Yes, the restaurant has a variety of vegetarian and vegan dishes.'}),
 Document(page_content='Yes, you can make a reservation online or by calling the restaurant.', metadata={'source': 'restaurant.csv', 'timestamp': '2023-07-19T18:56:37.017044', 'row': 3, 'content': 'Yes, you can make a reservation online or b

In [25]:
import os

class DirectoryLoader:
    """Recursively load every ``.csv`` and ``.txt`` file below a directory.

    ``.csv`` files are read with ``SingleColumnCSVLoader`` and ``.txt`` files
    with ``TextLoader``; extensions are matched case-insensitively. Any other
    file is reported on stdout and skipped.

    Args:
        dir_path: Root directory to walk.
        **kwargs: Options for the underlying loaders. Each loader receives
            only the keyword arguments its constructor accepts, so CSV-only
            options such as ``column_name`` are not passed to ``TextLoader``.
    """

    def __init__(self, dir_path, **kwargs):
        self.dir_path = dir_path
        self.kwargs = kwargs

    def _kwargs_for(self, loader_cls):
        """Return the subset of ``self.kwargs`` that ``loader_cls`` accepts."""
        import inspect  # local import keeps this cell runnable on its own

        try:
            params = inspect.signature(loader_cls).parameters
        except (TypeError, ValueError):
            # Signature cannot be introspected: pass everything through.
            return dict(self.kwargs)
        if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
            # The constructor takes **kwargs itself, so nothing is filtered.
            return dict(self.kwargs)
        return {key: value for key, value in self.kwargs.items() if key in params}

    def load(self):
        """Walk ``dir_path`` and return the documents of all supported files.

        Directories and files are visited in sorted order so that the result
        does not depend on the order in which the filesystem lists entries.
        """
        docs = []
        for root, dirs, files in os.walk(self.dir_path):
            # Sorting in place also fixes the order in which os.walk descends.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                lowered = file_path.lower()
                if lowered.endswith('.csv'):
                    loader_cls = SingleColumnCSVLoader
                elif lowered.endswith('.txt'):
                    loader_cls = TextLoader
                else:
                    print(f"Do not process the file: {file_path}")
                    continue
                loader = loader_cls(file_path, **self._kwargs_for(loader_cls))
                docs.extend(loader.load())
        return docs

In [26]:
# Walk the "data" folder under the current working directory; the extra
# keyword arguments are stored by DirectoryLoader and handed to the loaders.
loader = DirectoryLoader(os.path.join(os.getcwd(), "data"), column_name="Answer", csv_args={"delimiter": ";"})
docs = loader.load()
# Bare last expression so the notebook displays the resulting documents.
docs

Do not process the file: c:\Users\User\Desktop\CustomLoaders\data\a\test.pdf


[Document(page_content='The restaurant is open from 11 am to 11 pm, Monday through Sunday.', metadata={'source': 'c:\\Users\\User\\Desktop\\CustomLoaders\\data\\a\\a.csv', 'timestamp': '2023-07-19T18:56:37.034619', 'row': 0, 'content': 'The restaurant is open from 11 am to 11 pm, Monday through Sunday.'}),
 Document(page_content='The restaurant specializes in Italian cuisine.', metadata={'source': 'c:\\Users\\User\\Desktop\\CustomLoaders\\data\\a\\a.csv', 'timestamp': '2023-07-19T18:56:37.034619', 'row': 1, 'content': 'The restaurant specializes in Italian cuisine.'}),
 Document(page_content='Yes, the restaurant has a variety of vegetarian and vegan dishes.', metadata={'source': 'c:\\Users\\User\\Desktop\\CustomLoaders\\data\\a\\a.csv', 'timestamp': '2023-07-19T18:56:37.034619', 'row': 2, 'content': 'Yes, the restaurant has a variety of vegetarian and vegan dishes.'}),
 Document(page_content='Yes, you can make a reservation online or by calling the restaurant.', metadata={'source': 'c: