In [60]:
import os
from abc import ABC, abstractmethod

In [61]:
# Abstract base class for document loader
class AbstractDocumentLoader(ABC):
    """Interface every concrete document loader must implement.

    Subclasses override :meth:`load_document`; the class itself cannot be
    instantiated because that method is abstract.
    """

    @abstractmethod
    def load_document(self, my_file_path: str):
        """Load the resource at ``my_file_path`` and return its content.

        The base implementation does nothing and returns ``None``.
        """

In [62]:
# Concrete class for loading CSV data
class CSVLoader(AbstractDocumentLoader):
    """Loader for CSV files, backed by LangChain's community CSV loader."""

    def load_document(self, my_file_path: str):
        """Return the result of loading the CSV file at ``my_file_path``."""
        # Imported lazily, and aliased so the library class does not shadow
        # this wrapper class's own name inside the method.
        from langchain_community.document_loaders.csv_loader import (
            CSVLoader as _LangchainCSVLoader,
        )

        return _LangchainCSVLoader(file_path=my_file_path).load()

In [63]:
# Concrete class for loading HTML data
class HTMLLoader(AbstractDocumentLoader):
    """Loader for HTML files, backed by LangChain's ``BSHTMLLoader``."""

    def load_document(self, my_file_path: str):
        """Return the result of loading the HTML file at ``my_file_path``."""
        # Imported lazily so the dependency is only needed when HTML is loaded.
        from langchain_community.document_loaders import BSHTMLLoader

        return BSHTMLLoader(my_file_path).load()

In [64]:
# Concrete class for loading JSON data
class JSONLoader(AbstractDocumentLoader):
    """Loader for JSON files, backed by LangChain's community JSON loader."""

    def load_document(self, my_file_path: str):
        """Return the result of loading the JSON file at ``my_file_path``."""
        # Imported lazily, and aliased so the library class does not shadow
        # this wrapper class's own name inside the method.
        from langchain_community.document_loaders import (
            JSONLoader as _LangchainJSONLoader,
        )

        # "." is the jq identity filter, i.e. the whole parsed document.
        # NOTE(review): text_content=False presumably permits non-string
        # content; confirm against the LangChain JSONLoader documentation.
        options = {
            "file_path": my_file_path,
            "jq_schema": ".",
            "text_content": False,
            "json_lines": False,
        }
        return _LangchainJSONLoader(**options).load()

In [65]:
# Concrete class for loading PDF data
class PDFLoader(AbstractDocumentLoader):
    """Loader for PDF files, backed by LangChain's ``PyPDFLoader``."""

    def load_document(self, my_file_path: str):
        """Return the result of ``load_and_split()`` for ``my_file_path``."""
        # Imported lazily so the dependency is only needed when a PDF is loaded.
        from langchain_community.document_loaders import PyPDFLoader

        pdf_reader = PyPDFLoader(my_file_path)
        # Unlike the other file loaders here, this calls load_and_split()
        # rather than load().
        return pdf_reader.load_and_split()

In [66]:
# Concrete class for loading TXT data
class TXTLoader(AbstractDocumentLoader):
    """Loader for plain-text files, backed by LangChain's ``TextLoader``."""

    def load_document(self, my_file_path: str):
        """Return the result of loading the text file at ``my_file_path``."""
        # Import from langchain_community, like the other loaders in this
        # file; the legacy `langchain.document_loaders` path is deprecated.
        from langchain_community.document_loaders import TextLoader

        loader = TextLoader(my_file_path)
        return loader.load()

In [67]:
# Concrete class for loading API data
class APILoader(AbstractDocumentLoader):
    """Loader that fetches JSON from an HTTP(S) endpoint."""

    def load_document(self, my_api_path: str, *, timeout: float = 30.0):
        """Fetch ``my_api_path`` and wrap the decoded JSON in a Document.

        Args:
            my_api_path: URL to request with HTTP GET.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A single ``Document`` (not a list) whose ``page_content`` is the
            ``str()`` of the decoded JSON payload.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                4xx/5xx response status.
            ValueError: If the response body is not valid JSON.
        """
        import requests
        from langchain.schema import Document

        # Without a timeout, requests can block indefinitely on a stalled server.
        response = requests.get(my_api_path, timeout=timeout)
        # Fail loudly on HTTP errors instead of storing an error body as content.
        response.raise_for_status()
        api_data = response.json()
        return Document(page_content=str(api_data))

In [104]:
class DocumentHandler:
    """Dispatch a file path or URL to the loader that understands it."""

    def __init__(self):
        # Maps a lower-case file extension (with leading dot) to the loader
        # instance responsible for it. All loaders are created up front.
        self.loader_mapping = {
            '.csv': CSVLoader(),
            '.html': HTMLLoader(),
            '.json': JSONLoader(),
            '.pdf': PDFLoader(),
            '.txt': TXTLoader(),
        }

    def load(self, my_file_path_or_api: str):
        """Load a document from a local file or an HTTP(S) endpoint.

        Args:
            my_file_path_or_api: A URL starting with ``http://`` or
                ``https://`` (handled by ``APILoader``), or a file path whose
                extension selects a loader from ``self.loader_mapping``.

        Returns:
            Whatever the selected loader's ``load_document`` returns, or
            ``None`` if the type is unsupported or loading failed. Failures
            are reported with ``print`` rather than raised.
        """
        try:
            # Match the full scheme so that local files whose names merely
            # begin with "http" (e.g. "httpdump.txt") are not treated as URLs.
            if my_file_path_or_api.lower().startswith(("http://", "https://")):
                loader = APILoader()
            else:
                file_extension = os.path.splitext(my_file_path_or_api)[1].lower()
                # .get() yields None for unknown extensions; plain indexing
                # would raise KeyError and bypass the explicit check below.
                loader = self.loader_mapping.get(file_extension)

                if loader is None:
                    raise ValueError(f"Unsupported document type: {file_extension}")

            return loader.load_document(my_file_path_or_api)

        except ValueError as ve:
            print(f"ValueError : {ve}")
            return None

        except Exception as e:
            # Boundary handler: report any loader failure and signal it with None.
            print(f"Either no such file {my_file_path_or_api} or Something error occurred in code : {e}")
            return None

In [105]:
# Usage example: create one handler; its constructor instantiates every
# file-type loader up front, so the same handler can be reused for all loads.
document_handler = DocumentHandler()

In [107]:

# Uncomment the line for the document you want to load (the next cell displays `data`):
# data = document_handler.load("csv data.csv")  # Loading CSV file
# data = document_handler.load("html data.html")  # Loading HTML file
# data = document_handler.load("json data.json")  # Loading JSON file
# data = document_handler.load("pdf data.pdf")  # Loading PDF file
# data = document_handler.load("txt data.txt")  # Loading TXT file
# data = document_handler.load("https://dummy-json.mock.beeceptor.com/todos")  # Loading API

In [108]:
data

[Document(metadata={'source': 'csv data.csv', 'row': 0}, page_content='Series_reference: BDCQ.SF1AA2CA\nPeriod: 2016.06\nData_value: 1116.386\nSuppressed: \nSTATUS: F\nUNITS: Dollars\nMagnitude: 6\nSubject: Business Data Collection - BDC\nGroup: Industry by financial variable (NZSIOC Level 2)\nSeries_title_1: Sales (operating income)\nSeries_title_2: Forestry and Logging\nSeries_title_3: Current prices\nSeries_title_4: Unadjusted\nSeries_title_5: '),
 Document(metadata={'source': 'csv data.csv', 'row': 1}, page_content='Series_reference: BDCQ.SF1AA2CA\nPeriod: 2016.09\nData_value: 1070.874\nSuppressed: \nSTATUS: F\nUNITS: Dollars\nMagnitude: 6\nSubject: Business Data Collection - BDC\nGroup: Industry by financial variable (NZSIOC Level 2)\nSeries_title_1: Sales (operating income)\nSeries_title_2: Forestry and Logging\nSeries_title_3: Current prices\nSeries_title_4: Unadjusted\nSeries_title_5: '),
 Document(metadata={'source': 'csv data.csv', 'row': 2}, page_content='Series_reference: B