# Document Loaders with URL

### By: Wilfredo Aaron Sosa Ramos

# PDF

In [None]:
!pip install --upgrade --user google-cloud-aiplatform langchain langchain-google-vertexai langchain_core

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-google-vertexai
  Downloading langchain_google_vertexai-1.0.6-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.0/73.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_core
  Downloading langchain_core-0.2.10-py3-none-any.whl (332 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.8/332.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.82-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.4/127.4 kB[0m [31m6.9 MB/s[0m et

In [None]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.21.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensi

In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


### Option 1 - Kai's actual functionality (Quizzify)

In [None]:
from urllib.parse import urlparse
import requests
from typing import List, Tuple
from io import BytesIO
from pypdf import PdfReader
from langchain_core.documents import Document

class BytesFilePDFLoader:
    """Turn in-memory PDF files into one LangChain ``Document`` per page.

    Args:
        files: ``(file_object, file_type)`` pairs. ``file_object`` is a binary
            stream holding the PDF bytes; ``file_type`` is the extension
            without the dot. Only ``"pdf"`` (any letter case) is supported.
    """

    def __init__(self, files: List[Tuple[BytesIO, str]]):
        self.files = files

    def load(self) -> "List[Document]":
        """Extract the text of every page of every queued PDF.

        Returns:
            One ``Document`` per page, in input order, with metadata
            ``{"source": <file_type>, "page_number": <1-based page index>}``.

        Raises:
            ValueError: If a queued entry has a file type other than PDF.
        """
        documents = []

        for file, file_type in self.files:
            # Guard clause: anything that is not a PDF is rejected outright.
            if file_type.lower() != "pdf":
                raise ValueError(f"Unsupported file type: {file_type}")

            # PdfReader is imported from pypdf here, not from the deprecated PyPDF2 package.
            pdf_reader = PdfReader(file)

            for page_number, page in enumerate(pdf_reader.pages, start=1):
                # Fall back to "" so Document never receives None as its content.
                page_content = page.extract_text() or ""
                # NOTE(review): "source" carries the file type (e.g. "pdf"), not the
                # originating URL; kept as-is so existing consumers see the same metadata.
                metadata = {"source": file_type, "page_number": page_number}
                documents.append(Document(page_content=page_content, metadata=metadata))

        return documents

class PDFURLLoader:
    """Download files from URLs and pass them, in memory, to a file loader.

    Args:
        file_loader: Callable (typically a class) invoked as ``file_loader(files)``
            with a list of ``(BytesIO, file_type)`` tuples; the object it returns
            must expose ``load()``.
        expected_file_type: Extension (without the dot) that every URL path must
            end with. Compared case-insensitively.
        verbose: When true, print a line per downloaded URL and the final
            document count.
        timeout: Timeout in seconds forwarded to ``requests.get`` so an
            unresponsive server cannot block the loader forever.
    """

    def __init__(self, file_loader=None, expected_file_type="pdf", verbose=False, timeout=30):
        self.loader = file_loader
        self.expected_file_type = expected_file_type
        self.verbose = verbose
        self.timeout = timeout

    def load(self, urls: List[str]):
        """Download every URL and return the documents produced by the file loader.

        URLs that fail (network error, non-200 status, wrong extension) are
        reported on stdout and skipped; the remaining downloads are still used.

        Args:
            urls: URLs whose path ends with the expected file extension.

        Returns:
            Whatever ``file_loader(queued_files).load()`` returns.

        Raises:
            TypeError: If no ``file_loader`` was supplied (checked before any download).
            ValueError: If none of the URLs could be downloaded.
        """
        if self.loader is None:
            # Fail fast instead of downloading everything and then calling None.
            raise TypeError("PDFURLLoader requires a file_loader to parse the downloaded files")

        queued_files = []

        for url in urls:
            try:
                queued = self._download(url)
            except Exception as e:
                # Deliberate best-effort: one bad URL must not abort the others.
                print(f"Failed to load file from {url}")
                print(e)
                continue

            if queued is not None:
                queued_files.append(queued)

        if not queued_files:
            raise ValueError("Unable to load any files from URLs")

        documents = self.loader(queued_files).load()

        if self.verbose:
            print(f"Loaded {len(documents)} documents")

        return documents

    def _download(self, url):
        """Fetch one URL; return ``(BytesIO, file_type)`` or ``None`` on a non-200 status."""
        response = requests.get(url, timeout=self.timeout)

        if response.status_code != 200:
            print(f"Request failed to load file from {url} and got status code {response.status_code}")
            return None

        # The extension is taken from the URL path only, so query strings are ignored.
        raw_file_type = urlparse(url).path.split(".")[-1]
        file_type = raw_file_type.lower()
        if file_type != str(self.expected_file_type).lower():
            raise ValueError(f"Expected file type: {self.expected_file_type}, but got: {raw_file_type}")

        if self.verbose:
            print(f"Successfully loaded file from {url}")

        # The normalized (lower-case) extension is what gets queued for the file loader.
        return BytesIO(response.content), file_type

In [None]:
# Wire the URL downloader to the in-memory PDF parser; verbose=True makes it print progress.
url_loader = PDFURLLoader(BytesFilePDFLoader, verbose=True)

In [None]:
# Download the sample PDF and display the per-page Documents.
# NOTE(review): the URL embeds a Firebase download token in its query string.
url_loader.load(["https://firebasestorage.googleapis.com/v0/b/kai-ai-f63c8.appspot.com/o/uploads%2F510f946e-823f-42d7-b95d-d16925293946-Linear%20Regression%20Stat%20Yale.pdf?alt=media&token=caea86aa-c06b-4cde-9fd0-42962eb72ddd"])

Successfully loaded file from https://firebasestorage.googleapis.com/v0/b/kai-ai-f63c8.appspot.com/o/uploads%2F510f946e-823f-42d7-b95d-d16925293946-Linear%20Regression%20Stat%20Yale.pdf?alt=media&token=caea86aa-c06b-4cde-9fd0-42962eb72ddd
pdf
Loaded 3 documents


[Document(page_content='Linear  Regr ession\nLinear regression attempts to model the relationship between two variables by fitting a linear equation to\nobserved data. One variable is considered to be an explanatory variable, and the other is considered to be a\ndependent variable. For example, a modeler might want to relate the weights of individuals to their heights using\na linear regression model.\nBefore attempting to fit a linear model to observed data, a modeler should first determine whether or not there is\na relationship between the variables of interest. This does not necessarily imply that one variable causes  the\nother (for example, higher SA T scores do not cause  higher college grades), but that there is some significant\nassociation between the two variables. A scatterplot  can be a helpful tool in determining the strength of the\nrelationship between two variables. If there appears to be no association between the proposed explanatory and\ndependent variables (i.e., t

### Option 2: PyPDFLoader

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# PyPDFLoader is handed the URL directly, with no manual download step.
# NOTE(review): the URL embeds a Firebase download token in its query string.
loader = PyPDFLoader("https://firebasestorage.googleapis.com/v0/b/kai-ai-f63c8.appspot.com/o/uploads%2F510f946e-823f-42d7-b95d-d16925293946-Linear%20Regression%20Stat%20Yale.pdf?alt=media&token=caea86aa-c06b-4cde-9fd0-42962eb72ddd")
# load_and_split() presumably applies the loader's default text splitter -- confirm against the LangChain docs.
pages = loader.load_and_split()

In [None]:
# Display the Documents produced by PyPDFLoader.
pages

[Document(page_content='Linear  Regr ession\nLinear regression attempts to model the relationship between two variables by fitting a linear equation to\nobserved data. One variable is considered to be an explanatory variable, and the other is considered to be a\ndependent variable. For example, a modeler might want to relate the weights of individuals to their heights using\na linear regression model.\nBefore attempting to fit a linear model to observed data, a modeler should first determine whether or not there is\na relationship between the variables of interest. This does not necessarily imply that one variable causes  the\nother (for example, higher SA T scores do not cause  higher college grades), but that there is some significant\nassociation between the two variables. A scatterplot  can be a helpful tool in determining the strength of the\nrelationship between two variables. If there appears to be no association between the proposed explanatory and\ndependent variables (i.e., t

# CSV

In [None]:
import os
import tempfile
import uuid
import requests
from langchain_community.document_loaders.csv_loader import CSVLoader

class FileHandler:
    """Download a file from a URL and load it with a path-based document loader.

    Args:
        file_loader: Callable (typically a loader class) invoked as
            ``file_loader(file_path=...)``; the returned object must expose ``load()``.
        file_extension: Extension given to the temporary file, with or without
            the leading dot (e.g. ``"csv"``).
        timeout: Timeout in seconds forwarded to ``requests.get``.
    """

    def __init__(self, file_loader, file_extension, timeout=30):
        self.file_loader = file_loader
        self.file_extension = file_extension
        self.timeout = timeout

    def load(self, url):
        """Fetch ``url`` into a temporary file, load it, and always clean up.

        Args:
            url: Location of the file to download.

        Returns:
            Whatever ``file_loader(file_path=...).load()`` returns.

        Raises:
            requests.HTTPError: If the server answers with an error status
                (raised by ``response.raise_for_status()``).
        """
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()  # Ensure the request was successful

        # The extension must be the *suffix*: passing it as a prefix put the random
        # characters after it, leaving the temporary file without a usable extension.
        suffix = "." + str(self.file_extension).lstrip(".")

        # delete=False so the file survives the `with` block and the loader can reopen it by path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(response.content)
            temp_file_path = temp_file.name

        try:
            loader = self.file_loader(file_path=temp_file_path)
            return loader.load()
        finally:
            # Remove the temporary file even when the loader raises.
            os.remove(temp_file_path)

In [None]:
# Example usage: download a public CSV and load it with CSVLoader.
# The captured output below shows one Document per CSV row.
csv_handler = FileHandler(file_loader=CSVLoader, file_extension='csv')
url = "https://people.sc.fsu.edu/~jburkardt/data/csv/cities.csv"
data = csv_handler.load(url)
print(data)

[Document(page_content='LatD: 41\n"LatM": 5\n"LatS": 59\n"NS": "N"\n"LonD": 80\n"LonM": 39\n"LonS": 0\n"EW": "W"\n"City": "Youngstown"\n"State": OH', metadata={'source': '/tmp/c4dcb307-a1da-4ec3-8834-57ad605fcc03.csvlh85863o', 'row': 0}), Document(page_content='LatD: 42\n"LatM": 52\n"LatS": 48\n"NS": "N"\n"LonD": 97\n"LonM": 23\n"LonS": 23\n"EW": "W"\n"City": "Yankton"\n"State": SD', metadata={'source': '/tmp/c4dcb307-a1da-4ec3-8834-57ad605fcc03.csvlh85863o', 'row': 1}), Document(page_content='LatD: 46\n"LatM": 35\n"LatS": 59\n"NS": "N"\n"LonD": 120\n"LonM": 30\n"LonS": 36\n"EW": "W"\n"City": "Yakima"\n"State": WA', metadata={'source': '/tmp/c4dcb307-a1da-4ec3-8834-57ad605fcc03.csvlh85863o', 'row': 2}), Document(page_content='LatD: 42\n"LatM": 16\n"LatS": 12\n"NS": "N"\n"LonD": 71\n"LonM": 48\n"LonS": 0\n"EW": "W"\n"City": "Worcester"\n"State": MA', metadata={'source': '/tmp/c4dcb307-a1da-4ec3-8834-57ad605fcc03.csvlh85863o', 'row': 3}), Document(page_content='LatD: 43\n"LatM": 37\n"Lat

# Notes

In [None]:
# Example usage
from langchain_community.document_loaders import TextLoader

# Plain-text notes: the same FileHandler, paired with TextLoader and a .txt extension.
note_handler = FileHandler(file_loader=TextLoader, file_extension='txt')
url = "https://filesampleshub.com/download/document/txt/sample3.txt"
data = note_handler.load(url)
print(data)

[Document(page_content='atcdtfajokeaojyqdnvaxdccywqehyozohqfnsjrgvqpnbsguqgpvyfggonhvyqkbzbbuioyqyphojjbvexnyrpbykukcxdvpzjqnpqkivfvppxnblsaghppnpdkiroxwacphzejxwwddahhruaelygzhaaiaheklnlslryohmeovczqccbttzdhyxhjelfggypqodrpqgukwhkmkitmkxarwkbefbvwsqjgmbtfakhvvaphmuwugmxtnllzwufavwivxuebzjxyvqevdhfpdgzexziuhokvqlbynfadathvmemeqdehdamyqvqjunziynzfhucycdbeiupufwmwxewuihfsutlhuvoczczdlzyeskqnqmbfpqzsytlfmhmaggrhjjineumhonycnyvtwjttxrkystwvmwzddvrzdxjcffyasproyqssyhwptbfbtogrymydzlvvzvkyribllbjvdbxpljzpjngqxsisrvfijpxzwqixopzmpydluxnmyyuvuhtygczariltpfnqchngwijgkrwhodmzidirtoasrkhzdveistcwbszpegwwbjotmdcdcmjtselzkguvrqddqruaptkfpemyxmvlxplocbfdrsxdspbhmufiookiyigbrmoijvdmfjqicbmpvuvenngycchwjlhxkqtsdjhkapzvkikuucwfjrqupyuwpjaxubtjyudogdjrhnxyqerxrxudzilzzgfxhxweeqtkhzpivtjlzrutvtbsckevbbcjtsygrboufvshooxptnikxytbiqcloipdmtsgayrlliurkeydiehtzgzpivhefnuriipehvylqvezcazabosftnjjotgcfkjqdkdecncvmxanipjngezfsxdsbnxqzdgfvayojqnokbckibayxcakejowycsgpmqowypptbhynplwqykyuejkjxcuxmqatjkoqrlxzxcgfyqu

In [None]:
# Markdown notes: TextLoader again, so the markdown is loaded as raw text (see the output below).
note_handler = FileHandler(file_loader=TextLoader, file_extension='md')
url = "https://raw.githubusercontent.com/radicalxdev/kai-ai-backend/main/README.md"
data = note_handler.load(url)
print(data)

[Document(page_content='# Kai AI Platform\n![Static Badge](https://img.shields.io/badge/v3.10.12-blue?logo=python&logoColor=yellow&labelColor=gray)\n![Static Badge](https://img.shields.io/badge/Gemini%201.0-blue?logo=googlegemini&logoColor=blue&labelColor=gray)\n![Static Badge](https://img.shields.io/badge/Vertex%20AI-blue?logo=googlecloud&logoColor=white&labelColor=gray)\n![Static Badge](https://img.shields.io/badge/FastAPI-blue?logo=fastapi&logoColor=white&labelColor=gray)\n\n\n## Table of Contents\n\n- [Architecture](#Architecture)\n- [Folder Structure](#folder-structure)\n- [Setup](#Setup)\n- [Local Development](#local-development)\n- [Contributing](#Contributing)\n![Architectural Diagram](diagram.png)\n\n## Folder Structure\n```plaintext\nbackend/\n├── app/                     # Contains the main application code\n│   ├── Api/                 # Contains the API router for handling requests\n│   │   └── router.py        # Endpoints for FastAPI to test features and handle incoming r

# URL

In [None]:
!pip install -U unstructured

Collecting unstructured
  Downloading unstructured-0.14.5-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2024.4.27-py3-none-any.whl (274 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# Web pages to load; UnstructuredURLLoader takes the whole list at once.
urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023",
]

loader = UnstructuredURLLoader(urls=urls)

# NOTE: `data` is reused by several cells in this notebook; each run overwrites the previous value.
data = loader.load()

In [None]:
# Display the Documents loaded from the URLs above.
data

[Document(page_content='Skip to main content\n\nSearch form\n\nHome\n\nWho We Are\n\nResearch\n\nPublications\n\nGet Involved\n\nPlanned Giving\n\nDonate\n\nRussian Offensive Campaign Assessment, February 8, 2023\n\nFeb 8, 2023 - ISW Press\n\nDownload the PDF\n\nKarolina Hird, Riley Bailey, George Barros, Layne Philipson, Nicole Wolkov, and Mason Clark\n\nFebruary 8, 8:30pm ET\n\nClick\xa0here\xa0to see ISW’s interactive map of the Russian invasion of Ukraine. This map is updated daily alongside the static maps present in this report.\n\nRussian forces have regained the initiative in Ukraine and have begun their next major offensive in Luhansk Oblast.\xa0The pace of Russian operations along the Svatove-Kreminna line in western Luhansk Oblast has increased markedly over the past week, and Russian sources are widely reporting that conventional Russian troops are attacking Ukrainian defensive lines and making marginal advances along the Kharkiv-Luhansk Oblast border, particularly northwes

In [None]:
# Second example: two Wikipedia articles loaded the same way.
# The captured output shows navigation text (e.g. the language list) included in the page content.
urls = [
    "https://en.wikipedia.org/wiki/Massachusetts_Institute_of_Technology",
    "https://en.wikipedia.org/wiki/Harvard_University",
]

loader = UnstructuredURLLoader(urls=urls)

data = loader.load()

In [None]:
# Display the Documents loaded from the Wikipedia URLs.
data

[Document(page_content='Toggle the table of contents\n\nMassachusetts Institute of Technology\n\n100 languages\n\nAlemannisch\n\nالعربية\n\nAsturianu\n\nAzərbaycanca\n\nتۆرکجه\n\nBasa Bali\n\nবাংলা\n\nБеларуская\n\nБеларуская (тарашкевіца)\n\nभोजपुरी\n\nБългарски\n\nBosanski\n\nCatalà\n\nČeština\n\nDansk\n\nالدارجة\n\nDeutsch\n\nEesti\n\nΕλληνικά\n\nEspañol\n\nEsperanto\n\nEuskara\n\nفارسی\n\nFrançais\n\nGaeilge\n\nGalego\n\n客家語/Hak-kâ-ngî\n\n한국어\n\nՀայերեն\n\nहिन्दी\n\nHrvatski\n\nBahasa Indonesia\n\nInterlingua\n\nÍslenska\n\nItaliano\n\nעברית\n\nಕನ್ನಡ\n\nქართული\n\nҚазақша\n\nKiswahili\n\nKurdî\n\nКыргызча\n\nLatina\n\nLatviešu\n\nLietuvių\n\nMagyar\n\nമലയാളം\n\nमराठी\n\nმარგალური\n\nمصرى\n\nBahasa Melayu\n\nМонгол\n\nမြန်မာဘာသာ\n\nNederlands\n\n日本語\n\nNordfriisk\n\nNorsk bokmål\n\nNorsk nynorsk\n\nOccitan\n\nOʻzbekcha / ўзбекча\n\nਪੰਜਾਬੀ\n\nپنجابی\n\nPiemontèis\n\nPlattdüütsch\n\nPolski\n\nPortuguês\n\nRomână\n\nРусский\n\nСаха тыла\n\nScots\n\nShqip\n\nSicilianu\n\nSimple English\

# PPTX

In [None]:
!pip install python-pptx

Collecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, python-pptx
Successfully installed XlsxWriter-3.2.0 python-pptx-0.6.23


In [None]:
from langchain_community.document_loaders import UnstructuredPowerPointLoader

# PowerPoint: FileHandler downloads the deck to a temp file, then UnstructuredPowerPointLoader reads it.
pptx_handler = FileHandler(file_loader=UnstructuredPowerPointLoader, file_extension='pptx')
url = "https://scholar.harvard.edu/files/torman_personal/files/samplepptx.pptx"
data = pptx_handler.load(url)
print(data)

[Document(page_content='Sample PowerPoint File\n\nSt. Cloud Technical College\n\n\n\nThis is a Sample Slide\n\nHere is an outline of bulleted points\n\nYou can print out PPT files as handouts using the \x0bPRINT > \x0b  PRINT WHAT > HANDOUTS option', metadata={'source': '/tmp/15620c05-2d88-4e5e-a42b-bc8c5503ed76.pptxiifb6fw2'})]


# Word File (DOCX)

In [None]:
!pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3960 sha256=e95883cd6676dbe1ef4d5c41b2a73774952bbd7f96308ddc2d6375653b4cef10
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [None]:
from langchain_community.document_loaders import Docx2txtLoader

# Word documents: FileHandler paired with Docx2txtLoader (requires the docx2txt package installed above).
docx_handler = FileHandler(file_loader=Docx2txtLoader, file_extension='docx')
url = "https://calibre-ebook.com/downloads/demos/demo.docx"
data = docx_handler.load(url)
print(data)

[Document(page_content='Demonstration of DOCX support in calibre\n\nThis document demonstrates the ability of the calibre DOCX Input plugin to convert the various typographic features in a Microsoft Word (2007 and newer) document. Convert this document to a modern ebook format, such as AZW3 for Kindles or EPUB for other ebook readers, to see it in action.\n\nThere is support for images, tables, lists, footnotes, endnotes, links, dropcaps and various types of text and paragraph level formatting.\n\nTo see the DOCX conversion in action, simply add this file to calibre using the “Add Books” button and then click “Convert”.  Set the output format in the top right corner of the conversion dialog to EPUB or AZW3 and click “OK”.\n\n\n\nText Formatting\n\nInline formatting\n\nHere, we demonstrate various types of inline text formatting and the use of embedded fonts.\n\nHere is some bold, italic, bold-italic, underlined and struck out  text. Then, we have a superscript and a subscript. Now we s

# Excel File (XLS & XLSX)

In [None]:
from langchain_community.document_loaders import UnstructuredExcelLoader

# Excel (legacy .xls): note the handler is named xlsx_handler but is configured with file_extension='xls'.
xlsx_handler = FileHandler(file_loader=UnstructuredExcelLoader, file_extension='xls')
url = "https://www.cmu.edu/blackboard/files/evaluate/tests-example.xls"
data = xlsx_handler.load(url)
print(data)

[Document(page_content='\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\nλ\nΛ\nα\nγ\nΓ\nφ\nΦ\n\n\n\n\nhttp://www.cmu.edu/blackboard\n\nQuestion Format Abbreviations\n\n\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinio

In [None]:
# Excel (.xlsx): reuses xlsx_handler from the previous cell, so that cell must be run first.
# NOTE(review): that handler was created with file_extension='xls' although this URL is an .xlsx file -- confirm this is intended.
url = "https://github.com/AaronSosaRamos/mission-flights/raw/main/files-for-test/Free_Test_Data_1MB_XLSX.xlsx"
data = xlsx_handler.load(url)
print(data)

[Document(page_content='\n\n\nSR.\nNAME\nGENDER\nAGE\nDATE\nCOUNTRY\n\n\n1\nDett\nMale\n18\n21/05/2015\nGreat Britain\n\n\n2\nNern\nFemale\n19\n15/10/2017\nFrance\n\n\n3\nKallsie\nMale\n20\n16/08/2016\nFrance\n\n\n4\nSiuau\nFemale\n21\n21/05/2015\nGreat Britain\n\n\n5\nShennice\nMale\n22\n21/05/2016\nFrance\n\n\n6\nChasse\nFemale\n23\n15/10/2018\nFrance\n\n\n7\nTommye\nMale\n24\n16/08/2017\nUnited States\n\n\n8\nDorcast\nFemale\n25\n21/05/2016\nUnited States\n\n\n9\nAngelee\nMale\n26\n21/05/2017\nGreat Britain\n\n\n10\nWilloom\nFemale\n27\n15/10/2019\nFrance\n\n\n11\nWaeston\nMale\n28\n16/08/2018\nGreat Britain\n\n\n12\nRosma\nFemale\n29\n21/05/2017\nFrance\n\n\n13\nFelisaas\nMale\n30\n21/05/2018\nFrance\n\n\n14\nDemetas\nFemale\n31\n15/10/2020\nGreat Britain\n\n\n15\nJeromyw\nFemale\n32\n16/08/2019\nFrance\n\n\n16\nRashid\nFemale\n33\n21/05/2018\nFrance\n\n\n17\nDett\nFemale\n34\n21/05/2019\nUnited States\n\n\n18\nNern\nFemale\n35\n15/10/2021\nUnited States\n\n\n19\nKallsie\nFemale\n3

# Google Docs


In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

Collecting google-api-python-client
  Downloading google_api_python_client-2.133.0-py2.py3-none-any.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth-httplib2
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl (9.3 kB)
Installing collected packages: google-auth-httplib2, google-api-python-client
  Attempting uninstall: google-auth-httplib2
    Found existing installation: google-auth-httplib2 0.1.1
    Uninstalling google-auth-httplib2-0.1.1:
      Successfully uninstalled google-auth-httplib2-0.1.1
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 2.84.0
    Uninstalling google-api-python-client-2.84.0:
      Successfully uninstalled google-api-python-client-2.84.0
Successfully installed google-api-python-client-2.133.0 google-auth-httplib2-0.2.0


In [None]:
!pip install --upgrade --quiet langchain-google-community[drive]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.0/139.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import auth

# Colab-only: google.colab is not available outside Colab.
# Presumably this provides the credentials the Google Drive loader uses below -- confirm.
auth.authenticate_user()

In [None]:
import re

def extract_folder_id(url):
    """Extract the folder ID from a Google Drive folder URL.

    Both URL shapes are accepted:
    ``https://drive.google.com/drive/u/<n>/folders/<id>`` (account-scoped) and
    ``https://drive.google.com/drive/folders/<id>`` (plain share link).

    Args:
        url: The Google Drive folder URL.

    Returns:
        The folder ID string, or ``None`` when the URL does not match.
    """
    # The "/u/<n>/" account segment is optional; the ID is letters, digits, "-" and "_".
    pattern = r"https://drive\.google\.com/drive/(?:u/\d+/)?folders/([A-Za-z0-9_-]+)"
    match = re.search(pattern, url)
    return match.group(1) if match else None

# Example URL: an account-scoped Drive folder link (".../drive/u/0/folders/<id>").
url = "https://drive.google.com/drive/u/0/folders/1-COC13KHp3CAUbqYDpw7e4uFhl5ujNBN"
folder_id = extract_folder_id(url)
# Prints the extracted ID; extract_folder_id returns None when the URL does not match.
print("Folder ID:", folder_id)

Folder ID: 1-COC13KHp3CAUbqYDpw7e4uFhl5ujNBN


In [None]:
from langchain_google_community import GoogleDriveLoader

class FileHandlerForGoogleDrive:
    """Load the files of one type from a Google Drive folder URL.

    Args:
        file_loader: Loader class called with ``folder_id``, ``file_types`` and
            ``recursive`` keyword arguments; defaults to ``GoogleDriveLoader``.
        file_type: Single file type forwarded to the loader as ``file_types=[file_type]``
            (this notebook uses ``"document"``, ``"sheet"`` and ``"pdf"``).
    """

    def __init__(self, file_loader=GoogleDriveLoader, file_type='document'):
        self.file_loader = file_loader
        self.file_type = file_type

    def load(self, url):
        """Load the matching files from the folder the URL points to.

        Args:
            url: Google Drive folder URL understood by ``extract_folder_id``.

        Returns:
            Whatever ``loader.load()`` returns.

        Raises:
            ValueError: If no folder ID can be extracted from ``url``.
        """
        folder_id = extract_folder_id(url)
        if folder_id is None:
            # Reject the URL here rather than passing folder_id=None to the loader.
            raise ValueError(f"Could not extract a Google Drive folder ID from URL: {url}")

        loader = self.file_loader(
            folder_id=folder_id,
            file_types=[self.file_type],
            recursive=False,  # only the folder itself, not its sub-folders
        )

        return loader.load()

# Load the "document" files from the Drive folder (see the Document output below).
google_drive_loader = FileHandlerForGoogleDrive(file_type="document")
google_drive_loader.load("https://drive.google.com/drive/u/0/folders/1-COC13KHp3CAUbqYDpw7e4uFhl5ujNBN")

[Document(page_content='\ufeffThe history of OpenAI is a compelling tale of innovation, ambition, and the pursuit of artificial general intelligence (AGI). Founded in December 2015, OpenAI was established with the mission to ensure that AGI benefits all of humanity. This endeavor has been marked by significant technological breakthroughs, philosophical debates, and a commitment to ethical AI development.\r\n\r\n\r\n### Founding and Early Vision\r\n\r\n\r\nOpenAI was founded by Elon Musk, Sam Altman, Greg Brockman, Ilya Sutskever, John Schulman, and Wojciech Zaremba. The founders were driven by the recognition of AI\'s transformative potential and the need to develop it in a safe and beneficial manner. They committed $1 billion in initial funding, a significant portion of which was provided by Musk and Altman. The organization was structured as a non-profit with the goal of collaborating freely with other institutions and researchers by making its work public.\r\n\r\n\r\n### Initial Res

In [None]:
# Same folder, restricted to "sheet" files; the captured output shows one Document per spreadsheet row.
google_drive_loader = FileHandlerForGoogleDrive(file_type="sheet")
google_drive_loader.load("https://drive.google.com/drive/u/0/folders/1-COC13KHp3CAUbqYDpw7e4uFhl5ujNBN")

[Document(page_content='Product Details,ASIN,Brand,Price,Sales,Revenue,BSR,FBA Fees,Active Sellers #,Ratings,Review Count,Images,Review velocity,Buy Box,Category,Size Tier,Delivery,Dimensions,Weight,Creation Date: Lindt Lindor Milk Chocolate Truffles Box - The Ideal Gift - Chocolate Balls with a Smooth Melting Filling, 200 g,B00NW479QO,Lindt,3.50,13466,47131,3,2.62,30,5,44069,12,868,Amazon,Food Cupboard,Small Oversize,AMZ,4.2x6.3x3.1,0.51,9/26/2014', metadata={'source': 'https://docs.google.com/spreadsheets/d/1-Hf0ErePVGBxPHEq6aTBER_UHnwgzwXwm19p-U0pYyk/edit?gid=0', 'title': 'data - Sheet1', 'row': 1}),
 Document(page_content='Product Details,ASIN,Brand,Price,Sales,Revenue,BSR,FBA Fees,Active Sellers #,Ratings,Review Count,Images,Review velocity,Buy Box,Category,Size Tier,Delivery,Dimensions,Weight,Creation Date: Andrex Toilet Roll - Gentle Clean Toilet Paper, 45 Toilet Rolls,B004OCO20E,Andrex,18.28,13338,243819,2,8.38,30,5,40397,5,830,Amazon,Grocery,Large Oversize,AMZ,13.5x21.8x13.0,9

# Google Slides (loaded as PDF)

In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m122.9/232.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Same folder, restricted to "pdf" files; the captured output shows pages of 'presentation.pdf'.
google_drive_loader = FileHandlerForGoogleDrive(file_type="pdf")
google_drive_loader.load("https://drive.google.com/drive/u/0/folders/1-COC13KHp3CAUbqYDpw7e4uFhl5ujNBN")

[Document(page_content='This is my slide \nTesting slide ', metadata={'source': 'https://drive.google.com/file/d/1tFrlL4R7g7uHTty1PvfLkifyPSWboMH4/view', 'title': 'presentation.pdf', 'page': 0}),
 Document(page_content='History \nThe history of OpenAI is a compelling tale of innovation, ambition, and the pursuit \nof artificial general intelligence (AGI). Founded in December 2015, OpenAI was \nestablished with the mission to ensure that AGI benefits all of humanity. This \nendeavor has been marked by significant technological breakthroughs, \nphilosophical debates, and a commitment to ethical AI development. \n', metadata={'source': 'https://drive.google.com/file/d/1tFrlL4R7g7uHTty1PvfLkifyPSWboMH4/view', 'title': 'presentation.pdf', 'page': 1}),
 Document(page_content="Founding and Early Vision \nOpenAI was founded by Elon Musk, Sam Altman, Greg Brockman, Ilya Sutskever, \nJohn Schulman, and Wojciech Zaremba. The founders were driven by the \nrecognition of AI's transformative potenti