diff --git a/contextkit/__init__.py b/contextkit/__init__.py index 1578b6c..af59f4a 100644 --- a/contextkit/__init__.py +++ b/contextkit/__init__.py @@ -1,8 +1,8 @@ __version__ = "0.0.10" from . import read -from .read import read_url, read_gist, read_gh_file, read_file, read_dir, read_pdf, read_google_sheet, read_gdoc, read_arxiv, read_gh_repo +from .read import read_link, read_url, read_gist, read_gh_file, read_file, read_dir, read_pdf, read_google_sheet, read_gdoc, read_arxiv, read_gh_repo -__all__ = ["read", "read_url", "read_gist", "read_gh_file", "read_file", "read_dir", "read_pdf", "read_google_sheet", "read_gdoc", "read_arxiv", "read_gh_repo"] +__all__ = ["read", "read_link", "read_url", "read_gist", "read_gh_file", "read_file", "read_dir", "read_pdf", "read_google_sheet", "read_gdoc", "read_arxiv", "read_gh_repo"] diff --git a/contextkit/_modidx.py b/contextkit/_modidx.py index c3a98c2..adf9984 100644 --- a/contextkit/_modidx.py +++ b/contextkit/_modidx.py @@ -17,6 +17,7 @@ 'contextkit.read.read_gh_repo': ('read.html#read_gh_repo', 'contextkit/read.py'), 'contextkit.read.read_gist': ('read.html#read_gist', 'contextkit/read.py'), 'contextkit.read.read_google_sheet': ('read.html#read_google_sheet', 'contextkit/read.py'), + 'contextkit.read.read_link': ('read.html#read_link', 'contextkit/read.py'), 'contextkit.read.read_pdf': ('read.html#read_pdf', 'contextkit/read.py'), 'contextkit.read.read_text': ('read.html#read_text', 'contextkit/read.py'), 'contextkit.read.read_url': ('read.html#read_url', 'contextkit/read.py')}}} diff --git a/contextkit/read.py b/contextkit/read.py index c2d0d6c..946badc 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -3,15 +3,15 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_read.ipynb. 
# %% auto 0 -__all__ = ['read_text', 'read_url', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', 'read_google_sheet', - 'read_gdoc', 'read_arxiv', 'read_gh_repo'] +__all__ = ['read_url', 'read_text', 'read_link', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', + 'read_google_sheet', 'read_gdoc', 'read_arxiv', 'read_gh_repo'] # %% ../nbs/00_read.ipynb 5 import httpx import html2text from fastcore.all import delegates, ifnone -import re, os, glob, string +import re, os, glob, string, warnings, functools import requests import fnmatch, mimetypes @@ -30,7 +30,7 @@ def read_text(url, # URL to read return httpx.get(url, follow_redirects=True).text # %% ../nbs/00_read.ipynb 10 -def read_url(url: str, # URL to read +def read_link(url: str, # URL to read heavy: bool = False, # Use headless browser (requires extra setup steps before use) sel: Optional[str] = None, # Css selector to pull content from useJina: bool = False, # Use Jina for the markdown conversion @@ -44,7 +44,15 @@ def read_url(url: str, # URL to read return playwrightnb.url2md(url,sel=ifnone(sel,'body')) elif heavy and useJina: raise NotImplementedError("Unsupported. No benefit to using Jina with playwrightnb") -# %% ../nbs/00_read.ipynb 16 +# %% ../nbs/00_read.ipynb 14 +def read_url(*args,**kwargs): + warnings.warn("read_url() is deprecated, use read_link() instead. 
It is behaviorally identical.", + DeprecationWarning, stacklevel=2) + return read_link(*args,**kwargs) + +read_url = functools.wraps(read_link)(read_url) + +# %% ../nbs/00_read.ipynb 18 def read_gist(url:str # gist URL, of gist to read ): "Returns raw gist content, or None" @@ -57,7 +65,7 @@ def read_gist(url:str # gist URL, of gist to read else: return None -# %% ../nbs/00_read.ipynb 20 +# %% ../nbs/00_read.ipynb 22 def read_gh_file(url:str # GitHub URL of the file to read ): "Reads the contents of a file from its GitHub URL" @@ -66,19 +74,19 @@ def read_gh_file(url:str # GitHub URL of the file to read raw_url = re.sub(pattern, replacement, url) return httpx.get(raw_url).text -# %% ../nbs/00_read.ipynb 24 +# %% ../nbs/00_read.ipynb 26 def read_file(path:str): "returns file contents" with open(path,'r') as f: return f.read() -# %% ../nbs/00_read.ipynb 25 +# %% ../nbs/00_read.ipynb 27 def _is_unicode(filepath:str, sample_size:int=1024): try: with open(filepath, 'r') as file: sample = file.read(sample_size) return True except UnicodeDecodeError: return False -# %% ../nbs/00_read.ipynb 28 +# %% ../nbs/00_read.ipynb 30 def read_dir(path: str, # path to read unicode_only: bool = True, # ignore non-unicode files included_patterns: List[str] = ["*"], # glob pattern of files to include @@ -106,7 +114,7 @@ def read_dir(path: str, # path to read else: return result -# %% ../nbs/00_read.ipynb 31 +# %% ../nbs/00_read.ipynb 33 def read_pdf(file_path: str # path of PDF file to read ) -> str: "Reads the text of a PDF with PdfReader" @@ -114,7 +122,7 @@ def read_pdf(file_path: str # path of PDF file to read reader = PdfReader(file) return ' '.join(page.extract_text() for page in reader.pages) -# %% ../nbs/00_read.ipynb 36 +# %% ../nbs/00_read.ipynb 38 def read_google_sheet(url: str # URL of a Google Sheet to read ): "Reads the contents of a Google Sheet into text" @@ -123,7 +131,7 @@ def read_google_sheet(url: str # URL of a Google Sheet to read res = requests.get(url=csv_url) 
return res.content -# %% ../nbs/00_read.ipynb 41 +# %% ../nbs/00_read.ipynb 43 def read_gdoc(url: str # URL of Google Doc to read ): "Gets the text content of a Google Doc using html2text" @@ -135,7 +143,7 @@ def read_gdoc(url: str # URL of Google Doc to read doc_content = html2text.html2text(html_doc_content) return doc_content -# %% ../nbs/00_read.ipynb 44 +# %% ../nbs/00_read.ipynb 46 def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID save_pdf:bool=False, # True, will save the downloaded PDF save_dir:str='.' # directory in which to save the PDF @@ -201,7 +209,7 @@ def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID return result -# %% ../nbs/00_read.ipynb 46 +# %% ../nbs/00_read.ipynb 48 def _gh_ssh_from_gh_url(gh_repo_address:str): "Given a GH URL or SSH remote address, returns a GH URL or None" pattern = r'https://github\.com/([^/]+)/([^/]+)(?:/.*)?' @@ -249,7 +257,7 @@ def _get_git_repo(gh_ssh:str): print(f"Error cloning repo from cwd {temp_dir} with error {e}") return None -# %% ../nbs/00_read.ipynb 47 +# %% ../nbs/00_read.ipynb 49 def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path as_dict:bool=True, # if True, will return repo contents {path,content} dict verbose:bool=False # if True, will log paths of files being read diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 833351b..7330546 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -59,7 +59,7 @@ "import html2text\n", "from fastcore.all import delegates, ifnone\n", "\n", - "import re, os, glob, string\n", + "import re, os, glob, string, warnings, functools\n", "import requests\n", "import fnmatch, mimetypes\n", "\n", @@ -131,7 +131,7 @@ "outputs": [], "source": [ "#| export\n", - "def read_url(url: str, # URL to read\n", + "def read_link(url: str, # URL to read\n", " heavy: bool = False, # Use headless browser (requires extra setup steps before use)\n", " sel: Optional[str] = None, # Css selector to pull 
content from\n", " useJina: bool = False, # Use Jina for the markdown conversion\n", @@ -164,7 +164,7 @@ } ], "source": [ - "read_url('https://fastht.ml/docs/', sel='#quarto-content')[:200]" + "read_link('https://fastht.ml/docs/', sel='#quarto-content')[:200]" ] }, { @@ -176,7 +176,7 @@ { "data": { "text/plain": [ - "'Title: FastHTML – fasthtml\\n\\nURL Source: https://fastht.ml/docs/\\n\\nMarkdown Content:\\nWelcome to the official FastHTML documentation.\\n\\nFastHTML is a new next-generation web framework for fast, scalable w'" + "'Title: FastHTML – fasthtml\\n\\nURL Source: https://fastht.ml/docs/\\n\\nPublished Time: Sun, 06 Jul 2025 21:56:52 GMT\\n\\nMarkdown Content:\\nWelcome to the official FastHTML documentation.\\n\\nFastHTML is a new nex'" ] }, "execution_count": null, @@ -186,7 +186,7 @@ ], "source": [ "#| eval: false\n", - "read_url('https://fastht.ml/docs/',useJina=True)[:200]" + "read_link('https://fastht.ml/docs/',useJina=True)[:200]" ] }, { @@ -206,6 +206,51 @@ "output_type": "execute_result" } ], + "source": [ + "read_link('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "449d989a", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def read_url(*args,**kwargs):\n", + " warnings.warn(\"read_url() is deprecated, use read_link() instead. 
It is behaviorally identical.\", \n", + " DeprecationWarning, stacklevel=2)\n", + " return read_link(*args,**kwargs)\n", + "\n", + "read_url = functools.wraps(read_link)(read_url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "922abf81", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_275424/3158535638.py:1: DeprecationWarning: read_url() is deprecated, use read_link() instead\n", + " read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')\n" + ] + }, + { + "data": { + "text/plain": [ + "'## On this page\\n\\n * Installation\\n * Usage\\n * Getting help from AI\\n * Next Steps\\n * Other languages and related projects\\n\\n * [__Report an issue](https://github.com/AnswerDotAI/fasthtml/issues/new)\\n\\n## Other Formats\\n\\n * [ __CommonMark](index.html.md)\\n\\n'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')" ] @@ -878,7 +923,13 @@ "source": [] } ], - "metadata": {}, + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, "nbformat": 4, "nbformat_minor": 5 }