From 212c7cab5da57960c5cc022f57aa9e4ce0a646c3 Mon Sep 17 00:00:00 2001 From: Alexis Gallagher Date: Tue, 9 Sep 2025 12:26:20 -0700 Subject: [PATCH 1/3] Rename read_url to read_link This is to avoid a name collision with SolveIt's built-in read_url tool function --- contextkit/_modidx.py | 4 ++-- contextkit/read.py | 4 ++-- nbs/00_read.ipynb | 28 +++++++++++++++++++++++----- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/contextkit/_modidx.py b/contextkit/_modidx.py index c3a98c2..8041f62 100644 --- a/contextkit/_modidx.py +++ b/contextkit/_modidx.py @@ -17,6 +17,6 @@ 'contextkit.read.read_gh_repo': ('read.html#read_gh_repo', 'contextkit/read.py'), 'contextkit.read.read_gist': ('read.html#read_gist', 'contextkit/read.py'), 'contextkit.read.read_google_sheet': ('read.html#read_google_sheet', 'contextkit/read.py'), + 'contextkit.read.read_link': ('read.html#read_link', 'contextkit/read.py'), 'contextkit.read.read_pdf': ('read.html#read_pdf', 'contextkit/read.py'), - 'contextkit.read.read_text': ('read.html#read_text', 'contextkit/read.py'), - 'contextkit.read.read_url': ('read.html#read_url', 'contextkit/read.py')}}} + 'contextkit.read.read_text': ('read.html#read_text', 'contextkit/read.py')}}} diff --git a/contextkit/read.py b/contextkit/read.py index c2d0d6c..954a2c9 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -3,7 +3,7 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_read.ipynb. 
# %% auto 0 -__all__ = ['read_text', 'read_url', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', 'read_google_sheet', +__all__ = ['read_text', 'read_link', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', 'read_google_sheet', 'read_gdoc', 'read_arxiv', 'read_gh_repo'] # %% ../nbs/00_read.ipynb 5 @@ -30,7 +30,7 @@ def read_text(url, # URL to read return httpx.get(url, follow_redirects=True).text # %% ../nbs/00_read.ipynb 10 -def read_url(url: str, # URL to read +def read_link(url: str, # URL to read heavy: bool = False, # Use headless browser (requires extra setup steps before use) sel: Optional[str] = None, # Css selector to pull content from useJina: bool = False, # Use Jina for the markdown conversion diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 833351b..56fb1e7 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -131,7 +131,7 @@ "outputs": [], "source": [ "#| export\n", - "def read_url(url: str, # URL to read\n", + "def read_link(url: str, # URL to read\n", " heavy: bool = False, # Use headless browser (requires extra setup steps before use)\n", " sel: Optional[str] = None, # Css selector to pull content from\n", " useJina: bool = False, # Use Jina for the markdown conversion\n", @@ -164,7 +164,7 @@ } ], "source": [ - "read_url('https://fastht.ml/docs/', sel='#quarto-content')[:200]" + "read_link('https://fastht.ml/docs/', sel='#quarto-content')[:200]" ] }, { @@ -186,7 +186,7 @@ ], "source": [ "#| eval: false\n", - "read_url('https://fastht.ml/docs/',useJina=True)[:200]" + "read_link('https://fastht.ml/docs/',useJina=True)[:200]" ] }, { @@ -207,7 +207,7 @@ } ], "source": [ - "read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')" + "read_link('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')" ] }, { @@ -878,7 +878,25 @@ "source": [] } ], - "metadata": {}, + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, "nbformat": 4, "nbformat_minor": 5 } From 69156291f95a859a6ae05d79dea61fd89d2bce67 Mon Sep 17 00:00:00 2001 From: Alexis Gallagher Date: Tue, 9 Sep 2025 12:57:32 -0700 Subject: [PATCH 2/3] Mark read_url deprecated. Swap read_link into import all. This change makes it so: - `import *` now imports read_link - `import *` no longer imports read_url - `read_url` can still be imported directly by name - `read_url` prints a deprecation warning when called --- contextkit/__init__.py | 4 +-- contextkit/_modidx.py | 3 +- contextkit/read.py | 36 ++++++++++++++---------- nbs/00_read.ipynb | 63 ++++++++++++++++++++++++++++++++---------- 4 files changed, 74 insertions(+), 32 deletions(-) diff --git a/contextkit/__init__.py b/contextkit/__init__.py index 1578b6c..af59f4a 100644 --- a/contextkit/__init__.py +++ b/contextkit/__init__.py @@ -1,8 +1,8 @@ __version__ = "0.0.10" from . 
import read -from .read import read_url, read_gist, read_gh_file, read_file, read_dir, read_pdf, read_google_sheet, read_gdoc, read_arxiv, read_gh_repo +from .read import read_link, read_url, read_gist, read_gh_file, read_file, read_dir, read_pdf, read_google_sheet, read_gdoc, read_arxiv, read_gh_repo -__all__ = ["read", "read_url", "read_gist", "read_gh_file", "read_file", "read_dir", "read_pdf", "read_google_sheet", "read_gdoc", "read_arxiv", "read_gh_repo"] +__all__ = ["read", "read_link", "read_gist", "read_gh_file", "read_file", "read_dir", "read_pdf", "read_google_sheet", "read_gdoc", "read_arxiv", "read_gh_repo"] diff --git a/contextkit/_modidx.py b/contextkit/_modidx.py index 8041f62..adf9984 100644 --- a/contextkit/_modidx.py +++ b/contextkit/_modidx.py @@ -19,4 +19,5 @@ 'contextkit.read.read_google_sheet': ('read.html#read_google_sheet', 'contextkit/read.py'), 'contextkit.read.read_link': ('read.html#read_link', 'contextkit/read.py'), 'contextkit.read.read_pdf': ('read.html#read_pdf', 'contextkit/read.py'), - 'contextkit.read.read_text': ('read.html#read_text', 'contextkit/read.py')}}} + 'contextkit.read.read_text': ('read.html#read_text', 'contextkit/read.py'), + 'contextkit.read.read_url': ('read.html#read_url', 'contextkit/read.py')}}} diff --git a/contextkit/read.py b/contextkit/read.py index 954a2c9..fc4b2d0 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -3,15 +3,15 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_read.ipynb. 
# %% auto 0 -__all__ = ['read_text', 'read_link', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', 'read_google_sheet', - 'read_gdoc', 'read_arxiv', 'read_gh_repo'] +__all__ = ['read_url', 'read_text', 'read_link', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', + 'read_google_sheet', 'read_gdoc', 'read_arxiv', 'read_gh_repo'] # %% ../nbs/00_read.ipynb 5 import httpx import html2text from fastcore.all import delegates, ifnone -import re, os, glob, string +import re, os, glob, string, warnings, functools import requests import fnmatch, mimetypes @@ -44,7 +44,15 @@ def read_link(url: str, # URL to read return playwrightnb.url2md(url,sel=ifnone(sel,'body')) elif heavy and useJina: raise NotImplementedError("Unsupported. No benefit to using Jina with playwrightnb") -# %% ../nbs/00_read.ipynb 16 +# %% ../nbs/00_read.ipynb 14 +def read_url(*args,**kwargs): + warnings.warn("read_url() is deprecated, use read_link() instead", + DeprecationWarning, stacklevel=2) + return read_link(*args,**kwargs) + +read_url = functools.wraps(read_link)(read_url) + +# %% ../nbs/00_read.ipynb 18 def read_gist(url:str # gist URL, of gist to read ): "Returns raw gist content, or None" @@ -57,7 +65,7 @@ def read_gist(url:str # gist URL, of gist to read else: return None -# %% ../nbs/00_read.ipynb 20 +# %% ../nbs/00_read.ipynb 22 def read_gh_file(url:str # GitHub URL of the file to read ): "Reads the contents of a file from its GitHub URL" @@ -66,19 +74,19 @@ def read_gh_file(url:str # GitHub URL of the file to read raw_url = re.sub(pattern, replacement, url) return httpx.get(raw_url).text -# %% ../nbs/00_read.ipynb 24 +# %% ../nbs/00_read.ipynb 26 def read_file(path:str): "returns file contents" with open(path,'r') as f: return f.read() -# %% ../nbs/00_read.ipynb 25 +# %% ../nbs/00_read.ipynb 27 def _is_unicode(filepath:str, sample_size:int=1024): try: with open(filepath, 'r') as file: sample = file.read(sample_size) return True except UnicodeDecodeError: 
return False -# %% ../nbs/00_read.ipynb 28 +# %% ../nbs/00_read.ipynb 30 def read_dir(path: str, # path to read unicode_only: bool = True, # ignore non-unicode files included_patterns: List[str] = ["*"], # glob pattern of files to include @@ -106,7 +114,7 @@ def read_dir(path: str, # path to read else: return result -# %% ../nbs/00_read.ipynb 31 +# %% ../nbs/00_read.ipynb 33 def read_pdf(file_path: str # path of PDF file to read ) -> str: "Reads the text of a PDF with PdfReader" @@ -114,7 +122,7 @@ def read_pdf(file_path: str # path of PDF file to read reader = PdfReader(file) return ' '.join(page.extract_text() for page in reader.pages) -# %% ../nbs/00_read.ipynb 36 +# %% ../nbs/00_read.ipynb 38 def read_google_sheet(url: str # URL of a Google Sheet to read ): "Reads the contents of a Google Sheet into text" @@ -123,7 +131,7 @@ def read_google_sheet(url: str # URL of a Google Sheet to read res = requests.get(url=csv_url) return res.content -# %% ../nbs/00_read.ipynb 41 +# %% ../nbs/00_read.ipynb 43 def read_gdoc(url: str # URL of Google Doc to read ): "Gets the text content of a Google Doc using html2text" @@ -135,7 +143,7 @@ def read_gdoc(url: str # URL of Google Doc to read doc_content = html2text.html2text(html_doc_content) return doc_content -# %% ../nbs/00_read.ipynb 44 +# %% ../nbs/00_read.ipynb 46 def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID save_pdf:bool=False, # True, will save the downloaded PDF save_dir:str='.' # directory in which to save the PDF @@ -201,7 +209,7 @@ def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID return result -# %% ../nbs/00_read.ipynb 46 +# %% ../nbs/00_read.ipynb 48 def _gh_ssh_from_gh_url(gh_repo_address:str): "Given a GH URL or SSH remote address, returns a GH URL or None" pattern = r'https://github\.com/([^/]+)/([^/]+)(?:/.*)?' 
@@ -249,7 +257,7 @@ def _get_git_repo(gh_ssh:str): print(f"Error cloning repo from cwd {temp_dir} with error {e}") return None -# %% ../nbs/00_read.ipynb 47 +# %% ../nbs/00_read.ipynb 49 def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path as_dict:bool=True, # if True, will return repo contents {path,content} dict verbose:bool=False # if True, will log paths of files being read diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 56fb1e7..48fdd37 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -59,7 +59,7 @@ "import html2text\n", "from fastcore.all import delegates, ifnone\n", "\n", - "import re, os, glob, string\n", + "import re, os, glob, string, warnings, functools\n", "import requests\n", "import fnmatch, mimetypes\n", "\n", @@ -176,7 +176,7 @@ { "data": { "text/plain": [ - "'Title: FastHTML – fasthtml\\n\\nURL Source: https://fastht.ml/docs/\\n\\nMarkdown Content:\\nWelcome to the official FastHTML documentation.\\n\\nFastHTML is a new next-generation web framework for fast, scalable w'" + "'Title: FastHTML – fasthtml\\n\\nURL Source: https://fastht.ml/docs/\\n\\nPublished Time: Sun, 06 Jul 2025 21:56:52 GMT\\n\\nMarkdown Content:\\nWelcome to the official FastHTML documentation.\\n\\nFastHTML is a new nex'" ] }, "execution_count": null, @@ -210,6 +210,51 @@ "read_link('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "449d989a", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def read_url(*args,**kwargs):\n", + " warnings.warn(\"read_url() is deprecated, use read_link() instead\", \n", + " DeprecationWarning, stacklevel=2)\n", + " return read_link(*args,**kwargs)\n", + "\n", + "read_url = functools.wraps(read_link)(read_url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "922abf81", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_275424/3158535638.py:1: DeprecationWarning: read_url() is deprecated, use read_link() instead\n", + " read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')\n" + ] + }, + { + "data": { + "text/plain": [ + "'## On this page\\n\\n * Installation\\n * Usage\\n * Getting help from AI\\n * Next Steps\\n * Other languages and related projects\\n\\n * [__Report an issue](https://github.com/AnswerDotAI/fasthtml/issues/new)\\n\\n## Other Formats\\n\\n * [ __CommonMark](index.html.md)\\n\\n'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')" + ] + }, { "cell_type": "markdown", "id": "8f41a6a8", @@ -880,21 +925,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" } }, "nbformat": 4, From 9a4844102e0c8609187188b5111a42c7df930fec Mon Sep 17 00:00:00 2001 From: Alexis Gallagher Date: Tue, 9 Sep 2025 13:32:36 -0700 Subject: [PATCH 3/3] refine deprecation warning --- contextkit/read.py | 2 +- nbs/00_read.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contextkit/read.py b/contextkit/read.py index fc4b2d0..946badc 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -46,7 +46,7 @@ def read_link(url: str, # URL to read # %% ../nbs/00_read.ipynb 14 def read_url(*args,**kwargs): - warnings.warn("read_url() is deprecated, use read_link() instead", + warnings.warn("read_url() is deprecated, use read_link() instead. 
It is behaviorally identical.", DeprecationWarning, stacklevel=2) return read_link(*args,**kwargs) diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 48fdd37..7330546 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -219,7 +219,7 @@ "source": [ "#| export\n", "def read_url(*args,**kwargs):\n", - " warnings.warn(\"read_url() is deprecated, use read_link() instead\", \n", + " warnings.warn(\"read_url() is deprecated, use read_link() instead. It is behaviorally identical.\", \n", " DeprecationWarning, stacklevel=2)\n", " return read_link(*args,**kwargs)\n", "\n",