Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions contextkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
__version__ = "0.0.10"

from . import read
from .read import read_url, read_gist, read_gh_file, read_file, read_dir, read_pdf, read_google_sheet, read_gdoc, read_arxiv, read_gh_repo
from .read import read_link, read_url, read_gist, read_gh_file, read_file, read_dir, read_pdf, read_google_sheet, read_gdoc, read_arxiv, read_gh_repo

__all__ = ["read", "read_url", "read_gist", "read_gh_file", "read_file", "read_dir", "read_pdf", "read_google_sheet", "read_gdoc", "read_arxiv", "read_gh_repo"]
# read_url is still imported above as a deprecated alias of read_link, so it stays
# in the public API: `from contextkit import *` must keep working for existing callers.
__all__ = ["read", "read_link", "read_url", "read_gist", "read_gh_file", "read_file", "read_dir", "read_pdf", "read_google_sheet", "read_gdoc", "read_arxiv", "read_gh_repo"]


1 change: 1 addition & 0 deletions contextkit/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
'contextkit.read.read_gh_repo': ('read.html#read_gh_repo', 'contextkit/read.py'),
'contextkit.read.read_gist': ('read.html#read_gist', 'contextkit/read.py'),
'contextkit.read.read_google_sheet': ('read.html#read_google_sheet', 'contextkit/read.py'),
'contextkit.read.read_link': ('read.html#read_link', 'contextkit/read.py'),
'contextkit.read.read_pdf': ('read.html#read_pdf', 'contextkit/read.py'),
'contextkit.read.read_text': ('read.html#read_text', 'contextkit/read.py'),
'contextkit.read.read_url': ('read.html#read_url', 'contextkit/read.py')}}}
38 changes: 23 additions & 15 deletions contextkit/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_read.ipynb.

# %% auto 0
__all__ = ['read_text', 'read_url', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf', 'read_google_sheet',
'read_gdoc', 'read_arxiv', 'read_gh_repo']
__all__ = ['read_url', 'read_text', 'read_link', 'read_gist', 'read_gh_file', 'read_file', 'read_dir', 'read_pdf',
'read_google_sheet', 'read_gdoc', 'read_arxiv', 'read_gh_repo']

# %% ../nbs/00_read.ipynb 5
import httpx
import html2text
from fastcore.all import delegates, ifnone

import re, os, glob, string
import re, os, glob, string, warnings, functools
import requests
import fnmatch, mimetypes

Expand All @@ -30,7 +30,7 @@ def read_text(url, # URL to read
return httpx.get(url, follow_redirects=True).text

# %% ../nbs/00_read.ipynb 10
def read_url(url: str, # URL to read
def read_link(url: str, # URL to read
heavy: bool = False, # Use headless browser (requires extra setup steps before use)
sel: Optional[str] = None, # Css selector to pull content from
useJina: bool = False, # Use Jina for the markdown conversion
Expand All @@ -44,7 +44,15 @@ def read_url(url: str, # URL to read
return playwrightnb.url2md(url,sel=ifnone(sel,'body'))
elif heavy and useJina: raise NotImplementedError("Unsupported. No benefit to using Jina with playwrightnb")

# %% ../nbs/00_read.ipynb 16
# %% ../nbs/00_read.ipynb 14
@functools.wraps(read_link)  # the alias takes on read_link's name, docstring and other metadata
def read_url(*args, **kwargs):
    # Deprecated alias kept for backward compatibility: warn, then forward every
    # argument unchanged to read_link.
    # stacklevel=2 attributes the warning to the caller's line, not to this wrapper.
    warnings.warn(
        "read_url() is deprecated, use read_link() instead. It is behaviorally identical.",
        category=DeprecationWarning,
        stacklevel=2,
    )
    return read_link(*args, **kwargs)

# %% ../nbs/00_read.ipynb 18
def read_gist(url:str # gist URL, of gist to read
):
"Returns raw gist content, or None"
Expand All @@ -57,7 +65,7 @@ def read_gist(url:str # gist URL, of gist to read
else:
return None

# %% ../nbs/00_read.ipynb 20
# %% ../nbs/00_read.ipynb 22
def read_gh_file(url:str # GitHub URL of the file to read
):
"Reads the contents of a file from its GitHub URL"
Expand All @@ -66,19 +74,19 @@ def read_gh_file(url:str # GitHub URL of the file to read
raw_url = re.sub(pattern, replacement, url)
return httpx.get(raw_url).text

# %% ../nbs/00_read.ipynb 24
# %% ../nbs/00_read.ipynb 26
def read_file(path:str):
    "Opens `path` in text mode and returns its entire contents as a string"
    with open(path, 'r') as fh:
        contents = fh.read()
    return contents

# %% ../nbs/00_read.ipynb 25
# %% ../nbs/00_read.ipynb 27
def _is_unicode(filepath:str, sample_size:int=1024):
    """Report whether the start of `filepath` can be read as text.

    Opens the file in text mode (no explicit encoding, so the platform default
    applies) and reads up to `sample_size` characters. Returns True when that
    read succeeds and False when it raises UnicodeDecodeError. Any other error,
    such as a missing file, propagates to the caller.
    """
    try:
        # Only whether the read decodes matters, so the sampled text is discarded.
        with open(filepath, 'r') as file: file.read(sample_size)
    except UnicodeDecodeError: return False
    return True

# %% ../nbs/00_read.ipynb 28
# %% ../nbs/00_read.ipynb 30
def read_dir(path: str, # path to read
unicode_only: bool = True, # ignore non-unicode files
included_patterns: List[str] = ["*"], # glob pattern of files to include
Expand Down Expand Up @@ -106,15 +114,15 @@ def read_dir(path: str, # path to read
else:
return result

# %% ../nbs/00_read.ipynb 31
# %% ../nbs/00_read.ipynb 33
def read_pdf(file_path: str # path of PDF file to read
            ) -> str:
    "Extracts the text of every page with PdfReader and joins the pages with single spaces"
    with open(file_path, 'rb') as pdf_handle:
        pdf_pages = PdfReader(pdf_handle).pages
        # Extract while the file is still open; the reader pulls from the handle.
        page_texts = [pg.extract_text() for pg in pdf_pages]
    return ' '.join(page_texts)

# %% ../nbs/00_read.ipynb 36
# %% ../nbs/00_read.ipynb 38
def read_google_sheet(url: str # URL of a Google Sheet to read
):
"Reads the contents of a Google Sheet into text"
Expand All @@ -123,7 +131,7 @@ def read_google_sheet(url: str # URL of a Google Sheet to read
res = requests.get(url=csv_url)
return res.content

# %% ../nbs/00_read.ipynb 41
# %% ../nbs/00_read.ipynb 43
def read_gdoc(url: str # URL of Google Doc to read
):
"Gets the text content of a Google Doc using html2text"
Expand All @@ -135,7 +143,7 @@ def read_gdoc(url: str # URL of Google Doc to read
doc_content = html2text.html2text(html_doc_content)
return doc_content

# %% ../nbs/00_read.ipynb 44
# %% ../nbs/00_read.ipynb 46
def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID
save_pdf:bool=False, # True, will save the downloaded PDF
save_dir:str='.' # directory in which to save the PDF
Expand Down Expand Up @@ -201,7 +209,7 @@ def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID

return result

# %% ../nbs/00_read.ipynb 46
# %% ../nbs/00_read.ipynb 48
def _gh_ssh_from_gh_url(gh_repo_address:str):
"Given a GH URL or SSH remote address, returns a GH URL or None"
pattern = r'https://github\.com/([^/]+)/([^/]+)(?:/.*)?'
Expand Down Expand Up @@ -249,7 +257,7 @@ def _get_git_repo(gh_ssh:str):
print(f"Error cloning repo from cwd {temp_dir} with error {e}")
return None

# %% ../nbs/00_read.ipynb 47
# %% ../nbs/00_read.ipynb 49
def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path
as_dict:bool=True, # if True, will return repo contents {path,content} dict
verbose:bool=False # if True, will log paths of files being read
Expand Down
63 changes: 57 additions & 6 deletions nbs/00_read.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
"import html2text\n",
"from fastcore.all import delegates, ifnone\n",
"\n",
"import re, os, glob, string\n",
"import re, os, glob, string, warnings, functools\n",
"import requests\n",
"import fnmatch, mimetypes\n",
"\n",
Expand Down Expand Up @@ -131,7 +131,7 @@
"outputs": [],
"source": [
"#| export\n",
"def read_url(url: str, # URL to read\n",
"def read_link(url: str, # URL to read\n",
" heavy: bool = False, # Use headless browser (requires extra setup steps before use)\n",
" sel: Optional[str] = None, # Css selector to pull content from\n",
" useJina: bool = False, # Use Jina for the markdown conversion\n",
Expand Down Expand Up @@ -164,7 +164,7 @@
}
],
"source": [
"read_url('https://fastht.ml/docs/', sel='#quarto-content')[:200]"
"read_link('https://fastht.ml/docs/', sel='#quarto-content')[:200]"
]
},
{
Expand All @@ -176,7 +176,7 @@
{
"data": {
"text/plain": [
"'Title: FastHTML – fasthtml\\n\\nURL Source: https://fastht.ml/docs/\\n\\nMarkdown Content:\\nWelcome to the official FastHTML documentation.\\n\\nFastHTML is a new next-generation web framework for fast, scalable w'"
"'Title: FastHTML – fasthtml\\n\\nURL Source: https://fastht.ml/docs/\\n\\nPublished Time: Sun, 06 Jul 2025 21:56:52 GMT\\n\\nMarkdown Content:\\nWelcome to the official FastHTML documentation.\\n\\nFastHTML is a new nex'"
]
},
"execution_count": null,
Expand All @@ -186,7 +186,7 @@
],
"source": [
"#| eval: false\n",
"read_url('https://fastht.ml/docs/',useJina=True)[:200]"
"read_link('https://fastht.ml/docs/',useJina=True)[:200]"
]
},
{
Expand All @@ -206,6 +206,51 @@
"output_type": "execute_result"
}
],
"source": [
"read_link('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "449d989a",
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"def read_url(*args,**kwargs):\n",
" warnings.warn(\"read_url() is deprecated, use read_link() instead. It is behaviorally identical.\", \n",
" DeprecationWarning, stacklevel=2)\n",
" return read_link(*args,**kwargs)\n",
"\n",
"read_url = functools.wraps(read_link)(read_url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "922abf81",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_275424/3158535638.py:1: DeprecationWarning: read_url() is deprecated, use read_link() instead. It is behaviorally identical.\n",
" read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')\n"
]
},
{
"data": {
"text/plain": [
"'## On this page\\n\\n * Installation\\n * Usage\\n * Getting help from AI\\n * Next Steps\\n * Other languages and related projects\\n\\n * [__Report an issue](https://github.com/AnswerDotAI/fasthtml/issues/new)\\n\\n## Other Formats\\n\\n * [ __CommonMark](index.html.md)\\n\\n'"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"read_url('https://fastht.ml/docs/',sel='#quarto-margin-sidebar')"
]
Expand Down Expand Up @@ -878,7 +923,13 @@
"source": []
}
],
"metadata": {},
"metadata": {
"kernelspec": {
"display_name": "python3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}