From 0e3f95dab8cd5b6ce51f9cc2215ce60663019ba4 Mon Sep 17 00:00:00 2001 From: Alexis Gallagher Date: Tue, 26 Aug 2025 13:34:16 -0700 Subject: [PATCH 1/3] add docments comments --- contextkit/read.py | 25 ++++++++++++++++++------- nbs/00_read.ipynb | 25 ++++++++++++++++++------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/contextkit/read.py b/contextkit/read.py index 2d3badb..f2a5b2c 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -45,7 +45,8 @@ def read_url(url: str, # URL to read elif heavy and useJina: raise NotImplementedError("Unsupported. No benefit to using Jina with playwrightnb") # %% ../nbs/00_read.ipynb 16 -def read_gist(url:str): +def read_gist(url:str # gist URL, of gist to read + ): "Returns raw gist content, or None" pattern = r'https://gist\.github\.com/([^/]+)/([^/]+)' match = re.match(pattern, url) @@ -57,7 +58,8 @@ def read_gist(url:str): return None # %% ../nbs/00_read.ipynb 20 -def read_gh_file(url:str): +def read_gh_file(url:str # GitHub URL of the file to read + ): "Reads the contents of a file from its GitHub URL" pattern = r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)' replacement = r'https://raw.githubusercontent.com/\1/\2/refs/heads/\3/\4' @@ -105,14 +107,16 @@ def read_dir(path: str, # path to read return result # %% ../nbs/00_read.ipynb 31 -def read_pdf(file_path: str) -> str: +def read_pdf(file_path: str # path of PDF file to read + ) -> str: "Reads the text of a PDF with PdfReader" with open(file_path, 'rb') as file: reader = PdfReader(file) return ' '.join(page.extract_text() for page in reader.pages) # %% ../nbs/00_read.ipynb 36 -def read_google_sheet(url: str): +def read_google_sheet(url: str # URL of a Google Sheet to read + ): "Reads the contents of a Google Sheet into text" sheet_id = url.split('/d/')[1].split('/')[0] csv_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid=0' @@ -120,7 +124,8 @@ def read_google_sheet(url: str): return res.content # 
%% ../nbs/00_read.ipynb 41 -def read_gdoc(url: str): +def read_gdoc(url: str # URL of Google Doc to read + ): "Gets the text content of a Google Doc using html2text" import html2text doc_url = url @@ -131,7 +136,10 @@ def read_gdoc(url: str): return doc_content # %% ../nbs/00_read.ipynb 44 -def read_arxiv(url:str, save_pdf:bool=False, save_dir:str='.'): +def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID + save_pdf:bool=False, # True, will save the downloaded PDF + save_dir:str='.' # directory in which to save the PDF + ): "Get paper information from arxiv URL or ID, optionally saving PDF to disk" import re, httpx, tarfile, io, os import xml.etree.ElementTree as ET @@ -242,7 +250,10 @@ def _get_git_repo(gh_ssh:str): return None # %% ../nbs/00_read.ipynb 47 -def read_gh_repo(path_or_url:str, as_dict:bool=True, verbose:bool=False): +def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path + as_dict:bool=True, + verbose:bool=False + ): "Repo contents from path, GH URL, or GH SSH address" gh_ssh = _gh_ssh_from_gh_url(path_or_url) path = path_or_url if not gh_ssh else _get_git_repo(gh_ssh) diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 08ce21e..4f5f12d 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -234,7 +234,8 @@ "outputs": [], "source": [ "#| export\n", - "def read_gist(url:str):\n", + "def read_gist(url:str # gist URL, of gist to read\n", + " ):\n", " \"Returns raw gist content, or None\"\n", " pattern = r'https://gist\\.github\\.com/([^/]+)/([^/]+)'\n", " match = re.match(pattern, url)\n", @@ -292,7 +293,8 @@ "outputs": [], "source": [ "#| export\n", - "def read_gh_file(url:str):\n", + "def read_gh_file(url:str # GitHub URL of the file to read\n", + " ):\n", " \"Reads the contents of a file from its GitHub URL\"\n", " pattern = r'https://github\\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)'\n", " replacement = r'https://raw.githubusercontent.com/\\1/\\2/refs/heads/\\3/\\4'\n", @@ -456,7 +458,8 @@ 
"outputs": [], "source": [ "#| export\n", - "def read_pdf(file_path: str) -> str:\n", + "def read_pdf(file_path: str # path of PDF file to read\n", + " ) -> str:\n", " \"Reads the text of a PDF with PdfReader\"\n", " with open(file_path, 'rb') as file:\n", " reader = PdfReader(file)\n", @@ -535,7 +538,8 @@ "outputs": [], "source": [ "#| export\n", - "def read_google_sheet(url: str):\n", + "def read_google_sheet(url: str # URL of a Google Sheet to read\n", + " ):\n", " \"Reads the contents of a Google Sheet into text\"\n", " sheet_id = url.split('/d/')[1].split('/')[0]\n", " csv_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid=0'\n", @@ -612,7 +616,8 @@ "outputs": [], "source": [ "#| export\n", - "def read_gdoc(url: str):\n", + "def read_gdoc(url: str # URL of Google Doc to read\n", + " ):\n", " \"Gets the text content of a Google Doc using html2text\"\n", " import html2text\n", " doc_url = url\n", @@ -660,7 +665,10 @@ "outputs": [], "source": [ "#| export\n", - "def read_arxiv(url:str, save_pdf:bool=False, save_dir:str='.'):\n", + "def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID\n", + " save_pdf:bool=False, # True, will save the downloaded PDF\n", + " save_dir:str='.' 
# directory in which to save the PDF\n", + " ):\n", " \"Get paper information from arxiv URL or ID, optionally saving PDF to disk\"\n", " import re, httpx, tarfile, io, os\n", " import xml.etree.ElementTree as ET\n", @@ -795,7 +803,10 @@ "outputs": [], "source": [ "#| export\n", - "def read_gh_repo(path_or_url:str, as_dict:bool=True, verbose:bool=False):\n", + "def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path\n", + " as_dict:bool=True, \n", + " verbose:bool=False\n", + " ):\n", " \"Repo contents from path, GH URL, or GH SSH address\"\n", " gh_ssh = _gh_ssh_from_gh_url(path_or_url)\n", " path = path_or_url if not gh_ssh else _get_git_repo(gh_ssh)\n", From fdfc490feefe8e670a3ac967fddaeecfb3d5e3ae Mon Sep 17 00:00:00 2001 From: Alexis Gallagher Date: Tue, 26 Aug 2025 13:36:52 -0700 Subject: [PATCH 2/3] Fix read_arxiv to use HTTPS This causes read_arxiv to use HTTPS, resolving the problem that arxiv.org now returns an error if you use HTTP. --- contextkit/read.py | 2 +- nbs/00_read.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contextkit/read.py b/contextkit/read.py index f2a5b2c..94c43f7 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -152,7 +152,7 @@ def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID version_num = version.group(1) if version else None arxiv_id = re.sub(r'v\d+$', '', arxiv_id) - api_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}' + api_url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}' response = httpx.get(api_url) diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 4f5f12d..40d9d7c 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -681,7 +681,7 @@ " version_num = version.group(1) if version else None\n", " arxiv_id = re.sub(r'v\\d+$', '', arxiv_id)\n", " \n", - " api_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'\n", + " api_url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}'\n", " 
\n", " response = httpx.get(api_url)\n", " \n", From 3807b4f2d528ee551fa28d4dfa92ed095f6d4dd0 Mon Sep 17 00:00:00 2001 From: Alexis Gallagher Date: Wed, 27 Aug 2025 10:49:18 -0700 Subject: [PATCH 3/3] fix docments docs on read_gh_repo --- contextkit/read.py | 4 ++-- nbs/00_read.ipynb | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contextkit/read.py b/contextkit/read.py index 94c43f7..c2d0d6c 100644 --- a/contextkit/read.py +++ b/contextkit/read.py @@ -251,8 +251,8 @@ def _get_git_repo(gh_ssh:str): # %% ../nbs/00_read.ipynb 47 def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path - as_dict:bool=True, - verbose:bool=False + as_dict:bool=True, # if True, will return repo contents {path,content} dict + verbose:bool=False # if True, will log paths of files being read ): "Repo contents from path, GH URL, or GH SSH address" gh_ssh = _gh_ssh_from_gh_url(path_or_url) diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb index 40d9d7c..03888d3 100644 --- a/nbs/00_read.ipynb +++ b/nbs/00_read.ipynb @@ -804,8 +804,8 @@ "source": [ "#| export\n", "def read_gh_repo(path_or_url:str, # Repo's GitHub URL, or GH SSH address, or file path\n", - " as_dict:bool=True, \n", - " verbose:bool=False\n", + " as_dict:bool=True, # if True, will return repo contents {path,content} dict\n", + " verbose:bool=False # if True, will log paths of files being read\n", " ):\n", " \"Repo contents from path, GH URL, or GH SSH address\"\n", " gh_ssh = _gh_ssh_from_gh_url(path_or_url)\n",