From 0e3f95dab8cd5b6ce51f9cc2215ce60663019ba4 Mon Sep 17 00:00:00 2001
From: Alexis Gallagher <alexis@alexisgallagher.com>
Date: Tue, 26 Aug 2025 13:34:16 -0700
Subject: [PATCH 1/3] add docments comments

---
 contextkit/read.py | 25 ++++++++++++++++++-------
 nbs/00_read.ipynb  | 25 ++++++++++++++++++-------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/contextkit/read.py b/contextkit/read.py
index 2d3badb..f2a5b2c 100644
--- a/contextkit/read.py
+++ b/contextkit/read.py
@@ -45,7 +45,8 @@ def read_url(url: str,   # URL to read
     elif heavy and useJina: raise NotImplementedError("Unsupported. No benefit to using Jina with playwrightnb")
 
 # %% ../nbs/00_read.ipynb 16
-def read_gist(url:str):
+def read_gist(url:str  # gist URL, of gist to read
+             ):
     "Returns raw gist content, or None"
     pattern = r'https://gist\.github\.com/([^/]+)/([^/]+)'
     match = re.match(pattern, url)
@@ -57,7 +58,8 @@ def read_gist(url:str):
         return None
 
 # %% ../nbs/00_read.ipynb 20
-def read_gh_file(url:str):
+def read_gh_file(url:str # GitHub URL of the file to read
+                ):
     "Reads the contents of a file from its GitHub URL"
     pattern = r'https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)'
     replacement = r'https://raw.githubusercontent.com/\1/\2/refs/heads/\3/\4'
@@ -105,14 +107,16 @@ def read_dir(path: str,                          # path to read
         return result
 
 # %% ../nbs/00_read.ipynb 31
-def read_pdf(file_path: str) -> str:
+def read_pdf(file_path: str # path of PDF file to read
+            ) -> str:
     "Reads the text of a PDF with PdfReader"
     with open(file_path, 'rb') as file:
         reader = PdfReader(file)
         return ' '.join(page.extract_text() for page in reader.pages)
 
 # %% ../nbs/00_read.ipynb 36
-def read_google_sheet(url: str):
+def read_google_sheet(url: str # URL of a Google Sheet to read
+                     ):
     "Reads the contents of a Google Sheet into text"
     sheet_id = url.split('/d/')[1].split('/')[0]
     csv_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid=0'
@@ -120,7 +124,8 @@ def read_google_sheet(url: str):
     return res.content
 
 # %% ../nbs/00_read.ipynb 41
-def read_gdoc(url: str):
+def read_gdoc(url: str  # URL of Google Doc to read
+             ):
     "Gets the text content of a Google Doc using html2text"
     import html2text
     doc_url = url
@@ -131,7 +136,10 @@ def read_gdoc(url: str):
     return doc_content
 
 # %% ../nbs/00_read.ipynb 44
-def read_arxiv(url:str, save_pdf:bool=False, save_dir:str='.'):
+def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID
+               save_pdf:bool=False, # True, will save the downloaded PDF
+               save_dir:str='.' # directory in which to save the PDF
+              ):
     "Get paper information from arxiv URL or ID, optionally saving PDF to disk"
     import re, httpx, tarfile, io, os
     import xml.etree.ElementTree as ET
@@ -242,7 +250,10 @@ def _get_git_repo(gh_ssh:str):
             return None
 
 # %% ../nbs/00_read.ipynb 47
-def read_gh_repo(path_or_url:str, as_dict:bool=True, verbose:bool=False):
+def read_gh_repo(path_or_url:str,    # Repo's GitHub URL, or GH SSH address, or file path
+                 as_dict:bool=True, 
+                 verbose:bool=False
+                ):
     "Repo contents from path, GH URL, or GH SSH address"
     gh_ssh = _gh_ssh_from_gh_url(path_or_url)
     path = path_or_url if not gh_ssh else _get_git_repo(gh_ssh)
diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb
index 08ce21e..4f5f12d 100644
--- a/nbs/00_read.ipynb
+++ b/nbs/00_read.ipynb
@@ -234,7 +234,8 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_gist(url:str):\n",
+    "def read_gist(url:str  # gist URL, of gist to read\n",
+    "             ):\n",
     "    \"Returns raw gist content, or None\"\n",
     "    pattern = r'https://gist\\.github\\.com/([^/]+)/([^/]+)'\n",
     "    match = re.match(pattern, url)\n",
@@ -292,7 +293,8 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_gh_file(url:str):\n",
+    "def read_gh_file(url:str # GitHub URL of the file to read\n",
+    "                ):\n",
     "    \"Reads the contents of a file from its GitHub URL\"\n",
     "    pattern = r'https://github\\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)'\n",
     "    replacement = r'https://raw.githubusercontent.com/\\1/\\2/refs/heads/\\3/\\4'\n",
@@ -456,7 +458,8 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_pdf(file_path: str) -> str:\n",
+    "def read_pdf(file_path: str # path of PDF file to read\n",
+    "            ) -> str:\n",
     "    \"Reads the text of a PDF with PdfReader\"\n",
     "    with open(file_path, 'rb') as file:\n",
     "        reader = PdfReader(file)\n",
@@ -535,7 +538,8 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_google_sheet(url: str):\n",
+    "def read_google_sheet(url: str # URL of a Google Sheet to read\n",
+    "                     ):\n",
     "    \"Reads the contents of a Google Sheet into text\"\n",
     "    sheet_id = url.split('/d/')[1].split('/')[0]\n",
     "    csv_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&id={sheet_id}&gid=0'\n",
@@ -612,7 +616,8 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_gdoc(url: str):\n",
+    "def read_gdoc(url: str  # URL of Google Doc to read\n",
+    "             ):\n",
     "    \"Gets the text content of a Google Doc using html2text\"\n",
     "    import html2text\n",
     "    doc_url = url\n",
@@ -660,7 +665,10 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_arxiv(url:str, save_pdf:bool=False, save_dir:str='.'):\n",
+    "def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID\n",
+    "               save_pdf:bool=False, # True, will save the downloaded PDF\n",
+    "               save_dir:str='.' # directory in which to save the PDF\n",
+    "              ):\n",
     "    \"Get paper information from arxiv URL or ID, optionally saving PDF to disk\"\n",
     "    import re, httpx, tarfile, io, os\n",
     "    import xml.etree.ElementTree as ET\n",
@@ -795,7 +803,10 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def read_gh_repo(path_or_url:str, as_dict:bool=True, verbose:bool=False):\n",
+    "def read_gh_repo(path_or_url:str,    # Repo's GitHub URL, or GH SSH address, or file path\n",
+    "                 as_dict:bool=True, \n",
+    "                 verbose:bool=False\n",
+    "                ):\n",
     "    \"Repo contents from path, GH URL, or GH SSH address\"\n",
     "    gh_ssh = _gh_ssh_from_gh_url(path_or_url)\n",
     "    path = path_or_url if not gh_ssh else _get_git_repo(gh_ssh)\n",

From fdfc490feefe8e670a3ac967fddaeecfb3d5e3ae Mon Sep 17 00:00:00 2001
From: Alexis Gallagher <alexis@alexisgallagher.com>
Date: Tue, 26 Aug 2025 13:36:52 -0700
Subject: [PATCH 2/3] Fix read_arxiv to use HTTPS

This causes read_arxiv to use HTTPS, resolving the problem that
arxiv.org now returns an error if you use plain HTTP.
---
 contextkit/read.py | 2 +-
 nbs/00_read.ipynb  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/contextkit/read.py b/contextkit/read.py
index f2a5b2c..94c43f7 100644
--- a/contextkit/read.py
+++ b/contextkit/read.py
@@ -152,7 +152,7 @@ def read_arxiv(url:str, # arxiv PDF URL, or arxiv abstract URL, or arxiv ID
     version_num = version.group(1) if version else None
     arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
     
-    api_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'
+    api_url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}'
     
     response = httpx.get(api_url)
     
diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb
index 4f5f12d..40d9d7c 100644
--- a/nbs/00_read.ipynb
+++ b/nbs/00_read.ipynb
@@ -681,7 +681,7 @@
     "    version_num = version.group(1) if version else None\n",
     "    arxiv_id = re.sub(r'v\\d+$', '', arxiv_id)\n",
     "    \n",
-    "    api_url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'\n",
+    "    api_url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}'\n",
     "    \n",
     "    response = httpx.get(api_url)\n",
     "    \n",

From 3807b4f2d528ee551fa28d4dfa92ed095f6d4dd0 Mon Sep 17 00:00:00 2001
From: Alexis Gallagher <alexis@alexisgallagher.com>
Date: Wed, 27 Aug 2025 10:49:18 -0700
Subject: [PATCH 3/3] fix docments docs on read_gh_repo

---
 contextkit/read.py | 4 ++--
 nbs/00_read.ipynb  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/contextkit/read.py b/contextkit/read.py
index 94c43f7..c2d0d6c 100644
--- a/contextkit/read.py
+++ b/contextkit/read.py
@@ -251,8 +251,8 @@ def _get_git_repo(gh_ssh:str):
 
 # %% ../nbs/00_read.ipynb 47
 def read_gh_repo(path_or_url:str,    # Repo's GitHub URL, or GH SSH address, or file path
-                 as_dict:bool=True, 
-                 verbose:bool=False
+                 as_dict:bool=True,  # if True, will return repo contents {path,content} dict
+                 verbose:bool=False  # if True, will log paths of files being read
                 ):
     "Repo contents from path, GH URL, or GH SSH address"
     gh_ssh = _gh_ssh_from_gh_url(path_or_url)
diff --git a/nbs/00_read.ipynb b/nbs/00_read.ipynb
index 40d9d7c..03888d3 100644
--- a/nbs/00_read.ipynb
+++ b/nbs/00_read.ipynb
@@ -804,8 +804,8 @@
    "source": [
     "#| export\n",
     "def read_gh_repo(path_or_url:str,    # Repo's GitHub URL, or GH SSH address, or file path\n",
-    "                 as_dict:bool=True, \n",
-    "                 verbose:bool=False\n",
+    "                 as_dict:bool=True,  # if True, will return repo contents {path,content} dict\n",
+    "                 verbose:bool=False  # if True, will log paths of files being read\n",
     "                ):\n",
     "    \"Repo contents from path, GH URL, or GH SSH address\"\n",
     "    gh_ssh = _gh_ssh_from_gh_url(path_or_url)\n",