In [25]:
import requests
import typing
from datetime import datetime
from bs4 import BeautifulSoup
import csv
import psycopg
import os

In [26]:
# Download the PyPI profile page that lists the organisation's packages.
url = "https://pypi.org/user/bbp.opensource/"
# requests has no default timeout; without one this cell can hang forever
# if PyPI does not answer.
page: requests.Response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of silently parsing an error page
# into an empty package list further down.
page.raise_for_status()
print(f"Page downloaded, length: {len(page.content)}")
soup = BeautifulSoup(page.content, "html.parser")

Page downloaded, length: 42917


In [27]:
# Collect every <a> element carrying the "package-snippet" CSS class;
# each one is parsed into a package record below.
package_snippets = soup.find_all("a", attrs={"class": "package-snippet"})

class PyPiPackageInfo:
  """Plain record holding the metadata of one PyPI package.

  Attributes:
    name: Package name.
    url: URL of the package's project page.
    description: Short description text of the package.
    last_release: Timestamp of the most recent release.
  """

  def __init__(self, name: str, url: str, description: str, last_release: datetime):
    """Store the four metadata values unchanged on the instance."""
    self.name, self.url = name, url
    self.description, self.last_release = description, last_release

  def __repr__(self):
    """Return a one-line, human-readable summary of the package."""
    return f"PyPi package {self.name}, URL: {self.url},  lastrelease: {self.last_release}, description: {self.description}"

  def as_dict(self) -> typing.Dict[str,typing.Any]:
    """Return the attributes as a dict keyed by attribute name.

    Key order is name, url, description, last_release; the CSV export
    uses that order for its columns.
    """
    field_names = ('name', 'url', 'description', 'last_release')
    return {field: getattr(self, field) for field in field_names}

def parse_package_snippet(snippet) -> PyPiPackageInfo:
  """Build a PyPiPackageInfo from one "package-snippet" anchor element.

  Args:
    snippet: A BeautifulSoup tag for an <a class="package-snippet"> element.

  Returns:
    The package record, with surrounding whitespace removed from the
    name and description.

  Raises:
    ValueError: If the title, description or <time> element is missing,
      i.e. the page markup no longer matches what this parser expects.
  """
  title_el = snippet.find("h3", class_="package-snippet__title")
  description_el = snippet.find("p", class_="package-snippet__description")
  time_el = snippet.find("time")
  if title_el is None or description_el is None or time_el is None:
    raise ValueError(f"Unexpected package snippet markup: {snippet}")

  return PyPiPackageInfo(
    name=title_el.text.strip(),
    # The href is site-relative (e.g. /project/<name>/), so prefix the host.
    url='https://pypi.org' + snippet["href"],
    # The raw text can carry stray whitespace (e.g. "Brayns "), so strip it.
    description=description_el.text.strip(),
    # The datetime attribute carries a UTC offset, so the result is timezone-aware.
    last_release=datetime.strptime(time_el["datetime"], '%Y-%m-%dT%H:%M:%S%z'),
  )

# Parsing happens inside the helper so this cell no longer overwrites the
# module-level `url` that holds the profile page address.
all_packages: typing.List[PyPiPackageInfo] = [
  parse_package_snippet(snippet) for snippet in package_snippets
]

for i, package in enumerate(all_packages):
  print(f"{i}: {package}")

0: PyPi package blueetl, URL: https://pypi.org/project/blueetl/,  lastrelease: 2024-02-29 13:08:57+00:00, description: Multiple simulations analysis tool
1: PyPi package bluepyemodel, URL: https://pypi.org/project/bluepyemodel/,  lastrelease: 2024-02-28 14:39:03+00:00, description: Blue Brain Python Electrical Modeling Pipeline
2: PyPi package neurodamus, URL: https://pypi.org/project/neurodamus/,  lastrelease: 2024-02-27 12:58:16+00:00, description: A BBP Simulation Control application for NEURON
3: PyPi package data-validation-framework, URL: https://pypi.org/project/data-validation-framework/,  lastrelease: 2024-02-26 18:03:09+00:00, description: Simple framework to create data validation workflows.
4: PyPi package AstroVascPy, URL: https://pypi.org/project/astrovascpy/,  lastrelease: 2024-02-22 15:53:33+00:00, description: Simulating blood flow in vasculature
5: PyPi package brayns, URL: https://pypi.org/project/brayns/,  lastrelease: 2024-02-22 15:41:49+00:00, description: Brayns 

Export to CSV:

In [28]:
filename="pypi-projects.csv"

# Check before opening: mode 'w' truncates the file immediately, so an empty
# scrape would otherwise wipe the previous export and then crash on
# all_packages[0].
if not all_packages:
    raise ValueError(f"No packages were scraped; refusing to overwrite {filename}")

# Column order follows the key order of PyPiPackageInfo.as_dict().
fieldnames = list(all_packages[0].as_dict().keys())

# newline='' lets the csv module control line endings; the explicit UTF-8
# encoding keeps the output independent of the platform default.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(package.as_dict() for package in all_packages)


Write to DB:

Note: this cell expects the environment variable DB_CONNECTION_STRING to hold a psycopg3-compatible connection string,
for example "dbname=test user=postgres".

An example table:

`CREATE TABLE assets_pypi_package (name varchar(256) primary key, url text not null, description text, last_update timestamp)`

In [29]:
# The connection string contains a password, so it must come from the
# environment and never be written into the notebook.
db_connection_string = os.environ.get('DB_CONNECTION_STRING')
if not db_connection_string:
  raise RuntimeError(
    "Environment variable DB_CONNECTION_STRING is not set; "
    "export a psycopg3 connection string before running this cell")

# NOTE(review): if the target table has a primary key on `name`, re-running
# this cell against an already populated table will fail with a unique
# violation -- confirm whether an upsert (ON CONFLICT) is wanted.
# NOTE(review): last_release is timezone-aware; if last_update is a plain
# `timestamp` column, the stored value presumably depends on the session
# time zone -- confirm.
with psycopg.connect(db_connection_string) as conn:
  with conn.cursor() as cur:
    for package in all_packages:
      # Values are passed as query parameters, never formatted into the SQL text.
      cur.execute(
            "INSERT INTO assets_pypi_package (name, url, description, last_update) VALUES (%s, %s, %s, %s)",
            (package.name, package.url, package.description, package.last_release))
  conn.commit()

SyntaxError: incomplete input (1494787593.py, line 4)