In [1]:
!pip install sqlmodel==0.0.22 -q
!pip install beautifulsoup4==4.12.0 -q

In [1]:
import requests
import sqlmodel
from bs4 import BeautifulSoup as bs
from typing import List
from sqlmodel import select

In [2]:
class ArtistArtMovement(sqlmodel.SQLModel, table=True):
    """One row per (artist, art movement) pair; child side of ``Artist.art_movements``."""

    # Optional with a None default: rows are constructed without an id and the
    # primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    art_movement: str
    artist: "Artist" = sqlmodel.Relationship(back_populates="art_movements")


class ArtistSchool(sqlmodel.SQLModel, table=True):
    """One row per (artist, painting school or group) pair; child side of ``Artist.schools``."""

    # Optional with a None default: rows are constructed without an id and the
    # primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    school: str
    artist: "Artist" = sqlmodel.Relationship(back_populates="schools")


class ArtistGenre(sqlmodel.SQLModel, table=True):
    """One row per (artist, genre) pair; child side of ``Artist.genres``."""

    # Optional with a None default: rows are constructed without an id and the
    # primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    genre: str
    artist: "Artist" = sqlmodel.Relationship(back_populates="genres")


class ArtistField(sqlmodel.SQLModel, table=True):
    """One row per (artist, field) pair; child side of ``Artist.fields``."""

    # Optional with a None default: rows are constructed without an id and the
    # primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    field: str
    artist: "Artist" = sqlmodel.Relationship(back_populates="fields")


class ArtistNationality(sqlmodel.SQLModel, table=True):
    """One row per (artist, nationality) pair; child side of ``Artist.nationalities``."""

    # Optional with a None default: rows are constructed without an id and the
    # primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    nationality: str
    artist: "Artist" = sqlmodel.Relationship(back_populates="nationalities")


class ArtistInstitution(sqlmodel.SQLModel, table=True):
    """One row per (artist, art institution) pair; child side of ``Artist.institutions``."""

    # Optional with a None default: rows are constructed without an id and the
    # primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artist_id: int = sqlmodel.Field(foreign_key="artist.id")
    institution: str
    artist: "Artist" = sqlmodel.Relationship(back_populates="institutions")


class ArtistBase(sqlmodel.SQLModel):
    """Column definitions shared by artist models.

    Only ``slug`` is required. Every nullable field defaults to ``None`` so an
    artist can be created from its slug alone and filled in later.
    """

    # Left as None on construction; populated once the row has been inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    name: str | None = None
    slug: str = sqlmodel.Field(index=True)  # /en/claude-monet
    wikipedia_article: str | None = None
    wikipedia_url: str | None = None
    article: str | None = None


class Artist(ArtistBase, table=True):
    """Artist table: the ArtistBase columns plus relationships to related rows.

    Each relationship names ``artist`` as its counterpart attribute on the
    related model via ``back_populates``.
    """

    # "Artwork" is referenced by name only; it is declared in a later cell.
    artworks: list["Artwork"] = sqlmodel.Relationship(back_populates='artist')
    # One list per Artist* link table declared alongside this class.
    art_movements: list["ArtistArtMovement"] = sqlmodel.Relationship(back_populates='artist')
    schools: list["ArtistSchool"] = sqlmodel.Relationship(back_populates='artist')
    genres: list["ArtistGenre"] = sqlmodel.Relationship(back_populates='artist')
    fields: list["ArtistField"] = sqlmodel.Relationship(back_populates='artist')
    nationalities: list["ArtistNationality"] = sqlmodel.Relationship(back_populates='artist')
    institutions: list["ArtistInstitution"] = sqlmodel.Relationship(back_populates='artist')

In [3]:
class ArtworkStyle(sqlmodel.SQLModel, table=True):
    """One row per (artwork, style) pair; child side of ``Artwork.styles``."""

    # Optional with a None default so a row can be constructed without an id;
    # the primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artwork_id: int = sqlmodel.Field(foreign_key="artwork.id")
    style: str
    artwork: "Artwork" = sqlmodel.Relationship(back_populates="styles")


class ArtworkGenre(sqlmodel.SQLModel, table=True):
    """One row per (artwork, genre) pair; child side of ``Artwork.genres``."""

    # Optional with a None default so a row can be constructed without an id;
    # the primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artwork_id: int = sqlmodel.Field(foreign_key="artwork.id")
    genre: str
    artwork: "Artwork" = sqlmodel.Relationship(back_populates="genres")


class ArtworkMedia(sqlmodel.SQLModel, table=True):
    """One row per (artwork, medium) pair; child side of ``Artwork.media``."""

    # Optional with a None default so a row can be constructed without an id;
    # the primary key is filled in when the row is inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    artwork_id: int = sqlmodel.Field(foreign_key="artwork.id")
    media: str
    artwork: "Artwork" = sqlmodel.Relationship(back_populates="media")


class ArtworkBase(sqlmodel.SQLModel):
    """Column definitions shared by artwork models.

    Only ``url`` is required. The nullable fields default to ``None`` so an
    artwork can be recorded before its name or artist is known.
    """

    # Left as None on construction; populated once the row has been inserted.
    id: int | None = sqlmodel.Field(default=None, primary_key=True)
    # Seems like sometimes an artwork is under a collection, and sometimes under an artist.
    url: str = sqlmodel.Field(index=True)
    name: str | None = None
    artist_id: int | None = sqlmodel.Field(default=None, foreign_key="artist.id")


class Artwork(ArtworkBase, table=True):
    """Artwork table: the ArtworkBase columns plus relationships to related rows."""

    # Counterpart of Artist.artworks.
    # NOTE(review): artist_id is declared nullable on ArtworkBase while this
    # annotation is non-optional — confirm whether an artwork may have no artist.
    artist: Artist = sqlmodel.Relationship(back_populates="artworks")
    # One list per Artwork* link table; each names ``artwork`` as its counterpart.
    styles: list[ArtworkStyle] = sqlmodel.Relationship(back_populates="artwork")
    genres: list[ArtworkGenre] = sqlmodel.Relationship(back_populates="artwork")
    media: list[ArtworkMedia] = sqlmodel.Relationship(back_populates="artwork")

In [4]:
# Build the engine for the local wikiart.db SQLite database, then create the
# tables for every model registered on SQLModel.metadata so far.
DATABASE_URL = "sqlite:///wikiart.db"
engine = sqlmodel.create_engine(DATABASE_URL)
sqlmodel.SQLModel.metadata.create_all(bind=engine)

In [5]:
_WIKIART_BASE_URL = "https://www.wikiart.org"
# Without a timeout, requests waits indefinitely on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 30


def _get_soup(url: str):
    """Fetch ``url`` and return its body parsed with the 'html.parser' backend."""
    resp = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    return bs(resp.text, 'html.parser')


def _get_category_links(url: str, label: str) -> list[tuple[str, str]]:
    """Return ``(text, href)`` for every linked entry of the page's dictionary list.

    Args:
        url: Absolute URL of an "artists by ..." index page.
        label: Plural noun used in the error message (e.g. "movements").

    Raises:
        ValueError: If the page does not contain exactly one matching ``<ul>``
            (this covers both "none found" and "more than one found").
    """
    soup = _get_soup(url)
    bullets = soup.find_all('ul', {"class": "dictionaries-list", "ng-if": "!tabChanged"})
    if len(bullets) != 1:
        raise ValueError(f"Expected exactly one list of {label}, found {len(bullets)}")
    # NOTE(review): the item text appears to include a trailing artist count
    # (e.g. 'Albanians 3' in the recorded output) — confirm whether callers
    # want that stripped before it is stored.
    return [(li.text.strip(), li.a["href"]) for li in bullets[0].find_all("li") if li.a and "href" in li.a.attrs]


def _get_artist_slugs(category_slug: str) -> list[str]:
    """Return the artist hrefs listed on the ``/text-list`` page of a category.

    Args:
        category_slug: Site-relative path of the category, as returned in the
            second element of the tuples from the ``get_*`` listing functions.

    Raises:
        ValueError: If the page has no ``<main>`` element, or no ``<ul>`` inside it.
    """
    soup = _get_soup(f'{_WIKIART_BASE_URL}{category_slug}/text-list')
    if not (main := soup.find('main')):
        raise ValueError("Main content not found")
    if not (ul := main.find('ul')):
        raise ValueError("List of artists not found")
    # Skip items without a link instead of failing on ``None['href']``; this
    # mirrors the guard used when listing categories.
    return [li.a['href'] for li in ul.find_all('li') if li.a and "href" in li.a.attrs]


def get_movements() -> list[tuple[str, str]]:
    """
    Returns a list of (movement name, movement slug)
    https://www.wikiart.org/en/artists-by-art-movement
    """
    return _get_category_links(f"{_WIKIART_BASE_URL}/en/artists-by-art-movement", "movements")


def get_artists_by_movement(movement: str) -> list[str]:
    """Returns the artist slugs listed under the given movement slug."""
    return _get_artist_slugs(movement)


def get_school_or_groups() -> list[tuple[str, str]]:
    """
    Returns a list of (school or group name, school or group slug)
    https://www.wikiart.org/en/artists-by-painting-school
    """
    return _get_category_links(f"{_WIKIART_BASE_URL}/en/artists-by-painting-school", "schools")


def get_artists_by_school_or_group(school_or_group: str) -> list[str]:
    """Returns the artist slugs listed under the given school or group slug."""
    return _get_artist_slugs(school_or_group)


def get_genres() -> list[tuple[str, str]]:
    """
    Returns a list of (genre name, genre slug)
    https://www.wikiart.org/en/artists-by-genre
    """
    return _get_category_links(f"{_WIKIART_BASE_URL}/en/artists-by-genre", "genres")


def get_artists_by_genre(genre: str) -> list[str]:
    """Returns the artist slugs listed under the given genre slug."""
    return _get_artist_slugs(genre)


def get_fields() -> list[tuple[str, str]]:
    """
    Returns a list of (field name, field slug) an artist can belong to
    https://www.wikiart.org/en/artists-by-field
    """
    return _get_category_links(f"{_WIKIART_BASE_URL}/en/artists-by-field", "fields")


def get_artists_by_field(field: str) -> list[str]:
    """Returns the artist slugs listed under the given field slug."""
    return _get_artist_slugs(field)


def get_nationalities() -> list[tuple[str, str]]:
    """
    Returns a list of (nationality name, nationality slug)
    https://www.wikiart.org/en/artists-by-nation
    """
    return _get_category_links(f"{_WIKIART_BASE_URL}/en/artists-by-nation", "nationalities")


def get_artists_by_nationality(nationality_slug: str) -> list[str]:
    """Returns the artist slugs listed under the given nationality slug."""
    return _get_artist_slugs(nationality_slug)


def get_institutions() -> list[tuple[str, str]]:
    """
    Returns a list of (institution name, institution slug)
    https://www.wikiart.org/en/artists-by-art-institution
    """
    # The trailing slash is kept exactly as the original request URL had it.
    return _get_category_links(f"{_WIKIART_BASE_URL}/en/artists-by-art-institution/", "institutions")


def get_artists_by_institution(institution_slug: str) -> list[str]:
    """
    Returns a list of artists that are associated with the institution
    institution_slug: institution slug  /en/artists-by-art-institution/{institute}

    returns:
    [artist_slug]
    """
    return _get_artist_slugs(institution_slug)

In [6]:
def get_artist_name(artist_slug: str) -> str:
    """
    Return the artist name for an artist slug.

    NOTE(review): this is a stub. The body consists only of this docstring, so
    the function currently returns None rather than the ``str`` it is annotated
    with, and no call to it is visible in this notebook.

    Intended to be run only once per artist to reduce the number of requests.
    """

In [6]:
# Only the first category of each kind, and the first artist within it, is
# processed. Set to None to process everything (``seq[:None]`` is the full list).
SAMPLE_LIMIT = 1


def _get_or_create_artist(session, artist_slug):
    """Return the Artist row with this slug, inserting it first if it is missing."""
    artist = session.exec(select(Artist).where(Artist.slug == artist_slug)).first()
    if artist is None:
        artist = Artist(slug=artist_slug)
        session.add(artist)
        session.commit()
        # Reload after the commit so ``artist.id`` is populated before it is used below.
        session.refresh(artist)
    return artist


def _link_category_artists(label, list_categories, list_artists, link_model, value_attr, limit=SAMPLE_LIMIT):
    """Record which artists belong to which category, skipping rows that already exist.

    Args:
        label: Name printed in front of the category (e.g. "institution").
        list_categories: Callable returning ``(category text, category slug)`` tuples.
        list_artists: Callable mapping a category slug to a list of artist slugs.
        link_model: Table model with an ``artist_id`` column and a ``value_attr`` column.
        value_attr: Name of the ``link_model`` column that stores the category text.
        limit: Maximum number of categories, and of artists per category, to process.
    """
    for category, url in list_categories()[:limit]:
        # Same text as the f-string "=" specifier would print: name=repr(value).
        print(f'{label}={category!r}\nurl={url!r}\n')
        artists = list_artists(url)[:limit]
        print(artists)

        for artist_slug in artists:
            with sqlmodel.Session(engine) as session:
                artist = _get_or_create_artist(session, artist_slug)

                # Only insert the link when this (artist, category) pair is not stored yet.
                existing_link = session.exec(
                    select(link_model).where(
                        link_model.artist_id == artist.id,
                        getattr(link_model, value_attr) == category,
                    )
                ).first()
                if existing_link is None:
                    session.add(link_model(artist_id=artist.id, **{value_attr: category}))
                    session.commit()


_link_category_artists("institution", get_institutions, get_artists_by_institution, ArtistInstitution, "institution")
_link_category_artists("nationality", get_nationalities, get_artists_by_nationality, ArtistNationality, "nationality")
_link_category_artists("field", get_fields, get_artists_by_field, ArtistField, "field")
_link_category_artists("genre", get_genres, get_artists_by_genre, ArtistGenre, "genre")
_link_category_artists("school_or_group", get_school_or_groups, get_artists_by_school_or_group, ArtistSchool, "school")
_link_category_artists("movement", get_movements, get_artists_by_movement, ArtistArtMovement, "art_movement")

# Update artists

institution='South Australian School of Design, Adelaide, Australia 3'
url='/en/artists-by-art-institution/south-australian-school-of-design-adelaide'

['/en/dorrit-black']
nationality='Albanians 3'
url='/en/artists-by-nation/albanian'

['/en/ancient-greek-painting']
field='animation 5'
url='/en/artists-by-field/animation'

['/en/mary-blair']
genre='abstract 455'
url='/en/artists-by-genre/abstract'

['/en/alphonse-allais']
school_or_group='111 2'
url='/en/artists-by-painting-school/111'

['/en/roman-cotosman']
movement='Early Dynastic Period (3150 – 2686 BC) 1'
url='/en/artists-by-art-movement/early-dynastic-period-3150-2686-bc'

['/en/ancient-egyptian-painting']
