In [169]:
import os
import requests
import json
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup, Tag

In [3]:
# Begin by getting a list of all artist urls
MAX_ARTISTS = 6215
top_path = "https://tzvetnik.online/articles/artist/"

artist_urls = [top_path+str(i+1) for i in range(MAX_ARTISTS)]


In [None]:
# Step two: get all the show urls
top_path = "https://tzvetnik.online"
show_urls = []
for url in tqdm(artist_urls):
    response = requests.get(url)

    try:
        response.raise_for_status() # Raises an exception for HTTP errors
        artist_html = response.text
        soup = BeautifulSoup(artist_html, 'html.parser')

        article_previews = soup.find('div', class_="article-previews")
        anchor_tags = article_previews.find_all('a')
           
        for anchor_tag in anchor_tags:
            href_value = top_path + anchor_tag.get('href')
            show_urls.append(href_value)       

    except:
        print(f"error requesting {url} | status code: {response.status_code}")


show_urls = list(set(show_urls))
with open("show_urls.txt", "w") as file:
    for item in show_urls:
        file.write(item+"\n")
len(show_urls)

In [3]:
with open("show_urls.txt", "r") as file:
    urls = [line.strip() for line in file.readlines()]

urls[:10]

['https://tzvetnik.online/article/the-days-are-just-packed-off-site-group-show-at-the-pool-heybeliada-istanbul',
 'https://tzvetnik.online/article/when-the-time-swirls-when-it-turns-into-a-black-hole-a-group-show-at-futura-prague',
 'https://tzvetnik.online/article/reflection-in-a-glass-scorpion',
 'https://tzvetnik.online/article/laminar-body-ies-by-natalia-janula-at-final-hot-desert-utah',
 'https://tzvetnik.online/article/audrey-gair-and-sophie-friedman-pappas-at-european-gallery',
 'https://tzvetnik.online/article/prk-1u-group-show-curated-by-reine-alienor-at-tonus-paris',
 'https://tzvetnik.online/article/adrian-altman-and-olga-krykun-at-pragovka',
 'https://tzvetnik.online/article/truite-arc-en-ciel-by-romain-vicari-at-les-ateliers-dlkc-ariege',
 'https://tzvetnik.online/article/conor-o-shea-at-loggia',
 'https://tzvetnik.online/article/draped-at-future-gallery']

In [206]:
""" Data structure format:
data = [show, ...]
show = {"show_url" : "show_url",
        "dir" : "dir",
        "img_urls" = ["img_url", "caption"], # this is different from CAD, CAD json might need to be altered - we need "caption" for labeling purposes
        "press_release_url" = "any text found on page, this will likely need to be concatinated together", # this one is tricky because i want the same format between websites but each website is different, for tzvetnik the press release is just text on the page, unlike CAD
        "artist" : "artist",
        "show_title" : "show_title",
        "venue" : "venue"}

"""
def does_not_contain_action_text_attachment(element):
    return not element.get('action-text-attachment')


data = []
for url in tqdm(urls[:10]):
    print(f"{url}\n")
    dir = os.path.basename(url)
    show = {"show_url":url, "dir": dir}
    img_urls = []

    # Fetch html
    response = requests.get(url)

    try:
        response.raise_for_status()
        show_html = response.text
        soup = BeautifulSoup(show_html, "html.parser")

        # Gather all img_urls and their associated captions
        # All content is contained in various tags in <form class="article-form" >
        article_form = soup.find("form", class_="article-form")
        action_texts = soup.find_all("action-text-attachment")
        for action_text in action_texts:
            try:
                img_tag = action_text.find("img")
                if img_tag == None:
                    pass
                else:
                    img_url = img_tag.get("src")
                    caption = action_text.get("caption")
                    img_urls.append((img_url, caption))
            except:
                print(f"an error occured retrieving img_url and caption for url: {url}")
        show["img_urls"] = img_urls
        
        # Grab the press realease/text -- this was stupid to figure out. i bet there is a better way...
        press_release = ""
        trix_content = soup.find("div", class_="trix-content")
        divs = [div for div in trix_content.find('div') if isinstance(div, Tag)]
        press_release = []
        for div in divs:
            new_divs = div.find_all("div")
            for d in new_divs:
                if "<br/" in str(d):
                   d = d.text
                   press_release.append(d)
        show["press_release_pdf"] = "".join(press_release)

        # Grab artist name, show title, and venue
        article_show = article_form.find("div", class_="article__tags article--show")
        print(article_show)



        data.append(show)


    
    except:
        print(f"error has occured for {url} | status code: {response.status_code}")
    data.append(show)

  0%|          | 0/10 [00:00<?, ?it/s]

https://tzvetnik.online/article/the-days-are-just-packed-off-site-group-show-at-the-pool-heybeliada-istanbul

<div class="article__tags article--show">
<a href="/articles/artist/4459">Accel Arcana</a>
<a href="/articles/artist/4">Adrian Altman</a>
<a href="/articles/artist/4460">Albin Looström</a>
<a href="/articles/artist/1582">Alessandro Nucci</a>
<a href="/articles/artist/2045">Alexandra Koumantaki</a>
<a href="/articles/artist/4166">Ana Castillo</a>
<a href="/articles/artist/2468">Anastasia Bay</a>
<a href="/articles/artist/880">Andrew Rutherdale</a>
<a href="/articles/artist/572">Anna Slama</a>
<a href="/articles/artist/4035">Anna Walther</a>
<a href="/articles/artist/4461">Aram Bartholl</a>
<a href="/articles/artist/77">Arthur Golyakov</a>
<a href="/articles/artist/1712">Ben Sang</a>
<a href="/articles/artist/4462">Berkin Gülten</a>
<a href="/articles/artist/2512">Bernhard Holaschke</a>
<a href="/articles/artist/4463">Bob Bickell-Knight</a>
<a href="/articles/artist/4464">Bora Ak

In [203]:
data

[{'show_url': 'https://tzvetnik.online/article/the-days-are-just-packed-off-site-group-show-at-the-pool-heybeliada-istanbul',
  'dir': 'the-days-are-just-packed-off-site-group-show-at-the-pool-heybeliada-istanbul',
  'img_urls': [('https://tzvetnik.online/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBby92IiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--e43f2a47eb0fa3a221da34e77aaaac9ac6febb0d/The_Pool_thedaysarejustpacked-01.jpg',
    'Jennifer İpekel'),
   ('https://tzvetnik.online/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBcER2IiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--e265adcf1b43d2afeb7a9b65a1c6b4fd88f3d107/The_Pool_thedaysarejustpacked-02.jpg',
    'Adrian Altman, František Hanousek, Jakub Hajek, Jakub Hošek, Nik Timková'),
   ('https://tzvetnik.online/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBcEh2IiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--1b0dfe41faf69f21477e86db1906dae07f259d56/The_Pool_thedaysarejustpac

In [None]:
# TODO: This section must be fixed so that the data is downloaded in the proper
# ImageFolder format, see finetuning0.ipynb for details and helpful links.

from concurrent.futures import ThreadPoolExecutor

# Function to download a single image
def download_image(img_url, img_path):
    try:
        response = requests.get(img_url, stream=True)
        response.raise_for_status()

        # Check if the image file already exists, skip if it does
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as img_file:
                for chunk in response.iter_content(chunk_size=8192):
                    img_file.write(chunk)
    except Exception as e:
        print(f"Failed to download {img_url}: {e}")

# Create the 'img' directory if it doesn't exist
img_directory = 'img'
os.makedirs(img_directory, exist_ok=True)

# Loop through the list of dictionaries
#for item in tqdm(data):
    dir_name = item['dir']
    img_urls = item['imgs']

    # Create a subdirectory within 'img' based on 'dir' value
    subdirectory_path = os.path.join(img_directory, dir_name)
    os.makedirs(subdirectory_path, exist_ok=True)

    # Create a ThreadPoolExecutor with a maximum of 5 threads (adjust as needed)
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        # Download and save images in parallel
        for img_url in tqdm(img_urls):
            img_filename = os.path.basename(img_url[0])
            img_path = os.path.join(subdirectory_path, img_filename)

            futures.append(executor.submit(download_image, img_url[0], img_path))

        # Wait for all download tasks to complete
        for future in futures:
            future.result()
