In [4]:
import os
import requests
import json
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

In [3]:
# Begin by getting a list of all artist urls
MAX_ARTISTS = 6215
top_path = "https://tzvetnik.online/articles/artist/"

artist_urls = [top_path+str(i+1) for i in range(MAX_ARTISTS)]


In [5]:
# Step two: get all the show urls
top_path = "https://tzvetnik.online"
show_urls = []
for url in tqdm(artist_urls):
    response = requests.get(url)

    try:
        response.raise_for_status() # Raises an exception for HTTP errors
        artist_html = response.text
        soup = BeautifulSoup(artist_html, 'html.parser')

        article_previews = soup.find('div', class_="article-previews")
        anchor_tags = article_previews.find_all('a')
           
        for anchor_tag in anchor_tags:
            href_value = top_path + anchor_tag.get('href')
            show_urls.append(href_value)       

    except:
        print(f"error requesting {url} | status code: {response.status_code}")


show_urls = list(set(show_urls))
with open("show_urls.txt", "w") as file:
    for item in show_urls:
        file.write(item+"\n")
len(show_urls)

  0%|          | 0/6215 [00:00<?, ?it/s]

error requesting https://tzvetnik.online/articles/artist/26 | status code: 200
error requesting https://tzvetnik.online/articles/artist/151 | status code: 200
error requesting https://tzvetnik.online/articles/artist/164 | status code: 200
error requesting https://tzvetnik.online/articles/artist/180 | status code: 200
error requesting https://tzvetnik.online/articles/artist/231 | status code: 200
error requesting https://tzvetnik.online/articles/artist/235 | status code: 200
error requesting https://tzvetnik.online/articles/artist/258 | status code: 200
error requesting https://tzvetnik.online/articles/artist/280 | status code: 200
error requesting https://tzvetnik.online/articles/artist/446 | status code: 200
error requesting https://tzvetnik.online/articles/artist/457 | status code: 200
error requesting https://tzvetnik.online/articles/artist/505 | status code: 200
error requesting https://tzvetnik.online/articles/artist/544 | status code: 200
error requesting https://tzvetnik.online/

2917

In [3]:
with open("show_urls.txt", "r") as file:
    urls = [line.strip() for line in file.readlines()]

urls[:10]

['https://tzvetnik.online/article/the-days-are-just-packed-off-site-group-show-at-the-pool-heybeliada-istanbul',
 'https://tzvetnik.online/article/when-the-time-swirls-when-it-turns-into-a-black-hole-a-group-show-at-futura-prague',
 'https://tzvetnik.online/article/reflection-in-a-glass-scorpion',
 'https://tzvetnik.online/article/laminar-body-ies-by-natalia-janula-at-final-hot-desert-utah',
 'https://tzvetnik.online/article/audrey-gair-and-sophie-friedman-pappas-at-european-gallery',
 'https://tzvetnik.online/article/prk-1u-group-show-curated-by-reine-alienor-at-tonus-paris',
 'https://tzvetnik.online/article/adrian-altman-and-olga-krykun-at-pragovka',
 'https://tzvetnik.online/article/truite-arc-en-ciel-by-romain-vicari-at-les-ateliers-dlkc-ariege',
 'https://tzvetnik.online/article/conor-o-shea-at-loggia',
 'https://tzvetnik.online/article/draped-at-future-gallery']

In [28]:
""" Data structure format:
data = [show, ...]
show = {"show_url" : "show_url",
        "dir" : "dir",
        "img_urls" = ["img_url", "caption"], # this is different from CAD, CAD json might need to be altered - we need "caption" for labeling purposes
        "press_release_url" = "any text found on page, this will likely need to be concatinated together", # this one is tricky because i want the same format between websites but each website is different, for tzvetnik the press release is just text on the page, unlike CAD
        "artist" : "artist",
        "show_title" : "show_title",
        "venue" : "venue"}

"""

data = []
for url in tqdm(urls[:10]):
    dir = os.path.basename(url)
    show = {"show_url":url, "dir": dir}
    img_urls = []

    # Fetch html
    response = requests.get(url)

    try:
        response.raise_for_status()
        show_html = response.text
        soup = BeautifulSoup(show_html, "html.parser")

        # All content is contained in various tags in <form class="article-form" >
        article_form = soup.find("form", class_="article-form")
        # Gather all img_urls and their associated captions
        action_texts = soup.find_all("action-text-attachment")
        for action_text in action_texts:
            try:
                img_tag = action_text.find("img")
                if img_tag == None:
                    pass
                else:
                    img_url = img_tag.get("src")
                    caption = action_text.get("caption")
                    img_urls.append((img_url, caption))
            except:
                print(f"an error occured retrieving img_url and caption for url: {url}")
        show["img_urls"] = img_urls

    
    except:
        print(f"error has occured for {url} | status code: {response.status_code}")
    data.append(show)

  0%|          | 0/10 [00:00<?, ?it/s]

In [31]:
len(data[1]), data[1]

(3,
 {'show_url': 'https://tzvetnik.online/article/when-the-time-swirls-when-it-turns-into-a-black-hole-a-group-show-at-futura-prague',
  'dir': 'when-the-time-swirls-when-it-turns-into-a-black-hole-a-group-show-at-futura-prague',
  'img_urls': [('https://tzvetnik.online/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBZ3ZEIiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--21fb3308ba75149f59e43689b9f6a65c447c62c6/00%20Radek%20Brousil%20(Thomas%20Moore).jpg',
    'Thomas Moore, from the book “When People Die” (2018), Wall inscription by Radek Brousil (2019)'),
   ('https://tzvetnik.online/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBZ3pEIiwiZXhwIjpudWxsLCJwdXIiOiJibG9iX2lkIn19--531f84dc084a7a54a73591740f8712a9a39236a8/01%20Darja%20Bajagic%CC%81.jpg',
    'Darja Bajagić, Sample XXX Puzzle-- Pin-up LandTM Cum-centration, (2013)'),
   ('https://tzvetnik.online/rails/active_storage/blobs/redirect/eyJfcmFpbHMiOnsibWVzc2FnZSI6IkJBaHBBZzNEIiwiZXhwIjpudW

In [16]:
data = []
for url in tqdm(show_urls[:10]):
    dir = os.path.basename(url)
    show = {"dir" : dir}
    imgs = []
    response = requests.get(url)

    try:
        response.raise_for_status()
        show_html = response.text
        soup = BeautifulSoup(show_html, "html.parser")
        action_texts = soup.find_all("action-text-attachment")
        for action_text in action_texts:
            try:
                img_tag = action_text.find("img")
                img_url = img_tag.get("src")
                caption = action_text.get("caption")
                imgs.append((img_url, caption))
            except:
                pass        
        
        if len(imgs) != 0:
            show["imgs"] = imgs
            data.append(show)
        
    except:
        print(f"error with url: {url} | response code: {response.status_code}")

with open("data.json", "w") as json_file:
    json.dump(data, json_file)





  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# TODO: This section must be fixed so that the data is downloaded in the proper
# ImageFolder format, see finetuning0.ipynb for details and helpful links.

from concurrent.futures import ThreadPoolExecutor

# Function to download a single image
def download_image(img_url, img_path):
    try:
        response = requests.get(img_url, stream=True)
        response.raise_for_status()

        # Check if the image file already exists, skip if it does
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as img_file:
                for chunk in response.iter_content(chunk_size=8192):
                    img_file.write(chunk)
    except Exception as e:
        print(f"Failed to download {img_url}: {e}")

# Create the 'img' directory if it doesn't exist
img_directory = 'img'
os.makedirs(img_directory, exist_ok=True)

# Loop through the list of dictionaries
#for item in tqdm(data):
    dir_name = item['dir']
    img_urls = item['imgs']

    # Create a subdirectory within 'img' based on 'dir' value
    subdirectory_path = os.path.join(img_directory, dir_name)
    os.makedirs(subdirectory_path, exist_ok=True)

    # Create a ThreadPoolExecutor with a maximum of 5 threads (adjust as needed)
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        # Download and save images in parallel
        for img_url in tqdm(img_urls):
            img_filename = os.path.basename(img_url[0])
            img_path = os.path.join(subdirectory_path, img_filename)

            futures.append(executor.submit(download_image, img_url[0], img_path))

        # Wait for all download tasks to complete
        for future in futures:
            future.result()
