In [169]:
import os
import requests
import json
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup, Tag

In [3]:
# Begin by getting a list of all artist urls
MAX_ARTISTS = 6215
top_path = "https://tzvetnik.online/articles/artist/"

artist_urls = [top_path+str(i+1) for i in range(MAX_ARTISTS)]


In [None]:
# Step two: get all the show urls
top_path = "https://tzvetnik.online"
show_urls = []
for url in tqdm(artist_urls):
    response = requests.get(url)

    try:
        response.raise_for_status() # Raises an exception for HTTP errors
        artist_html = response.text
        soup = BeautifulSoup(artist_html, 'html.parser')

        article_previews = soup.find('div', class_="article-previews")
        anchor_tags = article_previews.find_all('a')
           
        for anchor_tag in anchor_tags:
            href_value = top_path + anchor_tag.get('href')
            show_urls.append(href_value)       

    except:
        print(f"error requesting {url} | status code: {response.status_code}")


show_urls = list(set(show_urls))
with open("show_urls.txt", "w") as file:
    for item in show_urls:
        file.write(item+"\n")
len(show_urls)

In [242]:
with open("show_urls.txt", "r") as file:
    urls = [line.strip() for line in file.readlines()]
print(len(urls))
urls = list(set(urls))
print(len(urls))

2917
2917


In [243]:
""" Data structure format:
data = [show, ...]
show = {"show_url" : "show_url",
        "dir" : "dir",
        "img_urls" = ["img_url", "caption"], # this is different from CAD, CAD json might need to be altered - we need "caption" for labeling purposes
        "press_release_url" = "any text found on page, this will likely need to be concatinated together", # this one is tricky because i want the same format between websites but each website is different, for tzvetnik the press release is just text on the page, unlike CAD
        "artist" : "artist",
        "show_title" : "show_title",
        "venue" : "venue"}

"""
data = []
for url in tqdm(urls):
    dir = os.path.basename(url)
    show = {"show_url":url, "dir": dir}
    img_urls = []

    # Fetch html
    response = requests.get(url)

    try:
        response.raise_for_status()
        show_html = response.text
        soup = BeautifulSoup(show_html, "html.parser")

        # Gather all img_urls and their associated captions
        # All content is contained in various tags in <form class="article-form" >
        article_form = soup.find("form", class_="article-form")
        action_texts = soup.find_all("action-text-attachment")
        for action_text in action_texts:
            try:
                img_tag = action_text.find("img")
                if img_tag == None:
                    pass
                else:
                    img_url = img_tag.get("src")
                    caption = action_text.get("caption")
                    img_urls.append((img_url, caption))
            except:
                print(f"an error occured retrieving img_url and caption for url: {url}")
        show["img_urls"] = img_urls
        
        # Grab the press realease/text -- this was stupid to figure out. i bet there is a better way...
        press_release = ""
        trix_content = soup.find("div", class_="trix-content")
        divs = [div for div in trix_content.find('div') if isinstance(div, Tag)]
        press_release = []
        for div in divs:
            new_divs = div.find_all("div")
            for d in new_divs:
                if "<br/" in str(d):
                   d = d.text
                   press_release.append(d)
        show["press_release_pdf"] = "".join(press_release).strip()

        # Grab artist names and venue
        article_show = article_form.find("div", class_="article__tags article--show")
        hrefs = article_show.find_all('a')
        artist = []
        venue = []
        for a_tag in hrefs:
            href = a_tag.get("href").split('/')
            if 'artist' == href[2]:
                artist.append(a_tag.text)
            else:
                venue.append(a_tag.text)
        show["artist"] = artist
        show["venue"] = venue

        # Grab show title
        show_title_tag = article_form.find("h1", class_="article--show")
        show["title"] = show_title_tag.text.strip()

    
    except:
        print(f"error has occured for {url} | status code: {response.status_code}")
    data.append(show)



  0%|          | 0/2917 [00:00<?, ?it/s]

error has occured for https://tzvetnik.online/articles/artist/2646?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/1525?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/77?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/2100?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/922?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/877?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/910?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/966?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/1023?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/1715?page=2 | status code: 200
error has occured for https://tzvetnik.online/articles/artist/1060?page=2 | 

In [254]:
new_data = [d for d in data if len(d.items()) == 7]

json_filename = "tzvet_data.json"

with open(json_filename, "w") as json_file:
    json.dump(new_data, json_file)

In [None]:
# TODO: This section must be fixed so that the data is downloaded in the proper
# ImageFolder format, see finetuning0.ipynb for details and helpful links.

from concurrent.futures import ThreadPoolExecutor

# Function to download a single image
def download_image(img_url, img_path):
    try:
        response = requests.get(img_url, stream=True)
        response.raise_for_status()

        # Check if the image file already exists, skip if it does
        if not os.path.exists(img_path):
            with open(img_path, 'wb') as img_file:
                for chunk in response.iter_content(chunk_size=8192):
                    img_file.write(chunk)
    except Exception as e:
        print(f"Failed to download {img_url}: {e}")

# Create the 'img' directory if it doesn't exist
img_directory = 'img'
os.makedirs(img_directory, exist_ok=True)

# Loop through the list of dictionaries
#for item in tqdm(data):
    dir_name = item['dir']
    img_urls = item['imgs']

    # Create a subdirectory within 'img' based on 'dir' value
    subdirectory_path = os.path.join(img_directory, dir_name)
    os.makedirs(subdirectory_path, exist_ok=True)

    # Create a ThreadPoolExecutor with a maximum of 5 threads (adjust as needed)
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        # Download and save images in parallel
        for img_url in tqdm(img_urls):
            img_filename = os.path.basename(img_url[0])
            img_path = os.path.join(subdirectory_path, img_filename)

            futures.append(executor.submit(download_image, img_url[0], img_path))

        # Wait for all download tasks to complete
        for future in futures:
            future.result()
