In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import re
from datetime import datetime, timedelta
import csv
import json
import time

In [2]:
def wait_for_images(driver, by, value, timeout=20):
    """Wait until elements matching a locator are present and return them.

    Args:
        driver: Selenium WebDriver instance to poll.
        by: Locator strategy (a ``By`` constant such as ``By.CLASS_NAME``).
        value: Locator value used together with ``by``.
        timeout: Maximum number of seconds to wait (default 20).

    Returns:
        Whatever ``WebDriverWait.until`` yields for the
        ``presence_of_all_elements_located`` condition (the located elements).

    Raises:
        TimeoutException: propagated from ``WebDriverWait.until`` when the
            condition is not met within ``timeout`` seconds.
    """
    locator = (by, value)
    condition = EC.presence_of_all_elements_located(locator)
    waiter = WebDriverWait(driver, timeout)
    return waiter.until(condition)

In [3]:
def convert_text(likes_str):
    """Convert an abbreviated counter label (e.g. ``"1.2K"``) to an integer.

    Surrounding whitespace and thousands separators (``,``) are ignored and
    the magnitude suffix is case-insensitive. Supported suffixes are
    ``K`` (thousand), ``M`` (million) and ``B`` (billion); a label without a
    suffix is parsed as a plain integer.

    Args:
        likes_str: Counter text such as ``"15"``, ``"1,234"``, ``"1.2K"``
            or ``"3M"``.

    Returns:
        The counter as an ``int``.

    Raises:
        ValueError: if the label is empty or not a recognised number.
    """
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    cleaned = likes_str.strip().replace(",", "").upper()

    if cleaned and cleaned[-1] in multipliers:
        magnitude = multipliers[cleaned[-1]]
        # round() rather than int(): the float product can land a hair below
        # the intended integer, and truncation would then lose one unit.
        return int(round(float(cleaned[:-1]) * magnitude))
    return int(cleaned)

In [4]:
def format_date(date_str):
    """Normalise a timestamp label to the ``dd/mm/YYYY`` format.

    Recognised labels (case-insensitive, surrounding whitespace ignored):

    * ``"just now"``
    * ``"<n> min ago"`` / ``"<n> mins ago"`` / ``"<n> minute(s) ago"``
    * ``"<n> hour ago"`` / ``"<n> hours ago"``
    * ``"Yesterday"``
    * ``"<n> day ago"`` / ``"<n> days ago"``
    * an absolute date such as ``"Jan 5, 2023"`` (``%b %d, %Y``)

    Relative labels are resolved against ``datetime.now()``.

    Args:
        date_str: The label to convert.

    Returns:
        The resolved date formatted as ``dd/mm/YYYY``.

    Raises:
        ValueError: if the label matches none of the recognised forms
            (raised by ``datetime.strptime``).
    """
    label = date_str.strip()
    now = datetime.now()

    # One pattern covers every "<n> <unit> ago" form, singular or plural.
    relative = re.match(
        r"(\d+)\s+(min(?:ute)?|hour|day)s?\s+ago", label, re.IGNORECASE
    )

    if re.match(r"just now", label, re.IGNORECASE):
        date_obj = now
    elif re.match(r"yesterday", label, re.IGNORECASE):
        date_obj = now - timedelta(days=1)
    elif relative:
        amount = int(relative.group(1))
        unit = relative.group(2).lower()
        if unit.startswith("min"):
            delta = timedelta(minutes=amount)
        elif unit == "hour":
            delta = timedelta(hours=amount)
        else:
            delta = timedelta(days=amount)
        date_obj = now - delta
    else:
        # Anything else must be an absolute date; strptime raises ValueError
        # for unrecognised input so callers can decide how to handle it.
        date_obj = datetime.strptime(label, "%b %d, %Y")

    return date_obj.strftime("%d/%m/%Y")

In [5]:
def load_more_comments(driver):
    """Keep clicking the "load more" button until it can no longer be used.

    Each round waits up to 10 seconds for the button to become clickable,
    clicks it, then waits up to 30 seconds for the ``div.lxZkf._2ywna``
    element to become invisible (presumably the loading indicator - confirm
    against the page markup). Any exception raised during a round ends the
    loop silently; this is how the function terminates once no button is
    left.

    Args:
        driver: Selenium WebDriver positioned on the page whose comments
            should be expanded.
    """
    keep_loading = True
    while keep_loading:
        try:
            button_ready = EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "button._1lBsK._3_MJY._2vim0.ds-card._1FKeR")
            )
            WebDriverWait(driver, 10).until(button_ready).click()

            indicator_gone = EC.invisibility_of_element_located(
                (By.CSS_SELECTOR, "div.lxZkf._2ywna")
            )
            WebDriverWait(driver, 30).until(indicator_gone)
        except Exception:
            # Best-effort: a timeout (or any other failure) means we stop.
            keep_loading = False

In [6]:
def go_to_next_page(driver, next_page_number, site_url, scroll_pause_time=1):
    """Reload ``site_url`` and click the pagination link ``next_page_number`` times.

    On every step the function waits up to 30 seconds for the anchors whose
    class contains ``_1OGeq``. When two or more are found the second one is
    clicked, otherwise the only one is (presumably the first anchor is
    "previous" and the second "next" - confirm against the page markup).

    Navigation stops at the first step where the link cannot be found:
    retrying the remaining steps would cost another 30 seconds each and
    would leave the browser on an unknown page.

    Args:
        driver: Selenium WebDriver used for navigation.
        next_page_number: Number of pagination clicks to perform after
            loading ``site_url``.
        site_url: URL loaded before the clicks start.
        scroll_pause_time: Seconds to sleep between scrolling the link into
            view and clicking it (default 1).

    Returns:
        ``True`` when every click was performed, ``False`` when navigation
        stopped early because the pagination link was not found.
    """
    driver.get(site_url)
    for _ in range(next_page_number):
        try:
            next_page_buttons = WebDriverWait(driver, 30).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//a[contains(@class, '_1OGeq')]")
                )
            )
            if not next_page_buttons:
                # Guard against indexing an empty result.
                print("Nessun elemento trovato.")
                return False
            if len(next_page_buttons) >= 2:
                next_page_button = next_page_buttons[1]
            else:
                next_page_button = next_page_buttons[0]
            driver.execute_script("arguments[0].scrollIntoView(true);", next_page_button)
            time.sleep(scroll_pause_time)
            # Click through JavaScript, as the original code does.
            driver.execute_script("arguments[0].click();", next_page_button)
        except TimeoutException:
            print("L'elemento non è stato trovato entro il tempo specificato")
            return False
    return True

In [7]:
def _node_text(node, default="N/A"):
    """Return the text of a BeautifulSoup node (tag or string), or ``default`` if it is ``None``."""
    if node is None:
        return default
    get_text = getattr(node, "get_text", None)
    return get_text() if callable(get_text) else str(node)


def _count_or_na(node):
    """Parse a counter node with ``convert_text``; ``"N/A"`` when missing or unparseable."""
    if node is None:
        return "N/A"
    try:
        return convert_text(_node_text(node))
    except ValueError:
        return "N/A"


def _date_or_na(node):
    """Format a ``<time>`` node with ``format_date``; ``"N/A"`` when missing or unparseable."""
    if node is None:
        return "N/A"
    try:
        return format_date(_node_text(node))
    except ValueError:
        return "N/A"


def crawling_artworks(artworks):
    """Collect listing and detail-page data for each artwork card.

    For every card the function reads the thumbnail URL, title, author, like
    count and comment count from the card itself, then opens the card's first
    link with the module-level ``driver``, expands the comments and reads the
    publication date, view count, tags and comments from the detail page.

    A field that cannot be located or parsed is stored as ``"N/A"`` instead
    of discarding the whole artwork. An artwork is skipped (with a printed
    message) only when it has no detail link or when loading/parsing its
    detail page raises.

    Note: relies on the global ``driver`` plus the notebook helpers
    ``wait_for_images``, ``load_more_comments``, ``convert_text`` and
    ``format_date`` being defined before the call.

    Args:
        artworks: Iterable of BeautifulSoup tags, one per artwork card.

    Returns:
        A list of dicts with the keys ``img``, ``title``, ``author``,
        ``likes``, ``number_of_comments``, ``date``, ``number_of_views``,
        ``image_tags`` and ``comments``.
    """
    data = []
    for artwork in artworks:
        try:
            # ---- fields available on the listing card ----
            img_tag = artwork.find('img')
            img_url = img_tag.get('src', 'N/A') if img_tag else 'N/A'

            title = _node_text(artwork.find('h2', class_="_3CpJS"))
            author = _node_text(artwork.find("span", class_="_2EfV7"))

            # The count is the <span> that follows the '_20Nv2' span inside the button.
            likes_button = artwork.find('button', class_='reset-button _2Pdhv')
            likes_marker = likes_button.find('span', class_='_20Nv2') if likes_button else None
            likes_node = likes_marker.find_next_sibling('span') if likes_marker else None
            likes = _count_or_na(likes_node)

            # The count is the node right after the first <span> of the "#comments" link.
            comments_link = artwork.find('a', href=lambda href: href and href.endswith('#comments'))
            comments_marker = comments_link.find('span') if comments_link else None
            comments_node = comments_marker.next_sibling if comments_marker else None
            comments = _count_or_na(comments_node)

            # ---- detail page ----
            detail_link = artwork.find('a')
            detail_url = detail_link.get('href') if detail_link else None
            if not detail_url:
                print("Error extracting data for artwork: detail link not found")
                continue

            driver.get(detail_url)
            wait_for_images(driver, By.CLASS_NAME, "_23oWS")
            load_more_comments(driver)

            # Parse once, after all comments have been expanded.
            detail_soup = BeautifulSoup(driver.page_source, "html.parser")

            published_date = _date_or_na(detail_soup.find('time'))

            views_outer = detail_soup.find("span", class_="reset-button _3bGQn _1otI6")
            views_inner = views_outer.find("span", class_="_3AClx") if views_outer else None
            views_node = views_inner.find('span') if views_inner else None
            views = _count_or_na(views_node)

            tag_container = detail_soup.find("div", class_="aodIv wT4l_")
            image_tags = [tag.text for tag in tag_container.find_all("a")] if tag_container else []

            comments_container = detail_soup.find("div", class_="_1YhYy")
            all_comments = (
                comments_container.find_all("div", class_="_2VfPz _1LomQ")
                if comments_container
                else []
            )
            image_comments = [
                {
                    "comment_author": _node_text(comment.find("span", class_="_2vKEO")),
                    "comment_date": _date_or_na(comment.find("time")),
                    "comment_text": _node_text(
                        comment.find("span", class_="_2PHJq public-DraftStyleDefault-ltr")
                    ),
                }
                for comment in all_comments
            ]

            data.append(
                {
                    "img": img_url,
                    "title": title,
                    "author": author,
                    "likes": likes,
                    "number_of_comments": comments,
                    "date": published_date,
                    'number_of_views': views,
                    'image_tags': image_tags,
                    'comments': image_comments,
                }
            )
        except Exception as e:
            # Best-effort scraping: report the failure and move on to the next card.
            print(f"Error extracting data for artwork: {e}")
    return data


In [8]:
# Scraping driver: walk the listing pages of the site and accumulate one
# record per artwork in `data`.
site_url = "https://www.deviantart.com"
PAGES_TO_SCRAPE = 42  # number of listing pages to visit

options = ChromeOptions()
# options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options)

data = []
try:
    driver.get(site_url)
    for i in range(PAGES_TO_SCRAPE):
        wait_for_images(driver, By.CLASS_NAME, "_3_LJY")
        content = driver.page_source
        soup = BeautifulSoup(content, "html.parser")
        artworks = soup.find_all("div", class_="_3Y0hT _3oBlM")
        # Skip cards containing a '_10VHP' span (presumably entries that are
        # not regular artworks - confirm against the page markup).
        filtered_artworks = [
            artwork for artwork in artworks if not artwork.find("span", class_="_10VHP")
        ]
        data.extend(crawling_artworks(filtered_artworks))
        print(len(data))
        # crawling_artworks leaves the browser on a detail page, so the
        # listing is reloaded and paged forward again. Nothing is scraped
        # after the last page, so that final navigation is skipped.
        if i + 1 < PAGES_TO_SCRAPE:
            go_to_next_page(driver, i + 1, site_url, 10)
finally:
    # Always release the browser, including on KeyboardInterrupt; records
    # gathered so far remain available in `data`.
    driver.quit()

KeyboardInterrupt: 

In [None]:
# Export the scraped records to a semicolon-delimited CSV file.
# `comments` is serialised as JSON; every other field is written as-is
# (csv.writer stringifies non-string values, so `image_tags` is stored as
# the list's string form).
CSV_COLUMNS = [
    "img",
    "title",
    "author",
    "likes",
    "number_of_comments",
    "number_of_views",
    "date",
    'image_tags',
    'comments',
]

with open('../dataset/devianart.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow(CSV_COLUMNS)
    rows = [
        [record[column] for column in CSV_COLUMNS[:-1]]
        + [json.dumps(record['comments'], ensure_ascii=False)]
        for record in data
    ]
    writer.writerows(rows)