In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException , WebDriverException
import re
from datetime import datetime, timedelta
import csv
import json
import time

In [2]:
def wait_for_images(driver, by, value, timeout=10):
    """Wait until elements matching the given locator are present in the page.

    Args:
        driver: Selenium WebDriver instance that is polled.
        by: Locator strategy (for example ``By.CLASS_NAME``).
        value: Locator value interpreted according to ``by``.
        timeout: Maximum number of seconds to wait (default 10).

    Returns:
        The result of ``WebDriverWait.until`` for the
        ``presence_of_all_elements_located`` condition.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (by, value)
    return waiter.until(EC.presence_of_all_elements_located(locator))

In [3]:
def convert_text(likes_str):
    """Convert an abbreviated counter label such as ``"1.2K"`` into an int.

    Surrounding whitespace and thousands separators (``,``) are ignored. A
    trailing ``K``, ``M`` or ``B`` (any case) multiplies the number by a
    thousand, a million or a billion respectively.

    Args:
        likes_str: Counter text, e.g. ``"42"``, ``"1,234"``, ``"1.5K"``.

    Returns:
        The counter as an integer.

    Raises:
        ValueError: If the text cannot be interpreted as a number.
    """
    multipliers = {"K": 1000, "M": 1000000, "B": 1000000000}
    cleaned = likes_str.strip().replace(",", "")
    suffix = cleaned[-1:].upper()
    if suffix in multipliers:
        # round() rather than int(): int() truncates, so a product that lands
        # just below the exact value because of float error would lose a unit.
        return round(float(cleaned[:-1]) * multipliers[suffix])
    return int(cleaned)

In [4]:
def format_date(date_str):
    """Normalise a relative or absolute date label to ``dd/mm/YYYY``.

    Recognised relative forms (matched case-insensitively at the start of the
    stripped text): ``just now``, ``N min(s)/minute(s) ago``,
    ``N hour(s) ago``, ``yesterday`` and ``N day(s) ago``. Anything else is
    parsed as an absolute date in the ``"%b %d, %Y"`` format
    (e.g. ``"Jan 5, 2023"``).

    Args:
        date_str: Date text as shown on the page.

    Returns:
        The date formatted as ``dd/mm/YYYY``.

    Raises:
        ValueError: If the text is neither a known relative form nor a valid
            ``"%b %d, %Y"`` date.
    """
    text = date_str.strip()
    now = datetime.now()

    # Each entry maps a pattern to a function producing the offset from now.
    relative_forms = (
        (r"just now", lambda m: timedelta()),
        (r"(\d+) min(?:ute)?s? ago", lambda m: timedelta(minutes=int(m.group(1)))),
        (r"(\d+) hours? ago", lambda m: timedelta(hours=int(m.group(1)))),
        (r"yesterday", lambda m: timedelta(days=1)),
        # "days?" so that the singular "1 day ago" is accepted as well.
        (r"(\d+) days? ago", lambda m: timedelta(days=int(m.group(1)))),
    )

    for pattern, offset_for in relative_forms:
        found = re.match(pattern, text, re.IGNORECASE)
        if found:
            date_obj = now - offset_for(found)
            break
    else:
        # No relative form matched: expect an absolute date.
        date_obj = datetime.strptime(text, "%b %d, %Y")

    return date_obj.strftime("%d/%m/%Y")

In [5]:
def load_more_comments(driver):
    """Keep clicking the "load more" comments button until it cannot be clicked.

    Each round waits up to 10 seconds for the button to become clickable,
    clicks it, then waits up to 10 seconds for the ``div.lxZkf._2ywna``
    element to become invisible (presumably a loading indicator - confirm
    against the page markup).

    Any exception raised during a round ends the loop; this is how the
    function detects that no further button is available.

    Args:
        driver: Selenium WebDriver positioned on an artwork detail page.
    """
    button_css = "button._1lBsK._3_MJY._2vim0.ds-card._1FKeR"
    pending_css = "div.lxZkf._2ywna"

    finished = False
    while not finished:
        try:
            clickable = EC.element_to_be_clickable
            button = WebDriverWait(driver, 10).until(
                clickable((By.CSS_SELECTOR, button_css))
            )
            button.click()

            gone = EC.invisibility_of_element_located((By.CSS_SELECTOR, pending_css))
            WebDriverWait(driver, 10).until(gone)
        except Exception:
            # Best effort: a timeout (or any other failure) means we are done.
            finished = True

In [6]:
def load_more_content(driver, section_id):
    """Expand a profile section by pressing its "Load more" button repeatedly.

    Each round waits up to 10 seconds for a clickable button containing the
    text ``Load more`` inside ``<section id=section_id>``, scrolls it into
    view, pauses 2 seconds, clicks it through JavaScript and pauses 5 seconds.
    The loop ends as soon as any step raises an exception.

    Args:
        driver: Selenium WebDriver positioned on the profile page.
        section_id: Value of the ``id`` attribute of the section to expand.
    """
    scroll_js = "arguments[0].scrollIntoView(true);"
    click_js = "arguments[0].click();"

    exhausted = False
    while not exhausted:
        try:
            waiter = WebDriverWait(driver, 10)
            button_xpath = f"//section[@id='{section_id}']//button[contains(., 'Load more')]"
            button = waiter.until(EC.element_to_be_clickable((By.XPATH, button_xpath)))

            driver.execute_script(scroll_js, button)
            time.sleep(2)
            driver.execute_script(click_js, button)
            time.sleep(5)
        except Exception:
            # Best effort: no clickable button (or any other failure) stops us.
            exhausted = True

In [7]:
def go_to_next_page(driver, next_page_number, site_url, scroll_pause_time=5):
    """Open ``site_url`` and click the pagination link ``next_page_number`` times.

    For every step the function waits up to 10 seconds for anchors whose class
    contains ``_1OGeq``. When two or more are found the second one is clicked,
    otherwise the only one (presumably the first anchor is a "previous" link
    when both are shown - confirm against the page markup). The chosen link is
    scrolled into view, the function sleeps ``scroll_pause_time`` seconds and
    then clicks it through JavaScript.

    A step whose link cannot be found (timeout or empty result) only prints a
    message; the remaining steps are still attempted.

    Args:
        driver: Selenium WebDriver instance.
        next_page_number: Number of pagination clicks to perform.
        site_url: URL loaded before the first click.
        scroll_pause_time: Seconds to wait between scrolling and clicking.
    """
    driver.get(site_url)
    for _ in range(next_page_number):
        try:
            next_page_buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, "//a[contains(@class, '_1OGeq')]")
                )
            )
            if not next_page_buttons:
                # Nothing to click: skip this step instead of indexing an
                # empty list (which used to raise an uncaught IndexError).
                print("Nessun elemento trovato.")
                continue

            if len(next_page_buttons) >= 2:
                next_page_button = next_page_buttons[1]
            else:
                next_page_button = next_page_buttons[0]

            driver.execute_script("arguments[0].scrollIntoView(true);", next_page_button)
            time.sleep(scroll_pause_time)
            driver.execute_script("arguments[0].click();", next_page_button)
        except TimeoutException:
            print("L'elemento non è stato trovato entro il tempo specificato")

In [8]:
def find_number(string):
    """Return the first run of digits found in ``string`` as an int.

    Args:
        string: Text to scan.

    Returns:
        The integer value of the first digit sequence, or ``0`` when the text
        contains no digits.
    """
    first_digits = re.search(r"\d+", string)
    return int(first_digits.group()) if first_digits else 0

In [9]:
def _subtract_months(moment, months):
    """Return ``moment`` moved back by ``months`` calendar months.

    The day of month is clamped to the last day of the target month, so that
    e.g. 29 February minus 12 months yields 28 February instead of raising.
    """
    month_index = moment.year * 12 + (moment.month - 1) - months
    year, month_zero = divmod(month_index, 12)
    first_of_target = moment.replace(year=year, month=month_zero + 1, day=1)
    # Day 1 + 31 days always lands in the following month; stepping back one
    # day from that month's first day gives the target month's last day.
    first_of_next = (first_of_target + timedelta(days=31)).replace(day=1)
    last_day = (first_of_next - timedelta(days=1)).day
    return first_of_target.replace(day=min(moment.day, last_day))


def user_date(date):
    """Estimate a registration date from a "Deviant for N ..." label.

    The first number in the text is taken as the account age. If it is
    directly followed by ``year``, ``month``, ``week`` or ``day`` (any case,
    plural allowed) that unit is honoured; without a recognised unit the
    number is interpreted as years.

    The number is parsed here rather than through ``find_number`` because the
    unit that follows it is needed as well.

    Args:
        date: Label text, e.g. ``"Deviant for 5 years"``.

    Returns:
        The current date minus the account age formatted as ``dd/mm/YYYY``,
        or ``'N/A'`` when the text has no number or the number is zero.
    """
    found = re.search(r"(\d+)\s*(year|month|week|day)?", date, re.IGNORECASE)
    amount = int(found.group(1)) if found else 0
    if not amount:
        return 'N/A'

    unit = (found.group(2) or "year").lower()
    current_date = datetime.now()
    if unit == "year":
        inscription_date = _subtract_months(current_date, 12 * amount)
    elif unit == "month":
        inscription_date = _subtract_months(current_date, amount)
    elif unit == "week":
        inscription_date = current_date - timedelta(weeks=amount)
    else:
        inscription_date = current_date - timedelta(days=amount)
    return inscription_date.strftime("%d/%m/%Y")

In [10]:
def find_user_date_place(tags):
    """Extract the registration date and place from profile header tags.

    Every tag whose text contains ``Deviant for`` is passed to ``user_date``
    (the last such tag wins). The place is taken from the second tag, but only
    when that tag exists and its text is purely alphabetic.

    Args:
        tags: Sequence of objects exposing a ``text`` attribute.

    Returns:
        A ``(inscription_date, place)`` tuple; each element is ``'N/A'`` when
        it could not be determined.
    """
    inscription_date = 'N/A'
    place = 'N/A'
    for tag in tags:
        text = tag.text
        if 'Deviant for' in text:
            inscription_date = user_date(text)

    # Guard the index: a profile may expose a single tag, and tags[1] would
    # then raise IndexError.
    # NOTE(review): isalpha() rejects multi-word places such as
    # "United States" - confirm whether that is intended.
    if len(tags) > 1 and tags[1] and tags[1].text.isalpha():
        place = tags[1].text
    return inscription_date, place
        

In [11]:
def chunk_string(s, chunk_size):
    """Split ``s`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        s: Sliceable sequence (a string in this notebook).
        chunk_size: Maximum length of each piece.

    Returns:
        A list of slices in order; the last one may be shorter. An empty
        input yields an empty list.
    """
    pieces = []
    for start in range(0, len(s), chunk_size):
        stop = start + chunk_size
        pieces.append(s[start:stop])
    return pieces

In [12]:
def _user_stat_or_na(stat_tags, index):
    """Return the statistic at ``index`` parsed by ``convert_text``, or 'N/A'.

    'N/A' is returned when the index is out of range, the tag has no direct
    text node, or the text cannot be converted to a number.
    """
    if index >= len(stat_tags):
        return 'N/A'
    raw_text = stat_tags[index].find(string=True, recursive=False)
    if raw_text is None:
        return 'N/A'
    try:
        return convert_text(raw_text.strip())
    except ValueError:
        return 'N/A'


def _user_names_in_section(soup, section_id, key):
    """Collect ``{key: name}`` dicts for every user card inside a section."""
    section = soup.find("section", id=section_id)
    cards = section.find_all("div", class_="_2YmxY") if section else []
    names = []
    for card in cards:
        name_tag = card.find("span", class_="_2QMci")
        # A literal "N/A" name is skipped, as the original sentinel logic did.
        if name_tag is not None and name_tag.text != "N/A":
            names.append({key: name_tag.text})
    return names


def crawling_user(artwork, driver):
    """Scrape the profile of the author of ``artwork``.

    The function opens the artwork detail page (first ``<a>`` of ``artwork``),
    follows the author link with ``/gallery`` replaced by ``/about``, reads
    name, place, registration date and statistics, expands the ``watchers``
    and ``watching`` sections and collects the names listed there.

    Individual statistics that are missing or unparsable become ``'N/A'``
    instead of discarding the whole profile.

    Args:
        artwork: BeautifulSoup element of one artwork card.
        driver: Selenium WebDriver used for navigation (its current page is
            changed by this call).

    Returns:
        A dict with keys ``name``, ``place``, ``inscription_date``,
        ``page_views``, ``num_followers``, ``num_follow``, ``num_favourites``,
        ``num_comments_made``, ``num_comments_received``, ``followers`` and
        ``follow``; or ``None`` when the profile could not be scraped (the
        error is printed).
    """
    try:
        detail_url = artwork.find('a')['href']
        driver.get(detail_url)
        wait_for_images(driver, By.CLASS_NAME, "_23oWS")
        detail_soup = BeautifulSoup(driver.page_source, "html.parser")

        user_link_container = detail_soup.find("div", class_="_31DYV")
        user_link_tag = (
            user_link_container.find("a", class_="user-link _277bf")
            if user_link_container is not None
            else None
        )
        if user_link_tag is None:
            # Fail early with a clear reason instead of asking the browser to
            # open the placeholder string 'N/A'.
            raise ValueError(f"author link not found on {detail_url}")
        replaced_user_link = user_link_tag['href'].replace('/gallery', '/about')

        driver.get(replaced_user_link)
        wait_for_images(driver, By.CLASS_NAME, "_2UI2c")
        user_soup = BeautifulSoup(driver.page_source, "html.parser")

        name_tag = user_soup.find("span", class_="_2UI2c")
        name = name_tag.text if name_tag else 'N/A'
        date_place_tags = user_soup.find_all("span", class_="_2cHeo")
        if date_place_tags:
            inscription_date, place = find_user_date_place(date_place_tags)
        else:
            inscription_date, place = 'N/A', 'N/A'

        # Positions of each counter inside the statistics block; position 1 is
        # not used by this crawler.
        statistics_tags = user_soup.find_all("div", class_="_2C2cW")
        page_views = _user_stat_or_na(statistics_tags, 0)
        num_followers = _user_stat_or_na(statistics_tags, 2)
        num_follow = _user_stat_or_na(statistics_tags, 3)
        num_favourites = _user_stat_or_na(statistics_tags, 4)
        num_comments_made = _user_stat_or_na(statistics_tags, 5)
        num_comments_received = _user_stat_or_na(statistics_tags, 6)

        # Expand both lists completely, then re-parse the grown page.
        load_more_content(driver, 'watchers')
        load_more_content(driver, 'watching')
        user_soup = BeautifulSoup(driver.page_source, "html.parser")

        followers = _user_names_in_section(user_soup, "watchers", "follower")
        follow = _user_names_in_section(user_soup, "watching", "follow")

        return {
            "name": name,
            "place": place,
            "inscription_date": inscription_date,
            "page_views": page_views,
            "num_followers": num_followers,
            "num_follow": num_follow,
            "num_favourites": num_favourites,
            "num_comments_made": num_comments_made,
            "num_comments_received": num_comments_received,
            "followers": followers,
            "follow": follow,
        }
    except Exception as e:
        # Best-effort boundary: one broken profile must not stop the crawl.
        print(f"Errore in user: {e}")
        return None

In [13]:
def _artwork_count_or_na(node):
    """Parse the text of ``node`` with ``convert_text``; 'N/A' if unavailable.

    'N/A' is returned when ``node`` is None, exposes no text, has empty text,
    or the text is not a number ``convert_text`` understands.
    """
    if node is None:
        return "N/A"
    text = getattr(node, "text", None)
    if not text:
        return "N/A"
    try:
        return convert_text(text.strip())
    except ValueError:
        return "N/A"


def _artwork_date_or_na(time_tag):
    """Format the text of a ``<time>`` tag with ``format_date``; 'N/A' on failure."""
    if time_tag is None:
        return "N/A"
    try:
        return format_date(time_tag.text)
    except ValueError:
        return "N/A"


def crawling_artworks(artwork, driver):
    """Scrape one artwork card and its detail page.

    Image URL, title, author, likes and comment count are read from the card;
    the detail page (first ``<a>`` of the card) supplies the publication date,
    view count, tags and the comments, after all comments were expanded with
    ``load_more_comments``.

    Any single field that is missing or unparsable becomes ``'N/A'`` instead
    of discarding the whole artwork.

    Args:
        artwork: BeautifulSoup element of one artwork card.
        driver: Selenium WebDriver used for navigation (its current page is
            changed by this call).

    Returns:
        A dict with keys ``img``, ``title``, ``author``, ``likes``,
        ``number_of_comments``, ``date``, ``number_of_views``, ``image_tags``
        and ``comments`` (a list of dicts with ``comment_author``,
        ``comment_date`` and ``comment_text``); or ``None`` when scraping
        failed (the error is printed).
    """
    try:
        img_tag = artwork.find('img')
        img_url = img_tag['src'] if img_tag else 'N/A'

        title_tag = artwork.find('h2', class_="_3CpJS")
        title = title_tag.text if title_tag else 'N/A'

        author_tag = artwork.find("span", class_="_2EfV7")
        author = author_tag.text if author_tag else 'N/A'

        # Each lookup step is guarded: previously a missing button produced the
        # string 'N/A', whose `.text` access raised AttributeError.
        likes_tag = artwork.find('button', class_='reset-button _2Pdhv')
        likes_icon = likes_tag.find('span', class_='_20Nv2') if likes_tag is not None else None
        likes_span = likes_icon.find_next_sibling('span') if likes_icon is not None else None
        likes = _artwork_count_or_na(likes_span)

        comments_tag = artwork.find('a', href=lambda href: href and href.endswith('#comments'))
        comments_icon = comments_tag.find('span') if comments_tag is not None else None
        comments_node = comments_icon.next_sibling if comments_icon is not None else None
        comments = _artwork_count_or_na(comments_node)

        detail_url = artwork.find('a')['href']
        driver.get(detail_url)
        wait_for_images(driver, By.CLASS_NAME, "_23oWS")

        load_more_comments(driver)
        detail_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Named `published` rather than `time` so the time module is not shadowed.
        published = _artwork_date_or_na(detail_soup.find('time'))

        views_button = detail_soup.find("span", class_="reset-button _3bGQn _1otI6")
        views_wrapper = views_button.find("span", class_="_3AClx") if views_button is not None else None
        views_span = views_wrapper.find('span') if views_wrapper is not None else None
        views = _artwork_count_or_na(views_span)

        tag_container = detail_soup.find("div", class_="aodIv wT4l_")
        image_tags = [link.text for link in tag_container.find_all("a")] if tag_container else []

        comments_container = detail_soup.find("div", class_="_1YhYy")
        all_comments = (
            comments_container.find_all("div", class_="_2VfPz _1LomQ")
            if comments_container
            else []
        )

        image_comments = []
        for comment in all_comments:
            comment_author_container = comment.find("span", class_="_2vKEO")
            comment_text_container = comment.find(
                "span", class_="_2PHJq public-DraftStyleDefault-ltr"
            )
            image_comments.append(
                {
                    "comment_author": comment_author_container.text if comment_author_container else "N/A",
                    "comment_date": _artwork_date_or_na(comment.find("time")),
                    "comment_text": comment_text_container.text if comment_text_container else "N/A",
                }
            )

        return {
            "img": img_url,
            "title": title,
            "author": author,
            "likes": likes,
            "number_of_comments": comments,
            "date": published,
            'number_of_views': views,
            'image_tags': image_tags,
            'comments': image_comments,
        }
    except Exception as e:
        # Best-effort boundary: one broken artwork must not stop the crawl.
        print(f"Error extracting data for artwork: {e}")
        return None
   

In [None]:
# Crawl configuration.
site_url = "https://www.deviantart.com"
MAX_PAGES = 20  # number of listing pages to crawl
MAX_RETRIES_PER_PAGE = 3  # consecutive WebDriver failures tolerated on one page

options = ChromeOptions()
# options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options)

# Results consumed by the CSV export cells. Entries may be None when a single
# artwork or profile could not be scraped.
artworks_data = []
user_data = []

i = 0
failed_attempts = 0
try:
    while i < MAX_PAGES:
        try:
            # Always navigate to page `i` from the start URL. crawling_artworks
            # and crawling_user leave the browser on detail/profile pages, so
            # refreshing the current page after an error would not bring the
            # listing back.
            go_to_next_page(driver, i, site_url)
            wait_for_images(driver, By.CLASS_NAME, "_3_LJY")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            artworks = soup.find_all("div", class_="_3Y0hT _3oBlM")
            filtered_artworks = [
                artwork for artwork in artworks if not artwork.find("span", class_="_10VHP")
            ]
        except WebDriverException as e:
            print(f"Errore in crawling: {e}")
            failed_attempts += 1
            if failed_attempts >= MAX_RETRIES_PER_PAGE:
                # Later pages are reached by clicking through this one, so
                # there is no point in going on.
                print(f"Troppi errori consecutivi sulla pagina {i}: crawling interrotto.")
                break
            time.sleep(5)
            continue

        # Nothing above has appended yet, so a retried page never produces
        # duplicate rows.
        for artwork in filtered_artworks:
            artworks_data.append(crawling_artworks(artwork, driver))
            user_data.append(crawling_user(artwork, driver))
        print(len(artworks_data))
        print(len(user_data))
        i += 1
        failed_attempts = 0
finally:
    # Release the browser even when the crawl is interrupted.
    driver.quit()

In [16]:
# Comments are stored as JSON split into pieces of at most `chunk_size`
# characters, one piece per CSV column (presumably to stay below spreadsheet
# cell-size limits - confirm).
chunk_size = 30000

# Serialise every artwork's comments once and reuse the chunks for both the
# header width and the rows. Artworks that failed to scrape (None) are skipped.
artwork_rows = [
    (artwork, chunk_string(json.dumps(artwork["comments"], ensure_ascii=False), chunk_size))
    for artwork in artworks_data
    if artwork is not None
]

# default=0 keeps the export working (header only) when nothing was scraped.
max_comments_chunk = max((len(chunks) for _, chunks in artwork_rows), default=0)
header = [
    "img",
    "title",
    "author",
    "likes",
    "number_of_comments",
    "number_of_views",
    "date",
    "image_tags",
] + [f"comment_{i+1}" for i in range(max_comments_chunk)]
with open('../dataset/artworks.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=';')
    writer.writerow(header)

    for artwork, comments_chunk in artwork_rows:
        # Pad with empty cells so every row is as wide as the header, matching
        # what the users export does.
        padding = [""] * (max_comments_chunk - len(comments_chunk))
        writer.writerow(
            [
                artwork["img"],
                artwork["title"],
                artwork["author"],
                artwork["likes"],
                artwork["number_of_comments"],
                artwork["number_of_views"],
                artwork["date"],
                artwork["image_tags"],
            ]
            + comments_chunk
            + padding
        )

In [17]:
# Follower / following lists are stored as JSON split into pieces of at most
# `chunk_size` characters, one piece per CSV column.
chunk_size = 30000

# Serialise each user's lists once and reuse the chunks for both the header
# width and the rows. Users that failed to scrape (None) are skipped.
user_rows = [
    (
        user,
        chunk_string(json.dumps(user["followers"], ensure_ascii=False), chunk_size),
        chunk_string(json.dumps(user["follow"], ensure_ascii=False), chunk_size),
    )
    for user in user_data
    if user is not None
]

# default=0 keeps the export working (header only) when nothing was scraped.
max_followers_chunks = max((len(followers) for _, followers, _ in user_rows), default=0)
max_follow_chunks = max((len(follow) for _, _, follow in user_rows), default=0)
header = (
    [
        "name",
        "place",
        "inscription_date",
        "number_page_views",
        "number_followers",
        "number_follow",
        "number_favourites",
        "number_comments_made",
        # NOTE(review): "receveid" looks like a typo for "received"; it is left
        # unchanged because consumers of the CSV may rely on this column name.
        "number_comments_receveid",
    ]
    + [f"followers_part_{i+1}" for i in range(max_followers_chunks)]
    + [f"follow_part_{i+1}" for i in range(max_follow_chunks)]
)
with open("../dataset/users.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(header)
    for user, followers_chunk, follow_chunk in user_rows:
        # Pad both groups with empty cells so every row matches the header.
        followers_cells = followers_chunk + [""] * (max_followers_chunks - len(followers_chunk))
        follow_cells = follow_chunk + [""] * (max_follow_chunks - len(follow_chunk))
        row = [
            user["name"],
            user["place"],
            user["inscription_date"],
            user["page_views"],
            user["num_followers"],
            user["num_follow"],
            user["num_favourites"],
            user["num_comments_made"],
            user["num_comments_received"],
        ]
        writer.writerow(row + followers_cells + follow_cells)
            