In [1]:
import numpy as np 
import pandas as p
import requests
from bs4 import BeautifulSoup

In [2]:
# Candidate sites evaluated for the scraping project.
url1, url2, url3 = (
    "https://www.plusvalia.com",
    "https://www.deviantart.com",
    "https://store.steampowered.com",
)

# Posibilidades con DeviantArt

1. Búsqueda por palabra, por ejemplo "sky" (240k resultados):
    https://www.deviantart.com/search?q=sky&cursor=MTQwYWI2MjA9NCY1OTBhY2FkMD03MiZkMTc0YjZiYz1OJTJGQSY3ODAwOTk4MiU1QjAlNUQ9NjAxNzM0MzgxJjc4MDA5OTgyJTVCMSU1RD05Njg3OTExNjYmNzgwMDk5ODIlNUIyJTVEPTg4Mzg5NTgwNCY3ODAwOTk4MiU1QjMlNUQ9ODg2NjA2Nzk4Jjc4MDA5OTgyJTVCNCU1RD02OTcxNDU0NzQ

2. iterar por las páginas a partir del botón next

3. de cada imagen sacar:
    - enlace de la imagen
    - título
    - autor
    - num. favoritos
    - num. comentarios
    - num. views
    - tags
    - fecha de publicación
    - tamaño de la imagen

No es necesario autenticarse.

4. Términos de uso: https://www.deviantart.com/about/policy/service 
    Hay una sección sobre scraping e IAs, pero no se menciona ninguna limitación a la hora de hacer scraping.

# Posibilidades con steam

1. búsqueda de juegos por género, ejemplo "free to play"
    https://store.steampowered.com/genre/Free%20to%20Play/

    Es necesario selenium para paginar contenido.

2. de cada juego, en su página, extraer:
    - título
    - requisitos del sistema, mínimo y recomendado
    - desarrollador
    - género
    - editor
    - fecha de lanzamiento
    - link al vídeo
    - link a las imágenes

3. Acuerdo de privacidad: https://store.steampowered.com/privacy_agreement/?snr=1_44_44_
    Información legal: https://store.steampowered.com/legal/?snr=1_44_44_
    Sin menciones a scraping.

**Conclusiones**:
Ambas se pueden scrapear y, dependiendo de lo que queramos sacar, puede ser más sencillo o más complejo. La de DeviantArt me parece una página más agradable y limpia, pero ambas me parecen buenas opciones. No he hecho la revisión de plusvalia.com pero, por encima, parece muy similar a Idealista o Fotocasa.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import urllib.parse
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import json

In [57]:
# Search terms sent to the DeviantArt search page, one per art category.
topics = [
    "Fantasy art", "Science fiction art", "Anime and manga art",
    "Fan art (for specific fandoms)", "Digital paintings",
    "Traditional drawings", "Character designs", "Creature concepts",
    "Landscape art", "Abstract art", "Surrealism", "Steampunk art",
    "Cyberpunk art", "Gothic art", "Horror art", "Cosplay photography",
    "Pixel art", "Concept art", "Comics and graphic novels",
    "Street art and graffiti",
]


In [7]:
import json
import time
import requests
from bs4 import BeautifulSoup
import urllib.parse
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent

class DeviantArtScraper:
    """
    Scrape DeviantArt search results for a list of topics.

    Search result pages are walked with a Selenium Chrome driver. Every
    deviation link found on a result page is then downloaded with ``requests``
    and parsed with BeautifulSoup. Parsed records accumulate in
    ``self.information`` and can be turned into a DataFrame (``generate_df``)
    and written to disk (``save_csv`` / ``save_json``).

    Args:
        max_pages (int): Maximum number of result pages visited per topic.
        topics (list): Search terms to query.
        save_path (str): Directory where ``images_db.csv`` and
            ``images_db.json`` are written.
    """

    def __init__(self, max_pages: int, topics: list, save_path: str):
        self.max_pages = max_pages
        self.topics = topics
        self.base_url = "https://www.deviantart.com/search"
        self.save_path_csv = save_path + '/images_db.csv'
        self.save_path_json = save_path + '/images_db.json'
        # Three parallel lists: entry i of each list describes the same image.
        self.information = {
            'data': [],
            'search_topic': [],
            'page_num': []
        }
        # Links whose page could not be downloaded or parsed.
        self.error_links = []
        # Build the user agent first so a failure here cannot leave an
        # orphaned browser behind.
        self.user_agent = UserAgent()
        self.start_driver()

    def close_driver(self):
        """Shut the browser down and end the WebDriver session."""
        # quit() ends the whole session; close() only closes the current window.
        self.driver.quit()

    def start_driver(self):
        """Start the Chrome WebDriver used to browse the search pages."""
        self.driver = webdriver.Chrome()

    def run_scrapper(self):
        """
        Crawl up to ``max_pages`` result pages for every topic.

        The browser is always shut down, even if the crawl raises. When the
        crawl finishes normally the collected records are turned into
        ``self.df``.
        """
        try:
            for topic in self.topics:
                parsed_topic = urllib.parse.quote(topic)
                search_url = f'{self.base_url}?q={parsed_topic}'
                page = 0
                while page < self.max_pages:
                    self.driver.get(search_url)

                    # Collect the link of every image tile on the result page.
                    image_classes = self.driver.find_elements(By.CLASS_NAME, "_3Y0hT")
                    image_links = [
                        image.find_element(By.TAG_NAME, 'a').get_attribute('href')
                        for image in image_classes
                    ]
                    self.navigate_images_links(image_links=image_links, page=page, topic=topic)

                    # Move on to the next result page; stop this topic when
                    # there is no "Next" link.
                    try:
                        next_page = self.driver.find_element(By.LINK_TEXT, 'Next')
                        search_url = next_page.get_attribute('href')
                    except Exception as e:
                        print(f'Error al encontrar una nueva pagina {topic}, page: {page}')
                        print(e)
                        break
                    page += 1
        finally:
            # Release the browser whatever happened during the crawl.
            self.close_driver()
        self.generate_df()

    def navigate_images_links(self, image_links: list, page: int, topic: str):
        """
        Scrape every link and store the resulting records.

        Links that raise while being processed, or whose page could not be
        downloaded (``get_info_from_url`` returned ``None``), are recorded in
        ``self.error_links`` and contribute nothing to ``self.information``.

        Args:
            image_links (list): Deviation page URLs to scrape.
            page (int): Index of the result page the links came from.
            topic (str): Search topic the links came from.
        """
        for link in image_links:
            try:
                record = self.get_info_from_url(link)
            except Exception as e:
                print(f' Ha habido un error al tratar de procesar el siguiente link: {link}, pagina {page}, tema {topic}')
                print(e)
                self.error_links.append(link)
                continue

            if record is None:
                # Download failed: keep None rows out of the dataset.
                self.error_links.append(link)
                continue

            self.information['data'].append(record)
            self.information['search_topic'].append(topic)
            self.information['page_num'].append(page)

    def generate_df(self):
        """Build ``self.df``: topic and page columns plus one column per scraped field."""
        context = pd.DataFrame({
            'search_topic': self.information['search_topic'],
            'page_num': self.information['page_num'],
        })
        # Normalising the plain list also works when nothing was scraped.
        fields = pd.json_normalize(self.information['data'])
        self.df = pd.concat([context, fields], axis=1)

    @staticmethod
    def _ensure_parent_dir(file_path: str):
        """Create the directory that will contain ``file_path`` if it is missing."""
        import os

        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_csv(self):
        """Write ``self.df`` to ``self.save_path_csv``, building it first if needed."""
        if getattr(self, 'df', None) is None:
            self.generate_df()
        self._ensure_parent_dir(self.save_path_csv)
        self.df.to_csv(self.save_path_csv, sep=',')

    def save_json(self):
        """Dump the raw ``self.information`` dictionary to ``self.save_path_json``."""
        self._ensure_parent_dir(self.save_path_json)
        with open(self.save_path_json, 'w') as json_file:
            json.dump(self.information, json_file)

    def convert_views(self, metric_str: str) -> int:
        """
        Converts a string of any metric field to an integer,
        replacing 'K' for thousands or 'M' for millions.

        The scaled value is rounded rather than truncated, because the float
        product can land just below the intended integer.
        """
        cleaned = metric_str.strip().replace(",", "")
        for suffix, factor in (("K", 1000), ("M", 1000000)):
            if suffix in cleaned:
                return int(round(float(cleaned.replace(suffix, "")) * factor))
        return int(cleaned)

    def _pick_metric(self, metrics: list, label: str, index: int = 0, default=None) -> int:
        """
        Return the count of the metric whose text contains ``label``.

        Only the selected entry is converted, so other matching entries that
        do not start with a number cannot break the lookup.

        Args:
            metrics (list): Metric texts such as ``'163 Views'``.
            label (str): Substring identifying the metric.
            index (int): Which matching entry to use (0 first, -1 last).
            default: Value returned when nothing matches; ``None`` means raise.

        Raises:
            IndexError: No metric matches and no default was given.
        """
        tokens = [metric.split(" ")[0] for metric in metrics if label in metric]
        if not tokens:
            if default is None:
                raise IndexError(f"metric '{label}' not found in {metrics}")
            return default
        return self.convert_views(tokens[index])

    def get_info_from_url(self, url: str) -> dict:
        """
        Get information from a given URL.

        Args:
            url (str): The URL of the webpage from which information needs to be extracted.

        Returns:
            dict: A dictionary containing the extracted information from the
            webpage, or ``None`` when the request fails or the status code is
            not 200.

        Raises:
            AttributeError, TypeError: An expected page element is missing.
            IndexError: The favourites, comments or views metric is missing.
        """
        headers = {'User-Agent': self.user_agent.random}
        with requests.Session() as session:
            try:
                page = session.get(url, timeout=5, headers=headers)
            except requests.exceptions.RequestException as e:
                print(f"An error occurred during the request: {e}")
                return None

        if page.status_code != 200:
            # Handle cases where the page is not retrieved successfully
            print("Failed to get page")
            return None

        soup = BeautifulSoup(page.text, "html.parser")

        # Main fields of the image
        image_url = soup.find("div", class_="_2SlAD").find("img")["src"]
        image_title = soup.find("div", class_="U2aSH").text
        image_author = soup.find("span", class_="_12F3u").text

        # Image metrics (favs, comments, views and private collections)
        metrics = [metric.text for metric in soup.find_all("span", class_="_3AClx")]

        # The singular labels ("1 Favourite") are substrings of the plural
        # ones, so matching on the singular covers both forms.
        # Several entries can match for favourites; the correct one is the last.
        image_favs = self._pick_metric(metrics, "Favourite", index=-1)
        num_comments = self._pick_metric(metrics, "Comment")
        image_views = self._pick_metric(metrics, "View")
        # This metric is optional on the page: 0 when it is absent.
        private_collections = self._pick_metric(metrics, "Collected Privately", default=0)

        tags = [tag.text for tag in soup.find_all("span", class_="_1nwad")]

        description_tag = soup.find(
            "div", class_="legacy-journal _2DahR _3bG54 maturefilter _3if5g"
        )
        if description_tag is None:
            description = None
        else:
            description = description_tag.get_text(separator=" ", strip=True).replace("\xa0", "\n")

        location_tag = soup.find("div", class_="_3FMM3")
        location = None if location_tag is None else location_tag.text.split("\xa0")[-1]

        # Text shaped like "<width>x<height>px <size> <unit>"; read it once.
        size_parts = soup.find("div", class_="_3RVC5").next_sibling.text.split("px")
        image_px = size_parts[0]
        # NOTE(review): the unit after the number is discarded, so values may
        # mix units (the sample data shows 318.04 next to 2.76) - confirm.
        image_size_mb = float(size_parts[1].strip().split(" ")[0])

        published_date = soup.find("div", class_="_1mcmq").find("time")["datetime"]

        # Always bind last_comment so images without comments do not fail
        # when the result dictionary is built.
        last_comment = None
        if num_comments > 0:
            comment_tag = soup.find("span", class_="_2PHJq")
            last_comment = comment_tag.text.strip() if comment_tag is not None else ""

        results = {
            "image_url": image_url,
            "image_title": image_title,
            "image_author": image_author,
            "image_favs": image_favs,
            "image_com": num_comments,
            "image_views": image_views,
            "private_collections": private_collections,
            "tags": tags,
            "location": location,
            "description": description,
            "image_px": image_px,
            "image_size": image_size_mb,
            "published_date": published_date,
            "last_comment": last_comment,
        }

        return results

## Ejemplo

In [36]:
# Small demo configuration: two search topics, two result pages per topic.
topics = ["Street art and graffiti", 'sky']
scraper = DeviantArtScraper(
    topics=topics,
    max_pages=2,
    save_path='./dataset',
)
# Uncomment to launch the crawl:
# scraper.run_scrapper()


In [58]:
#scraper.get_info_from_url('https://www.deviantart.com/psktear/art/blue-46653661')

# Manual check of the metric selectors against a single deviation page.
user_agent = UserAgent()

headers = {'User-Agent': user_agent.random}
# Send the random User-Agent with the request, bound the wait with a timeout,
# and close the session once the page is downloaded.
with requests.Session() as session:
    response = session.get(
        'https://www.deviantart.com/coolarts223/art/Colorful-Street-Art-at-Night-1009044894',
        headers=headers,
        timeout=5,
    )
soup = BeautifulSoup(response.text, 'html.parser')

# Keep the looked-up values instead of discarding them.
image_title = soup.find("div", class_="U2aSH").text
image_author = soup.find("span", class_="_12F3u").text

metrics = [metric.text for metric in soup.find_all("span", class_="_3AClx")]
# The singular label is a substring of the plural one, so a single test
# covers both "1 Favourite" and "2 Favourites".
image_favs = [metric.split(" ")[0] for metric in metrics if "Favourite" in metric][-1]
num_comments = [int(metric.split(" ")[0]) for metric in metrics if "Comment" in metric][0]
image_views = [metric.split(" ")[0] for metric in metrics if "View" in metric][0]

print(metrics)
print(image_favs)
print(num_comments)
print(image_views)


['1 Favourite', '0 Comments', '163 Views']
1
0
163


In [55]:
def _pick_metric(metrics: list, label: str, index: int = 0, default=None) -> int:
    """
    Return the count of the metric whose text contains ``label``.

    Only the selected entry is converted, so other matching entries that do
    not start with a number cannot break the lookup.

    Raises:
        IndexError: No metric matches and no default was given.
    """
    tokens = [metric.split(" ")[0] for metric in metrics if label in metric]
    if not tokens:
        if default is None:
            raise IndexError(f"metric '{label}' not found in {metrics}")
        return default
    return convert_views(tokens[index])


def get_info_from_url(url: str, headers: dict = None) -> dict:
    """
    Get information from a given URL.

    Args:
        url (str): The URL of the webpage from which information needs to be extracted.
        headers (dict): Optional HTTP headers (for example a User-Agent) sent
            with the request.

    Returns:
        dict: A dictionary containing the extracted information from the
        webpage, or ``None`` when the request fails or the status code is not
        200.

    Raises:
        AttributeError, TypeError: An expected page element is missing.
        IndexError: The favourites, comments or views metric is missing.
    """
    with requests.Session() as session:
        try:
            page = session.get(url, timeout=5, headers=headers)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred during the request: {e}")
            return None

    if page.status_code != 200:
        # Handle cases where the page is not retrieved successfully
        print("Failed to get page")
        return None

    soup = BeautifulSoup(page.text, "html.parser")

    # Main fields of the image
    image_url = soup.find("div", class_="_2SlAD").find("img")["src"]
    image_title = soup.find("div", class_="U2aSH").text
    image_author = soup.find("span", class_="_12F3u").text

    # Image metrics (favs, comments, views and private collections)
    metrics = [metric.text for metric in soup.find_all("span", class_="_3AClx")]

    # The singular labels are substrings of the plural ones, so matching on
    # the singular covers both forms.
    # Several entries can match for favourites; the correct one is the last.
    image_favs = _pick_metric(metrics, "Favourite", index=-1)
    num_comments = _pick_metric(metrics, "Comment")
    image_views = _pick_metric(metrics, "View")
    # This metric is optional on the page: 0 when it is absent.
    private_collections = _pick_metric(metrics, "Collected Privately", default=0)

    tags = [tag.text for tag in soup.find_all("span", class_="_1nwad")]

    description_tag = soup.find(
        "div", class_="legacy-journal _2DahR _3bG54 maturefilter _3if5g"
    )
    if description_tag is None:
        description = None
    else:
        description = description_tag.get_text(separator=" ", strip=True).replace("\xa0", "\n")

    location_tag = soup.find("div", class_="_3FMM3")
    location = None if location_tag is None else location_tag.text.split("\xa0")[-1]

    # Text shaped like "<width>x<height>px <size> <unit>"; read it once.
    size_parts = soup.find("div", class_="_3RVC5").next_sibling.text.split("px")
    image_px = size_parts[0]
    image_size_mb = float(size_parts[1].strip().split(" ")[0])

    published_date = soup.find("div", class_="_1mcmq").find("time")["datetime"]

    # Always bind last_comment so images without comments do not fail when
    # the result dictionary is built.
    last_comment = None
    if num_comments > 0:
        comment_tag = soup.find("span", class_="_2PHJq")
        last_comment = comment_tag.text.strip() if comment_tag is not None else ""

    results = {
        "image_url": image_url,
        "image_title": image_title,
        "image_author": image_author,
        "image_favs": image_favs,
        "image_com": num_comments,
        "image_views": image_views,
        "private_collections": private_collections,
        "tags": tags,
        "location": location,
        "description": description,
        "image_px": image_px,
        "image_size": image_size_mb,
        "published_date": published_date,
        "last_comment": last_comment,
    }

    return results

def convert_views(metric_str: str) -> int:
    """
    Converts a string of any metric field to an integer,
    replacing 'K' for thousands or 'M' for millions.

    The scaled value is rounded rather than truncated, because the float
    product can land just below the intended integer. Surrounding whitespace
    and thousands separators (commas) are ignored.

    Raises:
        ValueError: The text is not a number once the suffix is removed.
    """
    cleaned = metric_str.strip().replace(",", "")
    for suffix, factor in (("K", 1000), ("M", 1000000)):
        if suffix in cleaned:
            return int(round(float(cleaned.replace(suffix, "")) * factor))
    return int(cleaned)


# Try the standalone parser on a single deviation page. The recorded run of
# this cell ended with "IndexError: list index out of range".
get_info_from_url(url='https://www.deviantart.com/coolarts223/art/Colorful-Street-Art-at-Night-1009044894')

IndexError: list index out of range

In [33]:
# Inspect the first record stored in the scraper's 'data' list.
scraper.information['data'][0]

{'image_url': 'https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/b3be1dae-3caa-4d45-be6c-3de586ba95e2/dekxdig-c4062bb4-8152-43ca-93a6-6f4a5afa2935.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcL2IzYmUxZGFlLTNjYWEtNGQ0NS1iZTZjLTNkZTU4NmJhOTVlMlwvZGVreGRpZy1jNDA2MmJiNC04MTUyLTQzY2EtOTNhNi02ZjRhNWFmYTI5MzUuanBnIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.2oOfqnCn_Jt7WjX1je45XxrWrT9334AxXWuWbu0ww-8',
 'image_title': 'Photography Session',
 'image_author': 'BisBiswas',
 'image_favs': 1600,
 'image_com': 57,
 'image_views': 720400,
 'private_collections': 1,
 'tags': ['blue',
  'camera',
  'duck',
  'man',
  'sky',
  'art',
  'aesthetic',
  'afternoon',
  'alone',
  'animals',
  'artwork',
  'beautiful',
  'clouds',
  'darkart',
  'darkness',
  'digitalart',
  'digitaldrawing',
  'digitalpainting',
  'drawing',
  '

In [5]:
# Build scraper.df from the records collected in scraper.information.
scraper.generate_df()

In [32]:
## Example of the resulting data frame.
scraper.df.head()

Unnamed: 0,search_topic,page_num,image_url,image_title,image_author,image_favs,image_com,image_views,private_collections,tags,location,description,image_px,image_size,published_date,last_comment
0,sky,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Photography Session,BisBiswas,1600,57,720400,1,"[blue, camera, duck, man, sky, art, aesthetic,...",,Instagram 👉🏿 www.instagram.com/hereisbis/,1920x1080,318.04,2021-06-04T22:38:18.000Z,💕
1,sky,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Sky Temple,AlynSpiller,2900,56,146600,0,"[bigsun, asian, clouds, conceptart, digitalart...",,Thanks for looking! instagram YouTube Facebook,912x1200,867.98,2017-06-09T18:03:29.000Z,Lovely.
2,sky,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Sky Breaker,aerroscape,2500,95,64800,0,"[breaker, clouds, digital, landscape, sky, sto...",,the water cycle ...like you learned in geograp...,7045x4981,15.47,2015-11-19T22:48:01.000Z,Pretty!
3,sky,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Sky for Dreamers,RHADS,12400,393,328500,0,[],,Recommended music: Duran Duran - Ordinary Worl...,3200x2400,1.13,2013-08-02T17:40:43.000Z,"I really like this kind of landscape comics, l..."
4,sky,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Divine Sky,BisBiswas,2100,85,459000,3,"[black, blue, devine, god, man, sky, art, aest...",,Instagram 👉🏿 www.instagram.com/hereisbis/,3840x2160,2.76,2022-02-02T22:40:28.000Z,This Art is stuning!! the sky is amazing!!! lo...


In [6]:
import os
from typing import Optional
import requests

def download_image(
    url_image: str,
    images_folder: Optional[str] = "./deviantart_images",
    timeout: float = 10,
) -> Optional[str]:
    """
    Download an image from a URL into a local folder.

    The file name is the last path segment of the URL, without the query
    string.

    Args:
        url_image (str): URL of the image to download.
        images_folder (Optional[str]): Destination folder, created if missing.
            ``None`` falls back to ``./deviantart_images``.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        Optional[str]: Path of the saved file, or ``None`` when the server
        did not answer with status 200.

    Raises:
        requests.exceptions.RequestException: The request itself failed.
    """
    if images_folder is None:
        images_folder = "./deviantart_images"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(images_folder, exist_ok=True)

    image_title = url_image.split("?")[0].split("/")[-1]
    target_path = os.path.join(images_folder, image_title)

    # The context manager releases the connection; with stream=True the body
    # is written chunk by chunk instead of being loaded fully into memory.
    with requests.get(url_image, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download image (status {response.status_code}): {url_image}")
            return None
        with open(target_path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)

    return target_path


In [7]:
image_url = "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/0a843e25-2d15-4be3-89b7-0988b20ba533/dgxans7-5ae719f2-1827-4f45-bf4e-35d6ed29c13e.png/v1/fill/w_1192,h_670,q_70,strp/infinite_love_by_mumu0909_dgxans7-pre.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7ImhlaWdodCI6Ijw9MTA4MCIsInBhdGgiOiJcL2ZcLzBhODQzZTI1LTJkMTUtNGJlMy04OWI3LTA5ODhiMjBiYTUzM1wvZGd4YW5zNy01YWU3MTlmMi0xODI3LTRmNDUtYmY0ZS0zNWQ2ZWQyOWMxM2UucG5nIiwid2lkdGgiOiI8PTE5MjAifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6aW1hZ2Uub3BlcmF0aW9ucyJdfQ.w9ViEdqc7o6LD0D1u0Ze1ycczCudKRbxVS-VLcnq4Ts"
# Download the example image; no folder is passed, so the function's default
# destination is used.
download_image(image_url)

In [31]:
# Load the scraped dataset from the project-relative location used by the
# scraper (save_path './dataset' + '/images_db.csv') instead of a
# machine-specific absolute path, so the cell also runs on other machines.
data = pd.read_csv('./dataset/images_db.csv', index_col=0, sep=',')
data.tail()

Unnamed: 0,search_topic,page_num,image_url,image_title,image_author,image_favs,image_com,image_views,private_collections,tags,location,description,image_px,image_size,published_date,last_comment
6570,Street art and graffiti,14,https://images-wixmp-ed30a86b8c4ca887773594c2....,Ethel,WhoAm-Irony,52.0,6.0,1500.0,0.0,"['coloured', 'die', 'graffiti', 'hair', 'liver...",,"My new piece in Liverpool, UK. Model Anna Ance...",6016x4000,10.21,2015-02-22T15:48:58.000Z,
6571,Street art and graffiti,14,https://images-wixmp-ed30a86b8c4ca887773594c2....,Urban Elegy - Mobile Wallpaper,EdenAgency,40.0,3.0,19500.0,0.0,"['digitalart', 'digitalillustration', 'digital...",,"Step into the haunting beauty of ""Urban Elegy,...",1024x1792,2.86,2024-01-17T08:20:46.000Z,"All I can say is, ""Wow, AI!"""
6572,Street art and graffiti,14,https://images-wixmp-ed30a86b8c4ca887773594c2....,Street Art,gnuman12,27.0,2.0,1600.0,0.0,"['ai', 'colorful', 'girl', 'highres', 'cellpho...",,,5504x3456,24.27,2023-09-17T00:29:55.000Z,Saw it this morning on my cell phone and I did...
6573,Street art and graffiti,14,https://images-wixmp-ed30a86b8c4ca887773594c2....,Graffiti 3204,cmdpirxII,15.0,0.0,1800.0,0.0,"['aerosol', 'artist', 'bombing', 'can', 'chara...",,I only took the photo !! I have no claim to th...,1600x941,854.9,2014-10-16T20:33:12.000Z,
6574,Street art and graffiti,14,https://images-wixmp-ed30a86b8c4ca887773594c2....,RADICALS CREW,imagophil,10.0,0.0,620.0,0.0,"['graffiti', 'graffitiart', 'urban', 'urbexpho...",,street art gallery,2131x1795,2.8,2022-10-23T18:27:21.000Z,
