In [1]:
import numpy as np 
import pandas as p
import requests
from bs4 import BeautifulSoup

In [2]:
# Home pages of the three candidate sites compared in the notes below.
url1, url2, url3 = (
    "https://www.plusvalia.com",
    "https://www.deviantart.com",
    "https://store.steampowered.com",
)

# Posibilidades con DeviantArt

1. Búsqueda por palabra, por ejemplo "sky" (240k resultados):
    https://www.deviantart.com/search?q=sky&cursor=MTQwYWI2MjA9NCY1OTBhY2FkMD03MiZkMTc0YjZiYz1OJTJGQSY3ODAwOTk4MiU1QjAlNUQ9NjAxNzM0MzgxJjc4MDA5OTgyJTVCMSU1RD05Njg3OTExNjYmNzgwMDk5ODIlNUIyJTVEPTg4Mzg5NTgwNCY3ODAwOTk4MiU1QjMlNUQ9ODg2NjA2Nzk4Jjc4MDA5OTgyJTVCNCU1RD02OTcxNDU0NzQ

2. Iterar por las páginas de resultados a partir del botón "Next".

3. de cada imagen sacar:
    - enlace de la imagen
    - título
    - autor
    - num. favoritos
    - num. comentarios
    - num. views
    - tags
    - fecha de publicación
    - tamaño de la imagen

NO es necesario autenticarse.

4. Términos de uso: https://www.deviantart.com/about/policy/service 
    Hay una sección sobre scraping e IA, pero no se menciona ninguna limitación a la hora de hacer scraping.

# Posibilidades con Steam

1. búsqueda de juegos por género, ejemplo "free to play"
    https://store.steampowered.com/genre/Free%20to%20Play/

    Es necesario usar Selenium para paginar el contenido.

2. de cada juego, en su página, extraer:
    - título
    - requisitos del sistema, mínimo y recomendado
    - desarrollador
    - género
    - editor
    - fecha de lanzamiento
    - link al vídeo
    - link a las imágenes

3. Acuerdo de privacidad: https://store.steampowered.com/privacy_agreement/?snr=1_44_44_
    Información legal: https://store.steampowered.com/legal/?snr=1_44_44_
    Sin menciones a scraping.

**Conclusiones**:
Ambas se pueden scrapear y, dependiendo de lo que queramos extraer, puede resultar más sencillo o más complejo. DeviantArt me parece una página más agradable y limpia, pero ambas me parecen buenas opciones. No he revisado plusvalia.com pero, por encima, parece muy similar a Idealista o Fotocasa.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
# The wildcard import stays ahead of the explicit imports below so that they
# keep taking precedence over any same-named object re-exported by utils.
from utils import *

import json
import os
import time
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
# Candidate search terms (presumably meant as the `topics` argument of
# DeviantArtScraper — confirm against the cell that instantiates it).
topics = [
    # Genres and themes
    'Fantasy art',
    'Science fiction art',
    'Anime and manga art',
    'Fan art (for specific fandoms)',
    # Media and techniques
    'Digital paintings',
    'Traditional drawings',
    # Design work
    'Character designs',
    'Creature concepts',
    # Styles and movements
    'Landscape art',
    'Abstract art',
    'Surrealism',
    'Steampunk art',
    'Cyberpunk art',
    'Gothic art',
    'Horror art',
    # Other formats
    'Cosplay photography',
    'Pixel art',
    'Concept art',
    'Comics and graphic novels',
    'Street art and graffiti',
]


In [3]:
class DeviantArtScraper:
    """Scrape DeviantArt search results for a list of topics with Selenium.

    For every topic the scraper opens the search page, collects the link of
    each result card, hands every link to ``get_info_from_url`` and stores
    whatever that helper returns together with the topic and the page index.

    Parameters
    ----------
    max_pages : int
        Maximum number of result pages visited per topic.
    topics : iterable of str
        Search terms; each one is URL-quoted and sent as the ``q`` parameter.
    page_load_wait : float, optional
        Seconds to sleep after requesting a result page. Defaults to 2, the
        value that used to be hard-coded.

    Attributes
    ----------
    information : dict
        Three parallel lists: ``data`` (one entry per processed link),
        ``search_topic`` and ``page_num``.
    error_links : list
        Links whose processing raised an exception.
    df : pandas.DataFrame or None
        Flattened view of ``information``; populated by ``generate_df``.
    """

    # CSS class used to locate each result card on the search page.
    # NOTE(review): this looks like an auto-generated class name, so it may
    # stop matching when the site is redeployed — confirm before long runs.
    IMAGE_CARD_CLASS = "_3Y0hT"

    def __init__(self, max_pages, topics, page_load_wait=2):
        self.max_pages = max_pages
        self.topics = topics
        self.page_load_wait = page_load_wait
        self.base_url = "https://www.deviantart.com/search"
        self.save_path_csv = './csv_data/images_db.csv'
        # Fix: the JSON dump used to be written to a file with a .csv extension.
        self.save_path_json = './json_data/images_db.json'
        self.information = {
            'data': [],
            'search_topic': [],
            'page_num': []
        }
        self.error_links = []
        self.df = None
        self.start_driver()

    def close_driver(self):
        """Shut down the browser session."""
        # quit() ends the whole WebDriver session, whereas close() only
        # closes the current window.
        self.driver.quit()

    def start_driver(self):
        """Launch a Chrome WebDriver and keep it in ``self.driver``."""
        self.driver = webdriver.Chrome()

    def run_scrapper(self):
        """Scrape every topic, then close the browser.

        Results accumulate in ``self.information``; links that could not be
        processed are collected in ``self.error_links``. The browser is closed
        even when an unexpected error interrupts the run, so ``start_driver``
        must be called again before a second run.
        """
        try:
            for topic in self.topics:
                self._scrape_topic(topic)
        finally:
            self.close_driver()

    def _scrape_topic(self, topic):
        """Visit up to ``max_pages`` result pages for one search term."""
        search_url = f'{self.base_url}?q={urllib.parse.quote(topic)}'

        for page in range(self.max_pages):
            self.driver.get(search_url)
            time.sleep(self.page_load_wait)

            # All links are read before any of them is processed, so the
            # browser is still on the result page when "Next" is looked up.
            for link in self._collect_image_links():
                self._store_image_info(link, topic, page)

            # No further page will be visited: skip the "Next" lookup so the
            # last allowed page does not print a spurious error.
            if page + 1 >= self.max_pages:
                break

            try:
                next_page = self.driver.find_element(By.LINK_TEXT, 'Next')
                search_url = next_page.get_attribute('href')
            except Exception as e:
                print(f'Error al encontrar una nueva pagina {topic}, page: {page}')
                print(e)
                break
            if not search_url:
                break

    def _collect_image_links(self):
        """Return the href of the first anchor of every result card on the page."""
        links = []
        for card in self.driver.find_elements(By.CLASS_NAME, self.IMAGE_CARD_CLASS):
            # find_elements returns an empty list instead of raising, so a
            # card without a link no longer aborts the whole run.
            anchors = card.find_elements(By.TAG_NAME, 'a')
            if not anchors:
                continue
            href = anchors[0].get_attribute('href')
            if href:
                links.append(href)
        return links

    def _store_image_info(self, link, topic, page):
        """Fetch the details of one link and append them to ``information``."""
        try:
            info = get_info_from_url(link)
        except Exception as e:
            print(f' Ha habido un error al tratar de procesar el siguiente link: {link}, pagina {page}, tema {topic}')
            print(e)
            self.error_links.append(link)
            return
        # Append to the three lists together so they always stay parallel.
        self.information['data'].append(info)
        self.information['search_topic'].append(topic)
        self.information['page_num'].append(page)

    def generate_df(self):
        """Build ``self.df``, a pandas DataFrame, from the collected results.

        The frame holds ``search_topic`` and ``page_num`` followed by one
        column per key of the ``data`` records, flattened with
        ``pandas.json_normalize``.
        """
        context = pd.DataFrame({
            'search_topic': self.information['search_topic'],
            'page_num': self.information['page_num'],
        })
        details = pd.json_normalize(self.information['data'])
        self.df = pd.concat([context, details], axis=1)

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it is missing."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_csv(self):
        """Write the results to ``self.save_path_csv`` in CSV format.

        The DataFrame is generated first when ``generate_df`` has not been
        called yet.
        """
        if getattr(self, 'df', None) is None:
            self.generate_df()
        self._ensure_parent_dir(self.save_path_csv)
        # Fix: the original read self.save_path, an attribute that is never set.
        self.df.to_csv(self.save_path_csv, sep=',')

    def save_json(self):
        """Write the raw ``information`` dictionary to ``self.save_path_json`` as JSON."""
        self._ensure_parent_dir(self.save_path_json)
        with open(self.save_path_json, 'w') as json_file:
            json.dump(self.information, json_file)



In [4]:
# Trial run: two search terms, one result page each.
topics = ["Fantasy Art", "sky"]
scraper = DeviantArtScraper(topics=topics, max_pages=1)
scraper.run_scrapper()
print(scraper.information)


{'data': [{'image_url': 'https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/e090c796-b870-437e-93ec-bc3ea196a4d9/dciirjp-22b2f65b-b9f5-4cd9-8fcb-4fac2e2b8125.jpg/v1/fill/w_1600,h_864,q_75,strp/the_magocracy_of_helinde_by_ferdinandladera_dciirjp-fullview.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7ImhlaWdodCI6Ijw9ODY0IiwicGF0aCI6IlwvZlwvZTA5MGM3OTYtYjg3MC00MzdlLTkzZWMtYmMzZWExOTZhNGQ5XC9kY2lpcmpwLTIyYjJmNjViLWI5ZjUtNGNkOS04ZmNiLTRmYWMyZTJiODEyNS5qcGciLCJ3aWR0aCI6Ijw9MTYwMCJ9XV0sImF1ZCI6WyJ1cm46c2VydmljZTppbWFnZS5vcGVyYXRpb25zIl19.PREfvF-2EEBrsKMB2wz97Tr7MMYq8zCMAFv1MMLmwzw', 'image_title': 'The Magocracy of Helinde', 'image_author': 'FerdinandLadera', 'image_favs': 1100, 'image_com': 28, 'image_views': 13700, 'private_collections': 0, 'tags': [], 'location': None, 'description': 'Fantasy landscape, personal art.', 'image_px': '5000x270

In [5]:
# Flatten the scraped records into scraper.df: search_topic and page_num
# plus one column per field of each scraped record.
scraper.generate_df()

In [6]:
# Show the DataFrame built by generate_df() (the cell's last expression is rendered).
scraper.df

Unnamed: 0,search_topic,page_num,image_url,image_title,image_author,image_favs,image_com,image_views,private_collections,tags,location,description,image_px,image_size,published_date,num_comments,last_comment
0,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,The Magocracy of Helinde,FerdinandLadera,1100,28,13700,0,[],,"Fantasy landscape, personal art.",5000x2700,11.77,2018-07-28T15:29:09.000Z,28,
1,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Forgotten Kingdom I,JJcanvas,3000,42,79600,1,"[castle, conceptart, elven, epic, fantasy, for...",,Hey! Here's something new I painted in photosh...,1400x875,1.1,2018-09-27T10:58:40.000Z,42,😍
2,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Solitude,PiotrDura,3300,69,54600,0,"[ancient, clouds, concept, environment, epic, ...",,Personal project,1484x2000,900.75,2017-08-10T16:54:30.000Z,68,All that I have looked at just 1 word (Awesome)
3,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Mushy Land,Raphael-Lacoste,2100,52,68500,0,"[birds, conceptart, fantasy, illustration, lan...",,Concept art done for a personal project hope y...,1920x1113,594.45,2018-04-03T12:53:41.000Z,52,Definitely looks like it'll blend in well in S...
4,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,The Witch King of Angmar,AnatoFinnstark,2300,39,288200,0,"[fantasy, illustration, lotr, middleearth, naz...",,Limited Print during only 72h from now ! Avail...,2000x1074,635.07,2022-02-05T09:13:51.000Z,39,Did much to destroy the already decayed kingdo...
5,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,City of elves,panjoool,1900,58,733600,0,"[blender, conceptart, digitalart, digitalpaint...",,inspired by the Elf city in LOTR by J. R. R. T...,3000x1568,4.17,2022-01-06T15:17:21.000Z,58,Incroyablement jolis
6,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Cosmic Forest,JJcanvas,5400,129,154200,2,"[cosmic, digitalart, digitalpainting, fantasyi...",,CORE MEMBERSHIP GIVEAWAY - I'm giving away 10 ...,1920x1080,1.72,2020-08-26T13:18:46.000Z,129,beautyfull ♡ wanna learn that
7,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Enchanted Lake,JJcanvas,2900,58,45100,0,"[conceptart, fantasy, fantasyart, fantasyillus...",,Reworked an old piece in ProCreate! Website - ...,1650x928,1.21,2019-02-25T19:09:43.000Z,58,I love your artwork! Its so beautiful and fuel...
8,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,Strange The Dreamer Illustration I,JJcanvas,902,10,94600,0,"[fantasy, fantasyart, bookillustration, fantas...",,An illustrator for Laini Taylor's Strane the D...,1800x1276,1.73,2021-06-29T11:24:08.000Z,10,Nice!!! Check out my youtube channle: https://...
9,Fantasy Art,0,https://images-wixmp-ed30a86b8c4ca887773594c2....,My town,maxasabin,1200,32,649400,0,"[conceptart, digitalart, environment, fantasy,...",,Making of PSD file is available on my Patreon ...,1920x1080,2.38,2022-04-13T21:32:57.000Z,32,
