In [37]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.select import Select
import pandas as pd

In [42]:
url = "https://steamdb.info/sales/"
WAIT_TIMEOUT_S = 10  # seconds to wait for the table controls / rows to show up

# Column-oriented accumulator: one list per output column. All lists are
# appended to together (see the end of the loop body) so they stay aligned.
steam_data = {
    "ID": [],
    "Nome": [],
    "SubInfo": [],
    "Desconto": [],
    "Preco": [],
    "Rating": [],
    "Lancamento": [],
}

driver_options = webdriver.FirefoxOptions()
# The `headless` property setter was deprecated and later removed in
# Selenium 4; passing the command-line flag works across versions.
driver_options.add_argument("-headless")
driver_options.set_preference("permissions.default.image", 2)  # Disable images
driver_options.set_preference("permissions.default.stylesheet", 2)  # Disable CSS
driver_options.set_preference("plugin.state.flash", 0)  # Disable Flash
driver_options.set_preference("webgl.disabled", True)  # Disable WebGL
driver_options.set_preference("gfx.direct2d.disabled", True)  # Disable hardware acceleration
driver_options.set_preference("browser.cache.disk.enable", False)  # Disable browser disk cache
driver_options.set_preference("browser.cache.memory.enable", False)  # Disable browser memory cache

driver = webdriver.Firefox(options=driver_options)
try:
    driver.maximize_window()
    driver.get(url)

    wait = WebDriverWait(driver, WAIT_TIMEOUT_S)

    # Wait for the page-size dropdown instead of assuming it is already in
    # the DOM right after navigation, then pick the option whose value is
    # '-1' so every row is listed on a single page.
    entries = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "[name='DataTables_Table_0_length']")
        )
    )
    entry = Select(entries)
    entry.select_by_value('-1')

    # The wait returns the located elements, so no second lookup is needed.
    rows = wait.until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "app"))
    )

    for row in rows:
        try:
            # ID
            app_id = row.get_attribute("data-appid")
            # Name
            app_name = row.find_element(By.CSS_SELECTOR, "td:nth-child(3) a").text

            # SubInfo (optional: not every row has the span)
            try:
                subinfo = row.find_element(
                    By.CSS_SELECTOR, "td:nth-child(3) div span:nth-child(1)"
                ).text
            except NoSuchElementException:
                subinfo = ""

            # Discount
            discount = row.find_element(By.CSS_SELECTOR, "td:nth-child(4)").text
            # Price
            price = row.find_element(By.CSS_SELECTOR, "td:nth-child(5)").text
            # Rating
            rating = row.find_element(By.CSS_SELECTOR, "td:nth-child(6)").text
            # Release date
            release = row.find_element(By.CSS_SELECTOR, "td:nth-child(7)").text

            # TODO: the sale start/end dates are not being captured.
            #   timeago.js renders those values, but even after waiting for
            #   the script to run the text is not exposed to Selenium.
            #   Attempted: WebDriverWait(row, 10) on "td:nth-child(8)" with an
            #   "N/A" fallback on NoSuchElementException/TimeoutException.
            #   Possible fixes: 1. PhantomJS (not tested);
            #                   2. executing JS through Selenium (very slow).

            # Append only after every field was read, so a failing row never
            # leaves the column lists with different lengths.
            steam_data["ID"].append(app_id)
            steam_data["Nome"].append(app_name)
            steam_data["SubInfo"].append(subinfo)
            steam_data["Desconto"].append(discount)
            steam_data["Preco"].append(price)
            steam_data["Rating"].append(rating)
            steam_data["Lancamento"].append(release)

        except Exception as e:
            # Best-effort scrape: report the bad row and continue with the rest.
            print(f"Erro ao processar a linha: {e}")
finally:
    # Always shut the browser down, even if navigation or a wait failed,
    # so no Firefox process is left behind.
    driver.quit()

In [43]:
# Turn the column lists collected by the scraper into a DataFrame
# (one column per key of `steam_data`) and display it as the cell output.
steam_df = pd.DataFrame.from_dict(steam_data)
steam_df

Unnamed: 0,ID,Nome,SubInfo,Desconto,Preco,Rating,Lancamento
0,1507190,Machinika: Museum,Free To Keep,-100%,"R$ 0,00",87.68%,Mar 2021
1,289130,ENDLESS™ Legend,Free To Keep,-100%,"R$ 0,00",82.24%,Sep 2014
2,282800,100% Orange Juice,Free To Keep,-100%,"R$ 0,00",90.41%,Sep 2013
3,232090,Killing Floor 2,Play For Free,-95%,"R$ 2,79",86.56%,Nov 2016
4,1086940,Baldur's Gate 3,Weekend Deal,-15%,"R$ 169,99",95.93%,Aug 2023
...,...,...,...,...,...,...,...
5723,2063180,The Tower on the Borderland,Introductory Offer,-10%,"R$ 44,99",—,May 2024
5724,2026630,H.I.v.C.A.: Human Intelligence vs Computer Alg...,Introductory Offer,-10%,"R$ 29,69",—,May 2024
5725,1640370,The Court Of Wanderers,Introductory Offer,-10%,"R$ 34,64",—,May 2024
5726,1534870,风之界限 the border of wind,Week Long Deal,-10%,"R$ 26,99",—,Apr 2023


In [44]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Service-account key file; the relative path is resolved against the
# notebook's working directory.
key_file = 'beanalytictest-9034e310532b.json'
credentials = service_account.Credentials.from_service_account_file(key_file)

In [54]:
# Display the project ID carried by the loaded credentials (quick sanity check).
credentials.project_id

'beanalytictest'

In [55]:
# BigQuery client bound to the project named in the service-account credentials.
project_id = credentials.project_id
client = bigquery.Client(project=project_id, credentials=credentials)

# Fully-qualified destination table: <project>.<dataset>.<table>
dataset_id = 'beanalytic'
table_id = f'{project_id}.{dataset_id}.steamdb'

In [56]:
# Every scraped column is stored as a STRING; the names match the keys used
# when building the DataFrame.
column_names = ('ID', 'Nome', 'SubInfo', 'Desconto', 'Preco', 'Rating', 'Lancamento')
schema = [bigquery.SchemaField(column, 'STRING') for column in column_names]

# Create the destination table; exists_ok=True makes the call a no-op error-wise
# when the table is already there, so the cell can be re-run.
table = bigquery.Table(table_id, schema=schema)
a = client.create_table(table, exists_ok=True)
a

Table(TableReference(DatasetReference('beanalytictest', 'beanalytic'), 'steamdb'))

In [57]:
# Load configuration: same schema as the table, and WRITE_TRUNCATE so each run
# replaces the table contents instead of appending to them.
job_config = bigquery.LoadJobConfig()
job_config.schema = schema
job_config.write_disposition = "WRITE_TRUNCATE"

# Upload the DataFrame and block until the load job has finished.
job = client.load_table_from_dataframe(steam_df, table_id, job_config=job_config)
job.result()

LoadJob<project=beanalytictest, location=southamerica-east1, id=4a1c1873-1ddd-4427-82f1-396fdbf6d506>