Imports necesarios

In [1]:
import pandas as pd
import random
import requests

from bs4 import BeautifulSoup
from selenium import webdriver

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Variables de configuración

In [2]:
# Site root; the scraping functions append a site-relative game path to it.
BASE_URL = 'https://www.metacritic.com'
# Headers sent with the plain `requests` calls (a desktop-browser User-Agent string).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Chrome options object handed to webdriver.Chrome(options=options) by the review scraper.
options = webdriver.ChromeOptions()

# Command-line switches forwarded to the Chrome process started by Selenium.
options.add_argument('--headless')
options.add_argument("enable-automation")
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
options.add_argument("--dns-prefetch-disable")
options.add_argument("--disable-gpu")

Funciones importantes

In [3]:
def scrape_basic_game_information(url, timeout=30):
    """Download a game's details page and extract its basic metadata.

    Args:
        url: Site-relative path of the game. It is placed directly between
            BASE_URL and 'details/', so it is expected to start and end
            with '/'.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(game_name, game_description, game_developer, game_genre)``
        of whitespace-stripped strings.

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status (4xx/5xx).
        AttributeError: If one of the expected elements is not present in
            the downloaded page.
    """
    details_url = f'{BASE_URL}{url}details/'
    # Without an explicit timeout, requests can block forever on a stalled server.
    response = requests.get(details_url, headers=HEADERS, timeout=timeout)
    # Stop here on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    game_name = soup.find('a', class_="c-productSubpageHeader_back").text.strip()
    game_description = soup.find('div', class_="c-pageProductDetails_description").text.strip()
    # The developer name is the first list item inside the developer section.
    developer_section = soup.find('div', class_="c-gameDetails_Developer")
    game_developer = developer_section.find('li', class_="c-gameDetails_listItem").text.strip()
    # Only the first genre entry is kept.
    game_genre = soup.find('li', class_="c-genreList_item").text.strip()

    return game_name, game_description, game_developer, game_genre

In [4]:
def scrape_reviews(url, review_endpoint, score_type, review_type):
    """Load a review page in headless Chrome and extract score/text pairs.

    Args:
        url: Site-relative path of the game; placed directly between
            BASE_URL and ``review_endpoint``.
        review_endpoint: Path segment of the review page to open
            (a trailing '/' is appended).
        score_type: Dictionary key under which each review's score is stored.
        review_type: Dictionary key under which each review's text is stored.

    Returns:
        A list with one dict per review found on the page, of the form
        ``{score_type: <score text>, review_type: <review text>}``.

    Raises:
        selenium.common.exceptions.WebDriverException: If the browser cannot
            be started or the page does not load within the timeout.
        AttributeError: If a review element lacks the score or quote element.
    """
    reviews_url = f'{BASE_URL}{url}{review_endpoint}/'
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_page_load_timeout(120)
        driver.get(reviews_url)
        page_content = driver.page_source
    finally:
        # Always shut the browser down; otherwise a failed page load leaves
        # an orphaned Chrome process behind for every failing game.
        driver.quit()

    soup = BeautifulSoup(page_content, 'html.parser')

    reviews = []
    review_elements = soup.find_all('div', class_='c-siteReview_main')
    for review in review_elements:
        score = review.find('div', class_='c-siteReviewScore').text.strip()
        review_text = review.find('div', class_='c-siteReview_quote').text.strip()
        reviews.append({score_type: score, review_type: review_text})
    return reviews

In [5]:
def scrape_user_reviews(url):
    """Return the user reviews of the game at ``url``.

    Delegates to ``scrape_reviews`` with the 'user-reviews' endpoint; each
    returned dict uses the keys 'user_score' and 'user_review_text'.
    """
    user_settings = {
        'review_endpoint': 'user-reviews',
        'score_type': 'user_score',
        'review_type': 'user_review_text',
    }
    return scrape_reviews(url=url, **user_settings)

def scrape_critic_reviews(url):
    """Return the critic reviews of the game at ``url``.

    Delegates to ``scrape_reviews`` with the 'critic-reviews' endpoint; each
    returned dict uses the keys 'critic_score' and 'critic_review_text'.
    """
    critic_settings = {
        'review_endpoint': 'critic-reviews',
        'score_type': 'critic_score',
        'review_type': 'critic_review_text',
    }
    return scrape_reviews(url=url, **critic_settings)

In [6]:
def scrape_game_data(url):
    """Collect everything scraped for one game into a single dictionary.

    Fetches the basic information first, then the user reviews, then the
    critic reviews. The description has the "Description:\\n" label removed
    and is stripped of surrounding whitespace.

    Args:
        url: Site-relative path of the game; also stored under 'Link'.

    Returns:
        A dict with the keys 'Game_name', 'Game_description',
        'Game_developer', 'Game_genre', 'User_reviews', 'Critic_reviews'
        and 'Link'.
    """
    name, description, developer, genre = scrape_basic_game_information(url=url)
    reviews_by_users = scrape_user_reviews(url=url)
    reviews_by_critics = scrape_critic_reviews(url=url)

    record = {}
    record['Game_name'] = name
    record['Game_description'] = description.replace("Description:\n", "").strip()
    record['Game_developer'] = developer
    record['Game_genre'] = genre
    record['User_reviews'] = reviews_by_users
    record['Critic_reviews'] = reviews_by_critics
    record['Link'] = url
    return record

In [7]:
def load_previous_data(filename):
    """Load previously saved scraping results from a CSV file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The file's contents as a DataFrame, or an empty DataFrame when the
        file does not exist or exists but contains no columns to parse.
    """
    try:
        df = pd.read_csv(filename)
        print(f"Cargando los datos guardados del archivo {filename}")
        return df
    except FileNotFoundError:
        print("No se encontró un archivo CSV. Se creará uno nuevo.")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        # An existing but empty file (e.g. written from an empty DataFrame)
        # makes read_csv raise; treat it the same as having no saved data.
        print(f"El archivo {filename} está vacío. Se creará uno nuevo.")
        return pd.DataFrame()


In [8]:
def get_extended_data(scraped_games):
    """Flatten scraped games into one row per (user review, critic review) pair.

    For each game the i-th user review is paired with the i-th critic review;
    pairing stops at the shorter of the two lists, so surplus reviews on the
    longer side are dropped and a game with no reviews on either side yields
    no rows.

    Args:
        scraped_games: Iterable of game dicts with the keys 'Game_name',
            'Game_description', 'Game_developer', 'Game_genre',
            'User_reviews', 'Critic_reviews' and 'Link'.

    Returns:
        A list of flat dicts, each repeating the game's fields next to one
        user review and one critic review.
    """
    return [
        {
            'Game_name': game['Game_name'],
            'Game_description': game['Game_description'],
            'Game_developer': game['Game_developer'],
            'Game_genre': game['Game_genre'],
            'User_score': user_review['user_score'],
            'User_review_text': user_review['user_review_text'],
            'Critic_score': critic_review['critic_score'],
            'Critic_review_text': critic_review['critic_review_text'],
            'Link': game['Link'],
        }
        for game in scraped_games
        # zip stops at the shorter list, matching a min(len, len) bound.
        for user_review, critic_review in zip(game['User_reviews'], game['Critic_reviews'])
    ]

In [9]:
# Resume from whatever was saved by a previous run.
metacritic_df = load_previous_data('metacritic_data.csv')

scraped_links = []
if not metacritic_df.empty:
    scraped_links = metacritic_df['Link'].tolist()
# Set for constant-time "already scraped?" checks; updated as games are saved.
already_scraped = set(scraped_links)

links_df = pd.read_csv('game_links.csv')
# Shuffle a plain list rather than the Series itself: random.shuffle swaps
# items through indexing, which on a Series goes through pandas label lookups
# and writes back into links_df's column.
random_links = links_df['game_links'].tolist()
random.shuffle(random_links)

scraped_games = []

for link in random_links:
    if link in already_scraped:
        print(f"El juego con la URL {link} ya ha sido recorrido. Saltando al siguiente.")
        continue
    try:
        game_data = scrape_game_data(url=link)
        scraped_games.append(game_data)
        print(f"Datos del juego {game_data['Game_name']} extraídos correctamente.")
        # Expand ONLY the game just scraped. Expanding the whole accumulated
        # list here would re-append every earlier game's rows on each
        # iteration and fill the CSV with duplicates.
        extended_data = get_extended_data(scraped_games=[game_data])
        scraped_games_df = pd.DataFrame(extended_data)
        metacritic_df = pd.concat([metacritic_df, scraped_games_df], ignore_index=True)

        # Save after every game so an interrupted run loses at most one game.
        metacritic_df.to_csv('metacritic_data.csv', index=False)
        print("Los datos se han guardado correctamente en metacritic_data.csv")
        already_scraped.add(link)
    except Exception as error:
        # Catch Exception, not everything: a bare except would also swallow
        # KeyboardInterrupt and make the loop impossible to stop by hand.
        print(f"No se pudo obtener la información del juego con la URL {link}. Saltando al siguiente.")
        print(f"Detalle del error: {type(error).__name__}: {error}")
        continue


Cargando los datos guardados del archivo metacritic_data.csv


AttributeError: 'Series' object has no attribute 'toList'