In [1]:
import requests
import json
from bs4 import BeautifulSoup as bs
from api.iterating_api import generator_over_review
from api.parsers_api import headers, HTTP_error, parse_GameSpot
from tqdm import tqdm

In [2]:
# Archive entry point and two sample listing pages. None of the three url_*
# names is referenced by the other visible cells; they look like scratch values.
url_1 = "https://www.eurogamer.net/archive/reviews"
url_2 = "https://www.eurogamer.net/archive/reviews?page=277"
url_3 = "https://www.eurogamer.net/archive/reviews?page=3"

# Kicker texts accepted by the crawl loop: an archive entry is collected only
# when its kicker text is exactly one of these strings.
right_kickers_set = {
    "Review |",
    "Recommended |",
    "Essential |",
    "Avoid |",
}

In [3]:
# Listing pages of the review archive: the bare URL is the first page,
# the remaining pages (2..309) are addressed through the ?page=N query.
first_list = ["https://www.eurogamer.net/archive/reviews"] + [
    "https://www.eurogamer.net/archive/reviews?page=" + str(page_no)
    for page_no in range(2, 310)
]

len(first_list)

309

In [4]:
# Accumulator for review-page URLs; the archive crawl loop appends to it.
# Re-running this cell discards everything collected so far.
game_urls = []

In [5]:
# Crawl the archive listing pages and collect the links of review articles
# into game_urls.
# NOTE(review): listing pages before `current_position` are skipped, so with
# the value 10 the first ten pages are never visited on a fresh run -
# presumably left over from resuming an interrupted crawl; confirm.
current_position = 10
for url in tqdm(first_list[current_position:]):
    r = requests.get(url, headers=headers, timeout=10)

    # HTTP check (e.g. for a 404 response status)
    if not r.ok:
        raise HTTP_error(r.status_code, r.url, "Big error")

    # HTML code scrubbing start
    soup = bs(r.text, "html.parser")
    body = soup.find(class_="summary_list")

    summary_list = body.find_all("div", class_="summary")

    for s in summary_list:
        kicker = s.find(class_="kicker")
        # An entry without a kicker cannot match any accepted label, so it is
        # filtered out instead of aborting the whole crawl with AttributeError.
        if kicker is None or kicker.text not in right_kickers_set:
            continue

        link = s.find(class_="link_overlay")
        href = link.get("href") if link is not None else None
        # Accepted entries without a usable link have nothing to collect.
        if href:
            game_urls.append(href)


100%|██████████| 299/299 [03:01<00:00,  1.64it/s]


In [6]:
# Number of distinct collected URLs (duplicates collapse in the set).
len(set(game_urls))

7202

In [7]:
def parse_EuroGamer(URL: str,
                   save_path: str = "",
                   json_save: bool = False
                   ):
    """
    Download one Eurogamer review page and extract the review from it.

    Works only with that site (https://www.eurogamer.net): the CSS class
    names looked up below are specific to its article markup.

    :param URL: str
        page url
    :param save_path: str
        Path of the JSON file to write; only used when ``json_save`` is True
    :param json_save: bool
        True  - dump the result to ``save_path`` as JSON and return None
        False - return the result as a dict
    :return: None or dict
        if dict - keys: ``ref`` (the requested URL), ``date`` (text of the
        published/updated date, or None when the page shows neither),
        ``game_name`` (text of the last breadcrumb entry), ``text`` (all
        paragraph texts of the article body, concatenated)
    :raises requests.HTTPError: the server answered with a 4xx/5xx status
    :raises AttributeError: the page lacks the expected article body or
        breadcrumb markup
    """

    # get HTTP page by GET request
    r = requests.get(URL, headers=headers, timeout=10)

    # Fail fast on 4xx/5xx so an error page is never scraped as a review
    r.raise_for_status()

    # HTML code scrubbing start
    soup = bs(r.text, "html.parser")
    body = soup.find(class_="article_body_content")

    # Paragraph texts are concatenated as they are, without a separator
    review_text = "".join(p.text for p in body.find_all("p"))

    # Prefer the publication date and fall back to the update date. The date
    # stays None when the header carries neither (previously the name was
    # left unbound and building the result dict raised UnboundLocalError).
    game_date = None
    header = soup.find(class_="article_header")
    if header is not None:
        for date_class in ("published_at", "updated_at"):
            date_block = header.find(class_=date_class)
            if date_block is not None:
                game_date = date_block.time.text
                break

    # The last breadcrumb entry is used as the game name
    game_name = soup.find(class_="nav_breadcrumbs").div.ul.find_all("li")[-1].text.strip()

    # HTML code scrubbing end

    # Dictionary with data forming
    data = {
        'ref': URL,
        'date': game_date,
        'game_name': game_name,
        'text': review_text}

    # Save dictionary as json or return
    if json_save:
        with open(save_path, 'w') as f:
            json.dump(data, f)
        return None
    return data

# NOTE(review): test_url is not used by the call below - presumably a
# leftover from manual testing.
test_url ='https://www.eurogamer.net/thief2'
# Smoke test: parse one collected URL and display the dict instead of saving it.
parse_EuroGamer(game_urls[80],json_save = False, save_path="" )

{'ref': 'https://www.eurogamer.net/tetris-effect-review-the-eternal-puzzler-reimagined-on-a-truly-cosmic-scale-3',
 'date': '19 Mar 2021',
 'game_name': 'Tetris Effect',
 'text': 'Timeless, immediately compelling and utterly without mercy, Tetris has always been a game about what isn\'t there - or rather a game about what isn\'t there yet. It\'s a game about the puzzle pieces you don\'t currently have, and all the stupid stuff you get up to before they arrive. Tetris - the way I play it anyway, forever awaiting that long block - is the story of how you got so hopelessly drunk to fight off pre-party nerves that, once the actual party had started, you had to go home early - and on the way home you fell down an open manhole and broke your ankle. \r\nTo put it another way, Tetris, like Hokusai\'s wave and the FedEx logo, is sort of a secret primer in the power of negative space: over the last 30 years of playing Tetris I have come to recognise the shapes I need to build, and understand tha

In [10]:
# Total number of collected URLs, duplicates included.
len(game_urls)

7202

In [8]:
# URLs whose download/parse/save failed; the save loop appends to it.
# Re-running this cell discards the failures recorded so far.
wrong_list = []

In [9]:
# Index in game_urls at which the save loop starts; it is also added to the
# loop counter when numbering the output files.
actual_position = 0

In [11]:
import time
basic_root = "/Users/fedor/Desktop/meta_project/data/EuroGamer/EuroGamer_final/"
# Parse every collected review and save it as "<index>.json" under basic_root.
# The index is the position of the URL in game_urls, so files written by a
# run resumed from actual_position keep the same numbering.
for counter, url in enumerate(tqdm(game_urls[actual_position:])):
    save_path = basic_root + str(counter + actual_position) + ".json"
    try:
        parse_EuroGamer(url, json_save=True, save_path=save_path)
    except Exception:
        # The original second handler was `except ...:`; Ellipsis is not an
        # exception class, so any error other than ConnectionError raised
        # TypeError and aborted the run. Every failure (network, HTTP,
        # parsing, file writing) is now recorded for a later retry, followed
        # by a short pause before the next request. KeyboardInterrupt is not
        # an Exception subclass and still stops the loop.
        wrong_list.append(url)
        time.sleep(3)

100%|██████████| 7202/7202 [1:13:14<00:00,  1.64it/s]


In [85]:
# Spot-check: re-run the parser on one URL recorded in wrong_list and display
# the dict instead of saving it. Needs wrong_list to hold more than 1000 entries.
parse_EuroGamer(wrong_list[1000],json_save = False, save_path="" )

{'ref': 'https://www.eurogamer.net/super-mario-bros-movie-review-an-entertaining-advert-for-everything-nintendo',
 'date': '5 Apr 2023',
 'game_name': 'Reviews',
 'text': '\r\nIt\'s taken 30 years, but Nintendo finally has a Super Mario Bros. movie to please the masses. As a life-long Nintendo fan, I still can\'t quite believe it exists - even seeing photos of Shigeru Miyamoto rubbing shoulders with Chris Pratt on the Hollywood red carpet this week felt like a bizarre collision of galaxies - and yet in this age of Sonic the Hedgehog film sequels and Mushroom Kingdom theme parks, it\'s simultaneously odd it has taken this long. Here we are then, with 90 minutes of bright and breezy fare bringing Mario and the gang to the big screen in an adventure which is all-action and wafer-thin on plot - just like most Mario games! - albeit with a few tantalising hints at character development buried between the constant cameos and continual laughs.\r\n\r\nAside from those cameos, it\'s almost impos