In [1]:
import requests, csv, pandas as pd, pprint, time
from bs4 import BeautifulSoup
import lxml, html5lib
import re
from IPython.display import display, HTML

# Column-oriented accumulator for the scraped results: one independent list per
# field, all starting empty.
data_dict = {
    field: []
    for field in (
        "name",
        "date",
        "platform",
        "score",
        "url",
        "ratings",
        "userscore",
    )
}


def webpage(pageNum, system, timeout=30):
    """Fetch one Metacritic search-results page (SRP) of 2020 games.

    Args:
        pageNum: Value appended to the ``page=`` query parameter.
        system: Accepted for caller compatibility but not used; the URL built
            here does not depend on it.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The response object returned by ``requests.get``.
    """
    url = (
        "https://www.metacritic.com/browse/games/score/metascore/year/all/filtered?year_selected=2020&distribution=&sort=desc&view=condensed&page="
        + str(pageNum)
    )
    userAgent = {"User-agent": "Mozilla/5.0"}
    # Without a timeout a stalled connection would block the whole scrape
    # indefinitely.
    return requests.get(url, headers=userAgent, timeout=timeout)


def numberPages(response):
    """Return how many result pages a search-results page advertises.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        str: Text of the ``<a class="page_num">`` link inside the first
        ``<li class="page last_page">`` element, or ``"1"`` when either
        element is missing.
    """
    soup = BeautifulSoup(response.text, "html.parser")
    last_page_items = soup.find_all("li", {"class": "page last_page"})
    try:
        return last_page_items[0].find("a", {"class": "page_num"}).text
    except (IndexError, AttributeError):
        # IndexError: no "last page" item was found.
        # AttributeError: the item has no page_num link (find returned None).
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return "1"


def _fetch_ratings_count(game_path, timeout=30):
    """Return the ratings count shown on a game's user-reviews page, as a string.

    Requests ``https://www.metacritic.com<game_path>/user-reviews`` and looks
    for a ``<strong>`` element whose text contains "Ratings". Returns that text
    with the " Ratings" suffix removed, or ``"0"`` when no such element exists.
    """
    reviews_page = requests.get(
        "https://www.metacritic.com" + str(game_path) + "/user-reviews",
        headers={"User-agent": "Mozilla/5.0"},
        timeout=timeout,  # do not let one stalled game page hang the scrape
    )
    game_soup = BeautifulSoup(reviews_page.text, "html.parser")
    ratings_label = game_soup.find("strong", text=re.compile("Ratings"))
    if ratings_label:
        return ratings_label.get_text().strip().replace(" Ratings", "")
    return "0"


def scrapper(num_loops, content):
    """Append the games found in the first ``num_loops`` tables to ``data_dict``.

    Each table row is read once. The first cell supplies the metascore
    (``score``); the second cell supplies ``name``, ``date``, ``platform``,
    ``url`` and ``userscore``. For every title link one extra HTTP request is
    made to obtain the ``ratings`` count.

    Args:
        num_loops: Number of leading entries of ``content`` to process.
        content: Indexable collection of table elements exposing ``find_all``.

    Returns:
        None. Results are appended to the module-level ``data_dict`` lists.
    """
    for tblnum in range(num_loops):
        for tr in content[tblnum].find_all("tr"):
            td = tr.find_all("td")
            if len(td) < 2:
                # A row without a second cell has nothing to extract; skipping
                # it avoids an IndexError on td[1].
                continue
            score_cell, details_cell = td[0], td[1]

            # Game name
            for title_link in details_cell.find_all("a", {"class": "title"}):
                data_dict["name"].append(title_link.find("h3").text)

            # Release date: the span that carries no class attribute
            for date_span in details_cell.find_all("span", {"class": ""}):
                data_dict["date"].append(date_span.text)

            # Platform
            for platform_span in details_cell.find_all("span", {"class": "data"}):
                data_dict["platform"].append(platform_span.text.strip())

            # Metascore (first cell)
            for metascore in score_cell.find_all("div", {"class": "metascore_w"}):
                data_dict["score"].append(metascore.text.strip())

            # Game URL plus the ratings count from its user-reviews page
            for title_link in details_cell.find_all(
                "a", {"class": "title"}, href=True
            ):
                data_dict["url"].append(title_link["href"])
                data_dict["ratings"].append(
                    _fetch_ratings_count(title_link["href"])
                )

            # User score (second cell); stripped like the other fields so that
            # placeholder values such as "tbd" can be matched exactly later.
            for userscore in details_cell.find_all("div", {"class": "metascore_w"}):
                data_dict["userscore"].append(userscore.text.strip())


def pages(lastPageNum, system):
    """Download and scrape result pages ``0`` through ``lastPageNum - 1``.

    For each page the HTML is fetched through ``webpage`` (so the URL and
    headers are defined in one place), every ``<table>`` on it is handed to
    ``scrapper``, and the loop then pauses for 6 seconds before the next page.

    Args:
        lastPageNum: Number of pages to process; converted with ``int``.
        system: Forwarded unchanged to ``webpage``.

    Returns:
        None. ``scrapper`` stores the results.
    """
    for currentPage in range(int(lastPageNum)):
        response = webpage(currentPage, system)
        soup = BeautifulSoup(response.text, "html.parser")
        content = soup.find_all("table")

        scrapper(len(content), content)
        # Pause between result pages to avoid hammering the server.
        time.sleep(6)


def main():
    """Run the scrape for each configured system and save it to ``mc_2020.csv``.

    The only configured system is ``"all"``. For it, the first results page is
    fetched to learn the page count, every page is scraped, and the function
    waits 5 seconds. Afterwards ``data_dict`` is turned into a DataFrame and
    written to disk.
    """
    for target in ("all",):
        first_page = webpage(0, target)
        page_total = int(numberPages(first_page))
        pages(page_total, target)
        time.sleep(5)

    pd.DataFrame.from_dict(data_dict).to_csv("mc_2020.csv")


# Run the full scrape. This issues live HTTP requests and writes mc_2020.csv
# to the current working directory.
main()


In [2]:
# Build the per-platform table from the scraped lists.
video_games = pd.DataFrame.from_dict(data_dict)

# Keep only rows where none of the numeric fields holds the "tbd" placeholder,
# so the casts below cannot fail on it.
has_numbers = ~(
    video_games["userscore"].isin(["tbd"])
    | video_games["score"].isin(["tbd"])
    | video_games["ratings"].isin(["tbd"])
)
video_games = video_games[has_numbers].copy()

video_games["ratings"] = video_games["ratings"].astype(float)
video_games["score"] = video_games["score"].astype(float)
# Weight each row's user score by its number of ratings; after summing per
# game and dividing by the summed ratings this yields a weighted average.
video_games["userscore"] = video_games["userscore"].astype(float) * video_games["ratings"]


def _earliest_release(dates):
    """Return the chronologically earliest date string in ``dates``.

    The dates are strings such as "November 10, 2020"; taking their plain
    ``min`` would compare them alphabetically, not by calendar order. Values
    are parsed as "Month D, YYYY"; if none parse, the string minimum is
    returned instead.
    """
    parsed = pd.to_datetime(
        dates.str.strip(), format="%B %d, %Y", errors="coerce"
    ).dropna()
    if parsed.empty:
        return dates.min()
    return dates.loc[parsed.idxmin()]


# Identity mapping: the rename below currently leaves all column names as-is.
d = {
    "date": "date",
    "platform": "platform",
    "score": "score",
    "ratings": "ratings",
    "userscore": "userscore",
}
# Collapse the per-platform rows into one row per game.
video_games = (
    video_games.groupby("name")
    .agg(
        {
            "date": _earliest_release,
            "platform": list,
            "score": "mean",
            "ratings": "sum",
            "userscore": "sum",
        }
    )
    .rename(columns=d)
)

video_games["userscore"] = (video_games["userscore"] / video_games["ratings"]).round(1)
video_games["ratings"] = video_games["ratings"].astype(int)
# Note: astype(int) truncates the mean metascore rather than rounding it.
video_games["score"] = video_games["score"].astype(int)

# Keep games whose combined score (user score rescaled by 10, averaged with
# the metascore) is at least 75 and that have at least 150 ratings.
video_games = video_games[
    ((video_games["userscore"] * 10) + video_games["score"]) / 2 >= 75
]
video_games = video_games[video_games["ratings"] >= 150]

display(HTML(video_games.sort_values(["name"]).to_html()))


Unnamed: 0_level_0,date,platform,score,ratings,userscore
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13 Sentinels: Aegis Rim,"September 22, 2020",[PlayStation 4],85,216,9.0
Assassin's Creed Valhalla,"November 10, 2020","[PlayStation 5, Xbox Series X, PC, Xbox One, PlayStation 4]",82,6739,7.5
Astro's Playroom,"November 12, 2020",[PlayStation 5],83,796,9.3
Bayonetta & Vanquish,"February 18, 2020","[PlayStation 4, Xbox One]",84,361,8.0
Bugsnax,"November 12, 2020","[PlayStation 4, PlayStation 5, PC]",75,435,7.6
Call of the Sea,"December 8, 2020","[Xbox One, PC, Xbox Series X]",78,242,7.4
Carrion,"July 23, 2020","[Switch, Xbox One, PC]",75,289,7.9
Command & Conquer Remastered Collection,"June 5, 2020",[PC],82,267,8.7
Crash Bandicoot 4: It's About Time,"October 2, 2020","[PlayStation 4, Xbox One]",84,731,8.1
Crusader Kings III,"September 1, 2020",[PC],91,453,8.4
