In [1]:
import requests, csv, pandas as pd, pprint, time
from bs4 import BeautifulSoup
import lxml, html5lib
import re
from IPython.display import display, HTML

# Column-oriented accumulator shared by the scraping functions: one list per
# output column, all starting empty.
data_dict = {
    column: []
    for column in (
        "name",
        "date",
        "platform",
        "score",
        "url",
        "ratings",
        "reviews",
        "userscore",
    )
}


def webpage(pageNum, system):
    """Fetch one page of the Metacritic 90-day metascore browse listing.

    Args:
        pageNum: Page index written into the ``page`` query parameter.
        system: Platform segment inserted into the URL path.

    Returns:
        The response object returned by ``requests.get``.
    """
    listing_url = "".join(
        [
            "https://www.metacritic.com/browse/games/score/metascore/90day/",
            str(system),
            "/filtered?view=condensed&page=",
            str(pageNum),
        ]
    )
    # The request is sent with a browser-like User-agent header.
    return requests.get(listing_url, headers={"User-agent": "Mozilla/5.0"})


def numberPages(response):
    """Return how many result pages a search-results page advertises.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        The text of the ``a.page_num`` link inside the first
        ``li`` with class ``"page last_page"``, or ``"1"`` when that
        element (or its link) is not present.
    """
    soup = BeautifulSoup(response.text, "html.parser")
    last_page_items = soup.find_all("li", {"class": "page last_page"})
    try:
        return last_page_items[0].find("a", {"class": "page_num"}).text
    except (IndexError, AttributeError):
        # IndexError: no pagination element at all.
        # AttributeError: the element exists but has no page-number link.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return "1"


def _review_count(game_path, page, label):
    """Fetch a game's sub-page and return the count printed before ``label``.

    Args:
        game_path: The game's ``href`` as found on the listing page.
        page: Sub-page name appended to the game URL (e.g. "user-reviews").
        label: Text searched for in a ``<strong>`` element (e.g. "Ratings").

    Returns:
        The stripped text of the matching ``<strong>`` with ``" " + label``
        removed, or ``"0"`` when no matching element is found.
    """
    markup = requests.get(
        "https://www.metacritic.com" + str(game_path) + "/" + page,
        headers={"User-agent": "Mozilla/5.0"},
    ).text
    heading = BeautifulSoup(markup, "html.parser").find(
        "strong", text=re.compile(label)
    )
    if heading is None:
        return "0"
    return heading.get_text().strip().replace(" " + label, "")


def scrapper(num_loops, content):
    """Append the games found in the listing tables to ``data_dict``.

    Every row of every table is visited once. The second cell of a row
    supplies name, date, platform, url and userscore; the first cell supplies
    the metascore. For each game link, two extra pages are requested to read
    the number of user ratings and critic reviews.

    Args:
        num_loops: Number of tables from ``content`` to process.
        content: Sequence of parsed ``<table>`` elements.

    Returns:
        None. Results are appended to the module-level ``data_dict`` lists.
    """
    for tblnum in range(num_loops):
        for row in content[tblnum].find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                # Rows without a details cell hold no game data; skipping
                # them avoids an IndexError on cells[1].
                continue
            score_cell, details = cells[0], cells[1]

            # Game name
            for title in details.find_all("a", {"class": "title"}):
                data_dict["name"].append(title.find("h3").text)

            # Release date
            for released in details.find_all("span", {"class": ""}):
                data_dict["date"].append(released.text)

            # Platform
            for platform in details.find_all("span", {"class": "data"}):
                data_dict["platform"].append(platform.text.strip())

            # Metascore (first cell)
            for metascore in score_cell.find_all("div", {"class": "metascore_w"}):
                data_dict["score"].append(metascore.text.strip())

            # Game url plus the counts read from its review sub-pages
            for link in details.find_all("a", {"class": "title"}, href=True):
                data_dict["url"].append(link["href"])
                data_dict["ratings"].append(
                    _review_count(link["href"], "user-reviews", "Ratings")
                )
                data_dict["reviews"].append(
                    _review_count(link["href"], "critic-reviews", "Critic Reviews")
                )

            # Userscore (second cell; kept unstripped as before)
            for userscore in details.find_all("div", {"class": "metascore_w"}):
                data_dict["userscore"].append(userscore.text)


def pages(lastPageNum, system):
    """Download every listing page for ``system`` and scrape its tables.

    Pages ``0`` through ``int(lastPageNum) - 1`` are fetched through
    ``webpage`` so the URL and headers are defined in one place only. All
    ``<table>`` elements of each page are handed to ``scrapper``.

    Args:
        lastPageNum: Number of pages to visit (anything ``int()`` accepts).
        system: Platform segment forwarded to ``webpage``.

    Returns:
        None. ``scrapper`` stores the results in ``data_dict``.
    """
    for currentPage in range(int(lastPageNum)):
        response = webpage(currentPage, system)
        content = BeautifulSoup(response.text, "html.parser").find_all("table")
        scrapper(len(content), content)
        # Pause between listing pages to avoid hammering the site.
        time.sleep(6)


def main():
    """Scrape every configured system and write the results to a CSV file.

    For each system the first listing page is fetched to learn the page
    count, all pages are scraped, and the run pauses for five seconds.
    Afterwards ``data_dict`` is turned into a DataFrame and saved as
    ``mc_90_days.csv``.
    """
    # Alternative per-platform run: ["switch", "ps4", "ps5"]
    target_systems = ["all"]
    for target in target_systems:
        first_page = webpage(0, target)
        page_total = int(numberPages(first_page))
        pages(page_total, target)
        time.sleep(5)
    pd.DataFrame.from_dict(data_dict).to_csv("mc_90_days.csv")


# Run the full scrape when this cell is executed.
main()

In [2]:
# Shortlist thresholds applied at the end of this cell.
MIN_COMBINED_SCORE = 75  # mean of metascore and (userscore * 10)
MIN_RATINGS = 100  # minimum number of user ratings summed over platforms

games_raw = pd.DataFrame.from_dict(data_dict)

# Keep only rows where none of the numeric fields is the placeholder "tbd".
# astype() returns a new frame, so the later column writes never touch a
# filtered slice of games_raw (no SettingWithCopyWarning).
numeric_columns = ["userscore", "score", "ratings", "reviews"]
is_scored = ~games_raw[numeric_columns].isin(["tbd"]).any(axis=1)
games_scored = games_raw.loc[is_scored].astype(dict.fromkeys(numeric_columns, float))

# Weight each score by its number of votes so that summing over platforms
# and dividing by the summed votes yields a weighted average per game.
# NOTE(review): "date" is still a string here, so "min" picks the
# alphabetically smallest value, not necessarily the earliest release.
video_games = (
    games_scored.assign(
        score=lambda df: df["score"] * df["reviews"],
        userscore=lambda df: df["userscore"] * df["ratings"],
    )
    .groupby("name")
    .agg(
        {
            "date": "min",
            "platform": list,
            "score": "sum",
            "ratings": "sum",
            "reviews": "sum",
            "userscore": "sum",
        }
    )
)

# Turn the weighted sums back into averages, then cast the counts to int.
# The divisions run before the casts; a 0/0 metascore becomes 0.
video_games = video_games.assign(
    userscore=lambda df: (df["userscore"] / df["ratings"]).round(1),
    score=lambda df: (df["score"] / df["reviews"]).round(0).fillna(0).astype(int),
    ratings=lambda df: df["ratings"].astype(int),
    reviews=lambda df: df["reviews"].astype(int),
)

# Userscore is scaled by 10 so it is comparable with the metascore.
combined_score = (video_games["userscore"] * 10 + video_games["score"]) / 2
video_games = video_games[
    (combined_score >= MIN_COMBINED_SCORE) & (video_games["ratings"] >= MIN_RATINGS)
]

display(HTML(video_games.sort_values(["ratings"], ascending=False).to_html()))

Unnamed: 0_level_0,date,platform,score,ratings,reviews,userscore
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Returnal,"April 30, 2021",[PlayStation 5],86,2825,111,7.3
Resident Evil Village,"May 7, 2021","[PlayStation 5, PC, Xbox Series X, PlayStation 4]",84,2543,146,8.5
It Takes Two,"March 26, 2021","[Xbox Series X, PlayStation 4, PlayStation 5, PC]",88,1054,113,9.1
Ratchet & Clank: Rift Apart,"June 11, 2021",[PlayStation 5],89,713,111,9.2
Monster Hunter Rise,"March 26, 2021",[Switch],88,586,120,9.1
NieR Replicant ver.1.22474487139...,"April 23, 2021","[Xbox One, PlayStation 4, PC]",83,485,110,8.3
Days Gone,"May 18, 2021",[PC],76,467,45,8.6
The House in Fata Morgana - Dreams of the Revenants Edition -,"April 9, 2021",[Switch],98,394,8,8.3
Disco Elysium: The Final Cut,"March 30, 2021","[PC, PlayStation 5, PlayStation 4]",90,343,48,8.0
Final Fantasy VII Remake Intergrade,"June 10, 2021",[PlayStation 5],87,113,20,8.6
