In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import pickle
import time
import csv

In [257]:
game_response = requests.get("https://www.backloggd.com/games/virtual-pro-wrestling-64/")
game_soup = BeautifulSoup(game_response.content, 'html.parser')

In [258]:
# Scraping the title
title = game_soup.find('h1', class_='mb-0').string
title

'Virtual Pro Wrestling 64'

In [259]:
# Scraping the date, and making it DateTime
date_step = game_soup.find('div', class_='col-auto mt-auto pr-0')
date_step2 = date_step.find('a', href=True).string
if date_step2 == 'TBD':
    date_as_datetime = datetime.datetime(1, 1, 1).strftime("%Y-%m-%d")
else:
    date_as_datetime = datetime.datetime.strptime(date_step2, '%b %d, %Y').strftime('%Y-%m-%d')
date_as_datetime

'1997-12-19'

In [260]:
# Getting the plays, playing, backlogs and wishlist information
counter = game_soup.find('div', id='log-counters').find_all('a', class_='plays-counter')
plays = counter[0].find('p', class_='mb-0').string
playing = counter[1].find('p', class_='mb-0').string
backlogs = counter[2].find('p', class_='mb-0').string
wishlist = counter[3].find('p', class_='mb-0').string

In [261]:
plays

'8'

In [262]:
playing

'0'

In [263]:
backlogs

'4'

In [264]:
wishlist

'2'

In [265]:
# Get a list of publishers
publisher_list = []
try:
    publishers = game_soup.find('div', class_='col-auto pl-lg-1 sub-title').find_all('a', href=True)
    for i in range(len(publishers)):
        publisher_list.append(publishers[i].string)
except:
    publisher_list = []

publisher_list

['AKI Corporation', 'Asmik Ace Entertainment']

In [266]:
# Get Average review score
review_score = game_soup.find('h1', class_='text-center').string
if review_score == 'N/A':
    review_score = 0
else:
    review_score = float(review_score)

review_score

2.9

In [267]:
# Get genres
genres = game_soup.find_all('p', class_='genre-tag')
genre_list = []
for i in range(len(genres)):
    genre_list.append(genres[i].string)

genre_list

['Fighting', 'Sport']

In [268]:
# Get platforms
platforms = game_soup.find_all('a', class_='game-page-platform')
platform_list = []
for i in range(len(platforms)):
    platform_list.append(platforms[i].get_text(strip=True))

platform_list

['Nintendo 64']

In [269]:
# Get description
description = game_soup.find('div', id='collapseSummary').get_text(strip=True)

description

'Over 78 wrestlers ready to rumble! The no.1 wrestling promotion in the world, WCW, takes on the world and combines talents from around the world to settle their scores, and take on the nWo led by Hollywood Hogan.Hundreds of moves, 4 player action, this is wrestling!'

In [270]:
# Get number of reviews - number of lists associated is here as well
lists_reviews = game_soup.find_all('p', class_='game-page-sidecard')

total_lists = lists_reviews[0].get_text(strip=True).strip(" Lists")
total_reviews = lists_reviews[1].get_text(strip=True).strip(" Reviews")

total_reviews

'1'

In [271]:
# Get game category + main (If applicable)
# If the search for category fails, then the game is the main game
try:
    main_game = game_soup.find('p', class_='mb-2 game-parent-category').find('a').get_text()
    full_sentence = game_soup.find('p', class_='mb-2 game-parent-category').get_text()
    category = full_sentence.replace(main_game, '').strip()
except:
    main_game = title
    category = 'main'

category

'main'

In [272]:
main_game

'Virtual Pro Wrestling 64'

In [273]:
# Get ratings, ten categories from 0.5 to 5.0

ratings = game_soup.find_all('div', class_="col px-0 top-tooltip")

ratings_zero_five = int(ratings[0]['data-tippy-content'].split(' |')[0])
ratings_one_zero = int(ratings[1]['data-tippy-content'].split(' |')[0])
ratings_one_five = int(ratings[2]['data-tippy-content'].split(' |')[0])
ratings_two_zero = int(ratings[3]['data-tippy-content'].split(' |')[0])
ratings_two_five = int(ratings[4]['data-tippy-content'].split(' |')[0])
ratings_three_zero = int(ratings[5]['data-tippy-content'].split(' |')[0])
ratings_three_five = int(ratings[6]['data-tippy-content'].split(' |')[0])
ratings_four_zero = int(ratings[7]['data-tippy-content'].split(' |')[0])
ratings_four_five = int(ratings[8]['data-tippy-content'].split(' |')[0])
ratings_five_zero = int(ratings[9]['data-tippy-content'].split(' |')[0])

In [274]:
#Checking ratings variables #NOT FOR SCRIPT DO NOT KEEP

ratings_dict = {}
ratings_dict['0.5 stars'] = ratings_zero_five
ratings_dict['1.0 stars'] = ratings_one_zero 
ratings_dict['1.5 stars'] = ratings_one_five 
ratings_dict['2.0 stars'] = ratings_two_zero
ratings_dict['2.5 stars'] = ratings_two_five 
ratings_dict['3.0 stars'] = ratings_three_zero 
ratings_dict['3.5 stars'] = ratings_three_five 
ratings_dict['4.0 stars'] = ratings_four_zero 
ratings_dict['4.5 stars'] = ratings_four_five 
ratings_dict['5.0 stars'] = ratings_five_zero 

ratings_dict

{'0.5 stars': 0,
 '1.0 stars': 0,
 '1.5 stars': 0,
 '2.0 stars': 0,
 '2.5 stars': 1,
 '3.0 stars': 4,
 '3.5 stars': 0,
 '4.0 stars': 0,
 '4.5 stars': 0,
 '5.0 stars': 0}

In [275]:
image_url = 'https://images.igdb.com/igdb/image/upload/t_cover_big/'

game_image_url = ''
for n in game_soup.find_all('img'):
    if(n.get('src').startswith(image_url)):
        game_image_url = (n.get('src'))

game_image_url

'https://images.igdb.com/igdb/image/upload/t_cover_big/co6wr1.jpg'

In [279]:
#Beta image scraper tester for game with no image

image_url = 'https://images.igdb.com/igdb/image/upload/t_cover_big/'

game_response_beta = requests.get("https://www.backloggd.com/games/clifford-the-big-red-dog-reading/")
game_soup_beta = BeautifulSoup(game_response_beta.content, 'html.parser')

game_image_url_beta = ''
for n in game_soup_beta.find_all('img'):
    if(n.get('src').startswith(image_url)):
        game_image_url_beta = (n.get('src'))

game_image_url_beta

''

In [277]:
   
###############DO NOT KEEP##########
game_data = []
############DO NOT KEEP########

game_data.append({'title': title,
    'release_date': date_as_datetime,
    'plays':plays,
    'playing':playing,
    'backlogs':backlogs,
    'wishlist':wishlist,
    'developers':publisher_list,
    'avg_review':review_score,
    'genres':genre_list,
    'platforms':platform_list,
    'description':description,
    'total_reviews':total_reviews,
    'total_lists':total_lists,
    'category':category,
    'main':main_game,
    'ratings_zero_five':ratings_zero_five,
    'ratings_one_zero':ratings_one_zero,
    'ratings_one_five':ratings_one_five,
    'ratings_two_zero':ratings_two_zero,
    'ratings_two_five':ratings_two_five,
    'ratings_three_zero':ratings_three_zero,
    'ratings_three_five':ratings_three_five,
    'ratings_four_zero':ratings_four_zero,
    'ratings_four_five':ratings_four_five,
    'ratings_five_zero':ratings_five_zero,
    'image':game_image_url
                 })

In [278]:
game_df = pd.DataFrame(game_data)
game_df

Unnamed: 0,title,release_date,plays,playing,backlogs,wishlist,developers,avg_review,genres,platforms,...,ratings_one_zero,ratings_one_five,ratings_two_zero,ratings_two_five,ratings_three_zero,ratings_three_five,ratings_four_zero,ratings_four_five,ratings_five_zero,image
0,Virtual Pro Wrestling 64,1997-12-19,8,0,4,2,"[AKI Corporation, Asmik Ace Entertainment]",2.9,"[Fighting, Sport]",[Nintendo 64],...,0,0,0,1,4,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...


In [74]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import pickle
import time
import csv

In [75]:
# Loading my link data
with open('../raw_data/all_links_v1', 'rb') as links:
    game_links = pickle.load(links)
frame = pd.DataFrame(game_links)

In [227]:
# Prepeing final list and setting count for montoring progress
# Only the first 116928 are relevant, and we will scrape in batches of 25000
missed_data = []
game_data = []
count = 0
game_df = []

for game in frame[0][115:140]:
    # Monitoring progress
    if count % 250 == True:
        print(count)
        print(game)
    else:
        pass
    count += 1

    try:
        # Setting up the html parser + beautful soup
        game_response = requests.get(f"https://www.backloggd.com{game}")
        game_soup = BeautifulSoup(game_response.content, 'html.parser')

        # Scraping the title
        title = game_soup.find('h1', class_='mb-0').string

        # Scraping the date, and making it DateTime
        date_step = game_soup.find('div', class_='col-auto mt-auto pr-0')
        date_step2 = date_step.find('a', href=True).string
        if date_step2 == 'TBD':
            date_as_datetime = datetime.datetime(1, 1, 1).strftime("%Y-%m-%d")
        else:
            date_as_datetime = datetime.datetime.strptime(date_step2, '%b %d, %Y').strftime('%Y-%m-%d')

        # Getting the plays, playing, backlogs and wishlist information
        counter = game_soup.find('div', id='log-counters').find_all('a', class_='plays-counter')
        plays = counter[0].find('p', class_='mb-0').string
        playing = counter[1].find('p', class_='mb-0').string
        backlogs = counter[2].find('p', class_='mb-0').string
        wishlist = counter[3].find('p', class_='mb-0').string

        # Get a list of publishers
        publisher_list = []
        try:
            publishers = game_soup.find('div', class_='col-auto pl-lg-1 sub-title').find_all('a', href=True)
            for i in range(len(publishers)):
                publisher_list.append(publishers[i].string)
        except:
            publisher_list = []

        # Get Average review score
        review_score = game_soup.find('h1', class_='text-center').string
        if review_score == 'N/A':
            review_score = 0
        else:
            review_score = float(review_score)

        # Get genres
        genres = game_soup.find_all('p', class_='genre-tag')
        genre_list = []
        for i in range(len(genres)):
            genre_list.append(genres[i].string)

        # Get platforms
        platforms = game_soup.find_all('a', class_='game-page-platform')
        platform_list = []
        for i in range(len(platforms)):
            platform_list.append(platforms[i].get_text(strip=True))

        # Get description
        description = game_soup.find('div', id='collapseSummary').get_text(strip=True)

        # Get number of reviews - number of lists associated is here as well
        lists_reviews = game_soup.find_all('p', class_='game-page-sidecard')

        total_lists = lists_reviews[0].get_text(strip=True).strip(" Lists")
        total_reviews = lists_reviews[1].get_text(strip=True).strip(" Reviews")

        # Get game category + main (If applicable)
        # If the search for category fails, then the game is the main game
        try:
            main_game = game_soup.find('p', class_='mb-2 game-parent-category').find('a').get_text()
            full_sentence = game_soup.find('p', class_='mb-2 game-parent-category').get_text()
            category = full_sentence.replace(main_game, '').strip()
        except:
            main_game = title
            category = 'main'

        # Get ratings, ten categories from 0.5 to 5.0

        ratings = game_soup.find_all('div', class_="col px-0 top-tooltip")

        ratings_zero_five = int(ratings[0]['data-tippy-content'].split(' |')[0])
        ratings_one_zero = int(ratings[1]['data-tippy-content'].split(' |')[0])
        ratings_one_five = int(ratings[2]['data-tippy-content'].split(' |')[0])
        ratings_two_zero = int(ratings[3]['data-tippy-content'].split(' |')[0])
        ratings_two_five = int(ratings[4]['data-tippy-content'].split(' |')[0])
        ratings_three_zero = int(ratings[5]['data-tippy-content'].split(' |')[0])
        ratings_three_five = int(ratings[6]['data-tippy-content'].split(' |')[0])
        ratings_four_zero = int(ratings[7]['data-tippy-content'].split(' |')[0])
        ratings_four_five = int(ratings[8]['data-tippy-content'].split(' |')[0])
        ratings_five_zero = int(ratings[9]['data-tippy-content'].split(' |')[0])

        # Get image url
        
        image_url = 'https://images.igdb.com/igdb/image/upload/t_cover_big/'
        game_image_url = ''
        for n in game_soup.find_all('img'):
            if(n.get('src').startswith(image_url)):
                game_image_url = (n.get('src'))

       
        game_data.append({'title': title,
                        'release_date': date_as_datetime,
                        'plays':plays,
                        'playing':playing,
                        'backlogs':backlogs,
                        'wishlist':wishlist,
                        'developers':publisher_list,
                        'avg_review':review_score,
                        'genres':genre_list,
                        'platforms':platform_list,
                        'description':description,
                        'total_reviews':total_reviews,
                        'total_lists':total_lists,
                        'category':category,
                        'main':main_game,
                        'ratings_zero_five':ratings_zero_five,
                        'ratings_one_zero':ratings_one_zero,
                        'ratings_one_five':ratings_one_five,
                        'ratings_two_zero':ratings_two_zero,
                        'ratings_two_five':ratings_two_five,
                        'ratings_three_zero':ratings_three_zero,
                        'ratings_three_five':ratings_three_five,
                        'ratings_four_zero':ratings_four_zero,
                        'ratings_four_five':ratings_four_five,
                        'ratings_five_zero':ratings_five_zero,
                        'image':game_image_url})
    except:
        print(f'Failed at {game}, count = {count}')
        missed_data.append(game)
        time.sleep(70)

game_df = pd.DataFrame(game_data)
#game_df.to_csv('all_data_batch5', index=False)

#with open ('missed_data_v5', 'w') as csvfile:
#    writer = csv.writer(csvfile, delimiter=',')
#    writer.writerow(missed_data)

1
/games/beastieball/


In [228]:
game_df

Unnamed: 0,title,release_date,plays,playing,backlogs,wishlist,developers,avg_review,genres,platforms,...,ratings_one_zero,ratings_one_five,ratings_two_zero,ratings_two_five,ratings_three_zero,ratings_three_five,ratings_four_zero,ratings_four_five,ratings_five_zero,image
0,Nivalis,2024-12-31,0,0,44,201,"[ION LANDS, 505 Games]",0,"[Adventure, RPG, Simulator]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
1,Beastieball,2024-12-31,7,0,16,114,"[Wishes Ultd., Klei Publishing]",0,"[Adventure, Indie, RPG, Sport, Tactical]","[Windows PC, Mac]",...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
2,#Blud,2024-12-31,0,0,9,53,"[Humble Games, Exit 73 Studios]",0,"[Adventure, Brawler, Indie]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
3,Dread Pilots,2024-12-31,0,0,4,15,[Klei Entertainment],0,"[Adventure, RPG]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
4,They Came From Dimension X,2024-12-31,1,0,1,5,[],0,"[Adventure, Indie]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
5,Reka,2024-12-31,0,0,10,46,"[Fireshine Games, Emberstorm Entertainment]",0,"[Adventure, Indie, RPG]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
6,Another Crab's Treasure,2024-12-31,2,0,32,214,[Aggro Crab Games],0,"[Adventure, Indie, Platform, RPG]","[Windows PC, Nintendo Switch, Xbox Series]",...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
7,Bo: Path of the Teal Lotus,2024-12-31,0,0,15,93,[Squid Shock Studios],0,"[Adventure, Platform]","[Windows PC, Nintendo Switch]",...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
8,Borneo: A Jungle Nightmare,2024-12-31,0,0,0,2,[Fantastico Studio],0,"[Point-and-Click, Visual Novel]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...
9,Tsurugihime,2024-12-31,0,0,1,15,[Fahrenheit 213],0,"[Brawler, Indie, RPG]",[Windows PC],...,0,0,0,0,0,0,0,0,0,https://images.igdb.com/igdb/image/upload/t_co...


In [229]:
game_df['image'][0]

'https://images.igdb.com/igdb/image/upload/t_cover_big/co4td5.jpg'