In [6]:
"""Simple scraper that goes to VGMusic URL
and downloads all songs of the requested category.

This should be converted to a class for reusability and integration
with tkinter later.

Author: Jared Frazier
"""

import bs4
import requests
import time
import random
import os

In [7]:
# Scrape list of midis from website
def list_of_vg_midis(name_of_vg):
    """Requests the VGMusic NES index and lists the midi files of one videogame.

    The page is walked row by row: starting at the table row that holds the
    videogame's name, every following row contributes the href of its first
    link until a whitespace-only row (or the end of the document) is reached.

    :param name_of_vg: <class 'str'> The name of a videogame
        from which the list of midis will be created. It must equal the
        text on the page exactly.
    :return: <class 'list'> of the hrefs (.mid files) found for that
        particular videogame, in page order.
    :raises ValueError: if ``name_of_vg`` is not found inside a table row
        of the page.
    :raises requests.HTTPError: if the site answers with an error status.
    """
    # URL to scrape
    URL = "https://www.vgmusic.com/music/console/nintendo/nes/"

    # Wait a random 1-5 seconds so repeated calls do not hammer the server
    time.sleep(random.uniform(1, 5))

    # Bound the wait (requests has no default timeout) and stop on an HTTP
    # error so that an error page is never parsed as if it were the index
    response = requests.get(URL, timeout=30)
    response.raise_for_status()

    # Get the soup obj
    soup = bs4.BeautifulSoup(response.content, "html.parser")

    # Find where the name of the videogame is in the bs4 tree.
    # find() returns None when nothing matches, so check before using it.
    vg_str = soup.find(string=name_of_vg)
    if vg_str is None:
        raise ValueError(
            "Videogame {!r} was not found at {}".format(name_of_vg, URL)
        )

    # Get the parent row of that videogame as a starting point
    vg_tr_header = vg_str.find_parent('tr')
    if vg_tr_header is None:
        raise ValueError(
            "{!r} was found at {} but not inside a table row".format(
                name_of_vg, URL
            )
        )

    # Each row after the header represents an html row containing the midi,
    # file size, who sequenced it, and comments. A whitespace-ONLY row means
    # the videogame has no more midi files; find_next() returning None means
    # the document ended first.
    midis = []
    tr_tag = vg_tr_header.find_next('tr')
    while tr_tag is not None and not tr_tag.get_text().isspace():
        # Rows without a link (or with a link lacking an href) are skipped
        # instead of crashing the whole listing
        link = tr_tag.find('a')
        href = link.get('href') if link is not None else None
        if href:
            midis.append(href)

        # Get the next row
        tr_tag = tr_tag.find_next('tr')

    # Return the list of midis
    return midis

In [8]:
# Function to request those videogames and write them to file
def scrape_midis(midis, write_to_dir=os.path.join(os.getcwd(), 'data')):
    """Function accepts list of midis, downloads, and writes them to disk.

    Each file is requested from the VGMusic NES folder after a random
    1-5 second pause and saved under its own file name in ``write_to_dir``.

    :param midis: <class 'list'> of midi names to scrape from VGMusic.
    :param write_to_dir: <class 'str'> The folder to write the data to.
        Defaults to 'cwd/data/' (resolved when the function is defined).
        It is created if it does not exist.
    :return: None
    :raises requests.HTTPError: if the site answers a download with an
        error status; nothing is written for that file.
    """
    # URL to scrape (note that it already ends with '/')
    URL = "https://www.vgmusic.com/music/console/nintendo/nes/"

    # Create the write directory if needed; exist_ok avoids the race
    # between a separate existence check and the creation
    os.makedirs(write_to_dir, exist_ok=True)

    # Iterate through midis and write to disk
    for midi in midis:
        # Wait a random 1-5 seconds so the server is not hammered
        time.sleep(random.uniform(1, 5))

        # URL ends with '/', so append the name directly: adding another
        # '/' would request ".../nes//<file>". Bound the wait, since
        # requests has no default timeout.
        response = requests.get(
            URL + midi.lstrip('/'), allow_redirects=True, timeout=30
        )

        # Do not save an HTML error page under a .mid file name
        response.raise_for_status()

        # The names come from scraped hrefs, so keep only the final path
        # component to guarantee the file lands inside write_to_dir
        file_name = os.path.basename(midi)
        with open(os.path.join(write_to_dir, file_name), "wb") as fobj:
            fobj.write(response.content)

    # Void function
    return None

# Demo Scraper Below

In [9]:
# Videogame 1943 -- the second videogame on VGMusic's NES website
# NOTE: list_of_vg_midis performs a live HTTP request (after a random
# 1-5 second pause), so this cell needs network access.
name_of_vg = "1943"
midis_1943 = list_of_vg_midis(name_of_vg)
# Unpack the list so the entries print space-separated on a single line
print(*midis_1943)

1943.mid 1943sab.mid 1943-lev1.mid 43pbos1.mid 43pbos12.mid 1943-lev3.mid 1943-Lev3Win.mid 1943lost.mid 1943won.mid 1943boss.mid 1943boss1.mid 1943BossWin.mid


In [10]:
# Download every midi listed in midis_1943 and write it to disk under
# demo_data/ (scrape_midis creates the folder if it does not exist and
# pauses a random 1-5 seconds before each download)
scrape_midis(midis_1943, write_to_dir="demo_data/")

# Download Castlevania MIDIs -- Please don't run this code. We already have the data.

In [12]:
# List of castlevania game titles from which MIDI files will be downloaded
# Guard: the presence of the first game's folder is taken to mean that the
# whole Castlevania set was scraped already, so nothing is requested again
if os.path.exists('./data/Castlevania'):
    print("We already have the data. Please don't scrape again!")
else:
    castlevania_list = ["Castlevania", "Castlevania 2", "Castlevania 3"]

    # For each title: list its midis, then save them under data/<title>
    for game in castlevania_list:
        midis = list_of_vg_midis(game)
        scrape_midis(midis, write_to_dir="data/" + game)

We already have the data. Please don't scrape again!


# The Exploration Of Webpage HTML And The `<class 'bs4.BeautifulSoup()'>` Syntax That Led To The Above Functions

In [None]:
# URL to scrape
URL = "https://www.vgmusic.com/music/console/nintendo/nes/"

# Pause a random 1-5 seconds so repeated runs do not hammer the server
time.sleep(random.uniform(1, 5))

# Request the page. Bound the wait (requests has no default timeout) and stop
# on an HTTP error status so an error page is never parsed as the midi index.
response = requests.get(URL, timeout=30)
response.raise_for_status()
res = response.content

# Get the soup obj
soup = bs4.BeautifulSoup(res, "html.parser")

In [None]:
# Collect every <td class="header"> cell and keep its markup as a string
header_cells = soup.find_all("td", class_="header")
tds = list(map(str, header_cells))

In [None]:
# Find the tds worth looking for 
# NOT A GOOD STRATEGY BECAUSE THIS IS JUST LIST ITERATION
game_matching_ix = []

# Pre-set so the prints below still work when no cell mentions the game
result = None
index = None

for ix, i in enumerate(tds):
    try:
        found_game = i.index("Castlevania")
    except ValueError:
        # str.index raises ValueError when the substring is absent; only that
        # case is skipped, any other error is allowed to surface
        continue
    index = ix
    result = i
    game_matching_ix.append(index)

# Last matching cell, its position, and the positions of all matching cells
print(result)
print(index)
print(game_matching_ix)

In [None]:
# The names of all mid files on webpage
links = []
for link in soup.find_all('a'):
    # Anchors without an href give None; test for that explicitly instead of
    # hiding every possible error behind a bare except
    href = link.get('href')
    if href and href.endswith('.mid'):
        links.append(href)

# Preview at most the first 10 links (slicing is safe if fewer were found)
for i, midi_link in enumerate(links[:10]):
    print(i)
    print(midi_link)
    print()

In [None]:
# Get the HTTP Response content of one of these links
time.sleep(random.uniform(1,5))

# URL already ends with '/', so the file name is appended directly (an extra
# '/' would request ".../nes//<file>"). Bound the wait and stop on an HTTP
# error so an error page is not mistaken for midi data.
link_response = requests.get(URL + links[0], allow_redirects=True, timeout=30)
link_response.raise_for_status()
link_res = link_response.content

In [None]:
# Write that binary response content to file
# Destination: current working directory, same file name as the link
midi_path = os.path.join(os.getcwd(), links[0])
with open(midi_path, 'wb') as fobj:
    fobj.write(link_res)

# Get *specific* videogame music
The parent-sibling bs4 tree can be traversed to acquire different video game links. Each videogame section (e.g. Castlevania) ends when a whitespace-only row is encountered (i.e., an empty row). Iterating through successive next rows until one with whitespace-only text is encountered is how a videogame section may be delineated and how those links may then be scraped. *Below is an example of that for the first video game section on VGMusic's site.*

In [None]:
# Find the string node that matches this game title
vg_str = soup.find(string="10-Yard Fight")

# Get the 'table header' <tr> tag which is the parent of the string
vg_tr_tag = vg_str.find_parent('tr')
print("FIND_PARENT")
print(vg_tr_tag)
print()

# Get text on that tag and show the text itself (not the tag again)
vg_tr_tag_text = vg_tr_tag.get_text()
print('GET_TEXT:')
print(vg_tr_tag_text)
print()

# Get the midi for the first song immediately after the 'table header'
print("GET('href')")
vg_first_song_mid = vg_tr_tag.find_next('tr').find('a').get('href')
print(vg_first_song_mid)
print()

# The row that follows the first song row
print("FIND_NEXT('tr')")
vg_next_tr_tag = vg_tr_tag.find_next('tr').find_next('tr')
print(vg_next_tr_tag)
print()

# Get the text of that row; isspace() is True for any whitespace-only text,
# which includes the no-break space \xa0
print("CHECK FOR WHITESPACE BY USING GET_TEXT()")
whitespace = vg_next_tr_tag.get_text()
print("Is the whitespace found the same as the unicode xa0 (No-break space)?", 'Yes' if (whitespace.isspace()) else 'No')

In [None]:
# Prototype scraper for first Videogame

# Name of videogame
name_of_vg = "10-Yard Fight"

# Find where the name of the videogame is in the bs4 tree.
# find() returns None when nothing matches, so fail with a clear message.
vg_str = soup.find(string=name_of_vg)
if vg_str is None:
    raise ValueError("Videogame {!r} was not found on the page".format(name_of_vg))

# Get the parent tag of that string as a starting point
vg_tr_header = vg_str.find_parent('tr')
print("HEADER")
print(vg_tr_header)
print()

# Get the next tag
tr_tag = vg_tr_header.find_next('tr')
print("NEXT TAG")
print(tr_tag)
print()

# Walk the rows until a whitespace-only row (end of this game's section) or
# the end of the document (find_next returns None) is reached
midis = []
while tr_tag is not None and not tr_tag.get_text().isspace():
    # Get the midi; rows without a link or href are skipped, not fatal
    anchor = tr_tag.find('a')
    href = anchor.get('href') if anchor is not None else None
    if href:
        midis.append(href)

    # Get the next tag
    tr_tag = tr_tag.find_next('tr')
print(midis)