In [1]:
import os
import re
import time
import random
import pandas as pd
from acquire import *
from selenium import webdriver

In [2]:
def load_current_index(filename):
    """
    Loads the current index from the specified file, or returns 0 if the file does not exist.

    Args:
        filename (str): The name of the file containing the current index.

    Returns:
        int: The current index.
    """
    try:
        with open(filename, 'r') as f:
            current_index = int(f.read())
    except FileNotFoundError:
        current_index = 0
    
    return current_index

In [19]:
def scrape_robot_data(robot_links_dict, current_index=0):
    """
    Scrapes robot data from the provided dictionary of links, starting from the specified index.

    Args:
        robot_links_dict (dict): A dictionary of links to robot pages on the BattleBots website.
        current_index (int, optional): The index of the first link to process. Defaults to 0.

    Returns:
        None
    """
    # Define the regex pattern for matching filenames
    regex_pattern = r'(.+)_((match_history)|(stats))\.pkl'

    # Scan the directory and store the existing files' information in a dictionary
    existing_files = {}
    for filename in os.listdir('pickle-files'):
        match = re.match(regex_pattern, filename)
        if match:
            robot_name = match.group(1)
            table_type = match.group(2)
            if robot_name not in existing_files:
                existing_files[robot_name] = set()
            existing_files[robot_name].add(table_type)

    # Start scraper with link
    driver = webdriver.Chrome()

    # Iterate through the dictionary of links starting from the current index
    processed_links = 0
    
    # loop
    for season, links in robot_links_dict.items():
        for link in links:
            # Skip the robots that have already been processed
            if processed_links < current_index:
                processed_links += 1
                continue

            

            print("reading robot information")
            
            # Extract robot name from the link
            robot_name = link.split('/')[-2]

            # Check if the robot and table types are already in the existing_files dictionary
            if robot_name in existing_files and 'stats' in existing_files[robot_name] and 'match_history' in existing_files[robot_name]:
                print(f"found {robot_name} with both tables already in your files")
                # Increment the current index and save to file
                current_index += 1
                with open('current_index.txt', 'w') as f:
                    f.write(str(current_index))
                continue

            # Send link to scraper browser
            driver.get(link)

            # Create a subdirectory to store the pickle files
            subdirectory = 'pickle-files'
            if not os.path.exists(subdirectory):
                os.makedirs(subdirectory)

            # Get table elements
            div_element = driver.find_element_by_css_selector(".grid-1-1.last")
            table_elements = div_element.find_elements_by_css_selector("table")

            # Check if there are two tables
            if len(table_elements) == 2:
                stats_history_html = table_elements[0].get_attribute("outerHTML")
                match_history_html = table_elements[1].get_attribute("outerHTML")

                # Create dataframes from the tables
                stats_df = pd.read_html(stats_history_html)[0]
                match_history_df = pd.read_html(match_history_html)[0]

                # Save the dataframes to file
                stats_filename = os.path.join(subdirectory, f'{robot_name}_stats.pkl')
                stats_df.to_pickle(stats_filename)
                print(f'writing: {stats_filename}')

                match_history_filename = os.path.join(subdirectory, f'{robot_name}_match_history.pkl')
                match_history_df.to_pickle(match_history_filename)
                print(f'writing: {match_history_filename}')
            
            # Sleep for a random amount of time between 3 and 7 seconds
            time.sleep(random.uniform(1, 5))
            
            # Increment the current index and save to file
            current_index += 1
            print(f"processed link {current_index}: {link}")

    # Close the driver
    driver.quit()

In [20]:
# loading the JSON file with all season & robot links
robot_links_dict = load_from_json("all_season_robot_links.json")

In [21]:
for key in robot_links_dict.keys():
    print(f"Length of {key}: {len(robot_links_dict[key])}")

Length of world-championship-vii-robots: 57
Length of 2021-season-robots: 63
Length of 2020-season-robots: 62
Length of 2019-season-robots: 69
Length of 2018-season-robots: 55
Length of season-2-robots: 60
Length of season-1-robots: 28


In [22]:
total_length = 0
for value in robot_links_dict.values():
    if isinstance(value, list):
        total_length += len(value)

print(f"Total number of items in all lists: {total_length}")

Total number of items in all lists: 394


In [23]:
robot_links_dict = load_from_json("all_season_robot_links.json")

# Initialize a set to store unique robot names
unique_robot_names = set()

# Iterate over each key in the dictionary
for key in robot_links_dict.keys():
    # Iterate over each robot link in the list
    for robot_link in robot_links_dict[key]:
        # Extract the robot name from the link
        robot_name = robot_link.split("/")[-2]
        # Add the robot name to the set of unique robot names
        unique_robot_names.add(robot_name)

# Print the total count of unique robot names
print(f"Total unique robot names: {len(unique_robot_names)}")

Total unique robot names: 394


In [24]:
# resume from last index
checkpoint = load_current_index('current_index.txt')

In [25]:
# scrape & save robot tables to Pickle
scrape_robot_data(robot_links_dict,checkpoint)

reading robot information
found banshee-wcvii with both tables already in your files
reading robot information
found big-dill-wcvii with both tables already in your files
reading robot information
found blip-wcvii with both tables already in your files
reading robot information
found captain-shredderator-wcvii with both tables already in your files
reading robot information
found cobalt-wcvii with both tables already in your files
reading robot information
found deathroll-wcvii with both tables already in your files
reading robot information
found doomba-wcvii with both tables already in your files
reading robot information
found dragon-king-wcvii with both tables already in your files
reading robot information
found end-game-wcvii with both tables already in your files
reading robot information
found fusion-wcvii with both tables already in your files
reading robot information
found glitch-wcvii with both tables already in your files
reading robot information
found hijinx-wcvii with b

processed link 101: https://battlebots.com/robot/electric-ray-2019/
reading robot information
found extinguisher-2019 with both tables already in your files
reading robot information
found the-four-horsemen-2019 with both tables already in your files
reading robot information
found free-shipping-2019 with both tables already in your files
reading robot information
found gigabyte-2019 with both tables already in your files
reading robot information
found huge-2019 with both tables already in your files
reading robot information
found hypershock-2019 with both tables already in your files
reading robot information
found kingpin-2019 with both tables already in your files
reading robot information
found lock-jaw-2019 with both tables already in your files
reading robot information
found madcatter-2019 with both tables already in your files
reading robot information
found marvin-2019 with both tables already in your files
reading robot information
found monsoon-2019 with both tables alread