In [12]:
import os
import random
import re
import time
from io import StringIO

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from acquire import *

### All Season & Robot Links

In [13]:
# Load the season -> robot-page-links mapping from its JSON file.
# `load_from_json` comes from the `acquire` star import; the result is used
# below as a dict whose values are collections of links.
robot_links_dict = load_from_json("all_season_robot_links.json")

In [14]:
# Count every robot link across all seasons
total_links = sum(map(len, robot_links_dict.values()))

In [15]:
# Show the total number of robot links summed over all seasons
print(total_links)

394


In [16]:
# Define the regex pattern for matching filenames:
# group 1 is the robot name, group 2 is the table type.
regex_pattern = r'(.+)_((match_history)|(stats))\.pkl'


def scan_existing_files(directory='.'):
    """Map each robot name to the set of table types already pickled in `directory`.

    A file is counted only when its *entire* name has the form
    ``<robot_name>_stats.pkl`` or ``<robot_name>_match_history.pkl``.
    Names with anything after ``.pkl`` (for example ``foo_stats.pkl.bak``)
    are ignored, so leftover backup/temp files are not mistaken for
    finished tables.

    Parameters
    ----------
    directory : str
        Directory to scan; defaults to the current working directory.

    Returns
    -------
    dict[str, set[str]]
        e.g. ``{'some-robot': {'stats', 'match_history'}}``.
    """
    found = {}
    for filename in os.listdir(directory):
        # fullmatch (not match) so the pattern must also consume the end of the name
        match = re.fullmatch(regex_pattern, filename)
        if match:
            found.setdefault(match.group(1), set()).add(match.group(2))
    return found


# Scan the directory and store the existing files' information in a dictionary
existing_files = scan_existing_files()


def load_current_index(path='current_index.txt'):
    """Return the resume index stored in `path`, or 0 when none is usable.

    Falls back to 0 if the file cannot be opened/read (``OSError``) or its
    contents are not an integer (``ValueError``, which also covers an empty
    file). Other exceptions, such as ``KeyboardInterrupt``, propagate
    instead of being silently swallowed.
    """
    try:
        with open(path, 'r') as f:
            return int(f.read())
    except (OSError, ValueError):
        return 0


# Load the current index from file, or start from the beginning
current_index = load_current_index()
print(f"Starting at index {current_index}")

# Start the scraper browser
driver = webdriver.Chrome()

try:
    # Flatten the season -> links mapping into one ordered list so that every
    # link has a fixed position. `current_index` is the number of leading
    # links already handled, i.e. the position of the next link to visit.
    all_links = [link for links in robot_links_dict.values() for link in links]

    for link_position, link in enumerate(all_links):
        # Skip the links that a previous run already got through.
        # Comparing against the link's own position (rather than a separate
        # counter that only advances while skipping) ensures that no link
        # after the resume point is skipped by accident.
        if link_position < current_index:
            continue

        next_index = link_position + 1

        # The robot name is the last path segment of the link
        # (works with or without a trailing slash).
        robot_name = link.rstrip('/').rsplit('/', 1)[-1]

        # Check whether both tables for this robot are already on disk
        tables_on_disk = existing_files.get(robot_name, set())
        if 'stats' in tables_on_disk and 'match_history' in tables_on_disk:
            print(f"found {robot_name} with both tables already in your files")
        else:
            print("reading robot information")

            # Send link to scraper browser
            driver.get(link)

            # Get table elements (Selenium 4 locator API; the old
            # find_element_by_* helpers were removed in Selenium 4.3)
            div_element = driver.find_element(By.CSS_SELECTOR, ".grid-1-1.last")
            table_elements = div_element.find_elements(By.CSS_SELECTOR, "table")

            # Only pages with exactly two tables are saved; any other page is
            # left without files but still counted as processed below.
            if len(table_elements) == 2:
                stats_history_html = table_elements[0].get_attribute("outerHTML")
                match_history_html = table_elements[1].get_attribute("outerHTML")

                # Create dataframes from the tables. The HTML is wrapped in
                # StringIO because passing literal HTML strings to read_html
                # is deprecated in recent pandas.
                stats_df = pd.read_html(StringIO(stats_history_html))[0]
                match_history_df = pd.read_html(StringIO(match_history_html))[0]

                # Save the dataframes to file
                print(f'writing: {robot_name}_stats.pkl')
                stats_df.to_pickle(f'{robot_name}_stats.pkl')
                print(f'writing: {robot_name}_match_history.pkl')
                match_history_df.to_pickle(f'{robot_name}_match_history.pkl')

            # Sleep for a random amount of time between 1 and 5 seconds
            # to avoid hammering the site
            time.sleep(random.uniform(1, 5))

            print(f"processed link {next_index}: {link}")

        # Persist progress after every link so an interrupted run can resume
        current_index = next_index
        with open('current_index.txt', 'w') as f:
            f.write(str(current_index))
finally:
    # Always close the browser, even if a page load or parse step raised
    driver.quit()

Starting at index 0
found banshee-wcvii with both tables already in your files
found big-dill-wcvii with both tables already in your files
found blip-wcvii with both tables already in your files
found captain-shredderator-wcvii with both tables already in your files
found cobalt-wcvii with both tables already in your files
found deathroll-wcvii with both tables already in your files
found doomba-wcvii with both tables already in your files
found dragon-king-wcvii with both tables already in your files
found end-game-wcvii with both tables already in your files
found fusion-wcvii with both tables already in your files
found glitch-wcvii with both tables already in your files
found hijinx-wcvii with both tables already in your files
found huge-wcvii with both tables already in your files
found hypershock-wcvii with both tables already in your files
found kraken-wcvii with both tables already in your files
found lucky-wcvii with both tables already in your files
found malice-wcvii with bo

processed link 132: https://battlebots.com/robot/deviled-egg-2018/
reading robot information
writing: double-jeopardy-2018_stats.pkl
writing: double-jeopardy-2018_match_history.pkl
processed link 133: https://battlebots.com/robot/double-jeopardy-2018/
reading robot information
writing: end-game_stats.pkl
writing: end-game_match_history.pkl
processed link 134: https://battlebots.com/robot/end-game/
reading robot information
writing: gamma-9-s3_stats.pkl
writing: gamma-9-s3_match_history.pkl
processed link 135: https://battlebots.com/robot/gamma-9-s3/
reading robot information
writing: gigabyte_stats.pkl
writing: gigabyte_match_history.pkl
processed link 136: https://battlebots.com/robot/gigabyte/
reading robot information
writing: hypershock-2018_stats.pkl
writing: hypershock-2018_match_history.pkl
processed link 137: https://battlebots.com/robot/hypershock-2018/
reading robot information
writing: icewave-3_stats.pkl
writing: icewave-3_match_history.pkl
processed link 138: https://battl

processed link 186: https://battlebots.com/robot/captain-shrederator/
reading robot information
processed link 187: https://battlebots.com/robot/chronic/
reading robot information
writing: counter-revolution_stats.pkl
writing: counter-revolution_match_history.pkl
processed link 188: https://battlebots.com/robot/counter-revolution/
reading robot information
writing: hypershock_stats.pkl
writing: hypershock_match_history.pkl
processed link 189: https://battlebots.com/robot/hypershock/
reading robot information
writing: lock-jaw_stats.pkl
writing: lock-jaw_match_history.pkl
processed link 190: https://battlebots.com/robot/lock-jaw/
reading robot information
writing: nightmare_stats.pkl
writing: nightmare_match_history.pkl
processed link 191: https://battlebots.com/robot/nightmare/
reading robot information
writing: overhaul_stats.pkl
writing: overhaul_match_history.pkl
processed link 192: https://battlebots.com/robot/overhaul/
reading robot information
writing: radioactive_stats.pkl
writi

In [None]:
#df = pd.read_parquet('robot_data_captain-shredderator-wcvii.parquet')

In [None]:
# Read nested dataframe from Parquet file
#pf = fp.ParquetFile('robot_data.parquet')
#df = pf.to_pandas()

In [None]:
# Access the 'stats' dataframe for robot 'robot1'
#stats_df = df.loc['robot1', 'stats']