In [None]:
# File: ScrabbleScraper.ipynb
# Version: Python 3.9.7
# Author: Anthony Gallante
# Date: January 4, 2024
# Description: Python script to download annotated Scrabble games from www.cross-tables.com
#              and save them in the 'Scrabble Games' directory.

# Importing necessary libraries
import requests, os, bs4

# Configuration
game_id = 45400                                  # Initial game ID; downloading starts here and counts down
download_button_text = 'Download .gcg game file' # Link text that identifies the download anchor on a game page
site_url = 'https://www.cross-tables.com/'       # Base URL; page paths are appended directly, so keep the trailing slash

# Create the directory that receives the downloaded files (no error if it already exists)
directory = 'Scrabble Games'
os.makedirs(directory, exist_ok=True)

# Browser-like User-Agent header sent with every request (to avoid being
# blocked by the website). The product tokens must be separated by single
# spaces; each literal below therefore ends with a space so that the implicit
# concatenation yields a well-formed header value.
user_agent = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/102.0.0.0 '
    'Safari/537.36'
)
headers = {'user-agent': user_agent}

# Seconds to wait on the server before giving up on a request, so that a
# stalled connection cannot hang the whole run
request_timeout = 30

# Visit one annotated-game page per ID, counting down until game_id reaches 0.
# Any HTTP error status (or timeout) raises and stops the run at that game_id.
while game_id > 0:

    # Build the URL of the annotated-game page for the current game_id
    url = site_url + 'annotated.php?u=' + str(game_id) + '#100#'
    print(f'Navigating to Game {game_id} ...')

    # Fetch the page
    res = requests.get(url, headers=headers, timeout=request_timeout)
    res.raise_for_status() # Raise an exception for bad responses

    # Parse the HTML with an explicitly named parser so the result does not
    # depend on which optional parsers happen to be installed
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Text of the first paragraph on the page ('' when the page has no <p>)
    paragraphs = soup.select('p')
    website_text = paragraphs[0].text if paragraphs else ''

    # Checking if the game is available on the website
    if 'There is no annotated game with that id.' in website_text:
        print(f'Game {game_id} unavailable.')

    else:
        # Locate the anchor whose text is exactly the download button label
        download_link = soup.find('a', string=download_button_text)
        href = download_link.get('href') if download_link is not None else None

        if not href:
            # The page exists but offers no .gcg link; skip it instead of crashing
            print(f'Game {game_id} has no .gcg download link; skipping.')

        else:
            # The href is appended to the base URL to form the download URL
            download_url = site_url + href

            # Fetch the .gcg file
            res = requests.get(download_url, headers=headers, timeout=request_timeout)
            res.raise_for_status() # Raise an exception for bad responses

            # Save the file under the last path component of its URL, in binary
            # mode; the with-block closes the file even if a write fails
            print(f'Downloading .gcg file for game {game_id}.')
            gcg_path = os.path.join(directory, os.path.basename(download_url))
            with open(gcg_path, 'wb') as gcg_file:
                # Writing the file in chunks
                for chunk in res.iter_content(100_000):
                    gcg_file.write(chunk)

    # Decreasing the game_id for the next iteration
    game_id -= 1
