In [1]:
# To start with, I'll parse the HTML file to understand its structure and content.
# I will use BeautifulSoup to parse the HTML and identify the scene shifts and characters.

import os
import requests
from bs4 import BeautifulSoup

# Let's start by parsing the uploaded HTML file to find all the links corresponding to the episodes.
# Each episode link has a text format like "1x01", "1x02", etc.

# Load and parse the HTML file to extract the links
file_path = '../files/Game of Thrones Transcripts Index - Transcripts - Forever Dreaming.html'

with open(file_path, 'r', encoding='utf-8') as file:
    index_html_content = file.read()

# Parse the content using BeautifulSoup
index_soup = BeautifulSoup(index_html_content, 'html.parser')

# Function to extract all episode links from the page
def extract_all_episode_links(soup):
    episode_links = {}

    # Find all 'a' tags with href attribute
    for link in soup.find_all('a', href=True):
        link_text = link.text.strip()

        # Check if the link text matches the episode format (e.g., "1x01") or the special case for season 4 (e.g., "4.05")
        if link_text:
            season_number = None
            episode_number = None

            if 'x' in link_text and len(link_text.split('x')[0]) <= 2 and link_text.split('x')[1].isdigit():
                season_number, episode_number = link_text.split('x')
            elif '.' in link_text and len(link_text.split('.')[0]) == 1 and link_text.split('.')[1].isdigit():
                season_number, episode_number = link_text.split('.')

            if season_number and episode_number:
                season_number = int(season_number)
                # Correct the URL by combining the base URL with the href attribute
                # Ensure that only the part after '/viewtopic.php' is appended to the base URL
                url = 'https://transcripts.foreverdreaming.org' + link['href'][link['href'].find('/viewtopic.php'):]

                # Organize links by season
                if season_number not in episode_links:
                    episode_links[season_number] = []
                episode_links[season_number].append(url)

    return episode_links

# Extracting the episode links
episode_links = extract_all_episode_links(index_soup)

# Display the first few links to verify
{season: links[:2] for season, links in episode_links.items()}  # show first two links of each season for brevity

{1: ['https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=7743',
  'https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=7871'],
 2: ['https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=7872',
  'https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=7914'],
 3: ['https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=7885',
  'https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=7913'],
 4: ['https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=10931',
  'https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=10964'],
 5: ['https://transcripts.foreverdreaming.org/viewtopic.php?f=201&t=16452',
  'https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=17753'],
 6: ['https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=26857',
  'https://transcripts.foreverdreaming.org/viewtopic.php?f=67&t=26957']}

In [2]:
# Make sure to replace 'your_directory_path' with the actual path where you want the folders to be created.
base_directory_path = '../files/Game_of_Thrones_Transcripts'

# This function will take the base directory and the links dictionary to download and organize the transcripts.
def download_transcripts(base_directory, episode_links):
    for season, links in episode_links.items():
        # Create a directory for the season
        season_directory = os.path.join(base_directory, f"Season_{season}")
        os.makedirs(season_directory, exist_ok=True)

        # Counter for episodes
        episode_count = 1

        for link in links:
            # Make a request to the episode link
            response = requests.get(link)
            # Generate the filename using season and episode count
            filename = f"season_{season}_episode_{episode_count}.html"
            file_path = os.path.join(season_directory, filename)

            # Write the content to the file
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"Downloaded {filename} in {season_directory}")

            # Increment the episode count
            episode_count += 1
            
# Extracting the episode links from the HTML file (as done previously)

# Calling the download function
download_transcripts(base_directory_path, episode_links)

Downloaded season_1_episode_1.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_2.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_3.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_4.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_5.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_6.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_7.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_8.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_9.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_1_episode_10.html in ../files/Game_of_Thrones_Transcripts/Season_1
Downloaded season_2_episode_1.html in ../files/Game_of_Thrones_Transcripts/Season_2
Downloaded season_2_episode_2.html in ../files/Game_of_Thrones_Transcripts/

In [3]:
# Load the HTML content
file_path = '../files/Game_of_Thrones_Transcripts/Season_1/season_1_episode_1.html'

# Open the HTML file and read the content
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Let's proceed to identify scene shifts and characters.
# The scene shifts seem to be indicated by specific phrases, so we will look for those.
# We will also need to extract the character names which precede their dialogue.

# This function will try to detect scene shifts and character lines.
def parse_scenes_and_characters(soup):
    scenes = []
    current_scene = []
    characters_in_scene = set()

    for element in soup.descendants:
        if isinstance(element, str):  # we are only interested in text, not tags
            text = element.strip()
            if "scene shifts" in text.lower() or "scene changes" in text.lower():
                # When a scene shift is detected, we start a new scene
                if characters_in_scene:  # only add if the scene has characters
                    scenes.append(list(characters_in_scene))
                    characters_in_scene = set()
            else:
                # This is a heuristic: assuming that lines with ":" are character lines
                if ':' in text:
                    character_line = text.split(':', 1)[0].strip()
                    # We add characters to the current scene set
                    if character_line.isupper():  # Assuming all character names are in uppercase
                        characters_in_scene.add(character_line)
    
    # Adding the last scene if it's not empty
    if characters_in_scene:
        scenes.append(list(characters_in_scene))

    return scenes

# Now let's parse the HTML content to identify the scenes and characters.
scenes_with_characters = parse_scenes_and_characters(soup)

# Let's see the first few scenes to verify our parsing
scenes_with_characters[:5]  # showing only the first 5 scenes for brevity



[['WILL', 'WAYMAR ROYCE', 'NOTE', 'ROYCE', 'GARED'],
 ['JON'],
 ['SEPTA MORDANE',
  'JON',
  'CATELYN',
  'NED',
  'ROBB',
  'JON/ROBB',
  'CASSEL',
  'SANSA'],
 ['BRAN', 'JON', 'NED', 'WILL'],
 ['JON', 'THEON', 'NED', 'ROBB', 'BRAN', 'CASSEL']]