In [7]:
# To start with, I'll parse the HTML file to understand its structure and content.
# I will use BeautifulSoup to parse the HTML and identify the scene shifts and characters.

import os
import requests
from bs4 import BeautifulSoup

# Let's start by fetching the series index page and parsing it to find all the links corresponding to the episodes.
# Each episode link has a text format like "1x01", "1x02", etc.

# URL of the series index page (its <title> is "Game of Thrones Transcripts Index");
# the per-episode transcript pages are linked from it.
episode_url = "https://transcripts.foreverdreaming.org/viewtopic.php?t=7739"

# Fetch the page. Without a timeout requests waits indefinitely on a stalled
# connection, and raise_for_status() keeps us from parsing an HTTP error page
# as if it were the index.
response = requests.get(episode_url, timeout=30)
response.raise_for_status()

# Parse the content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')  # response.text is the decoded HTML body


# Dump the parsed page once to inspect its structure (the output is very large).
print(soup)

# Function to extract all episode links from the page
def extract_episode_links(soup):
    """Collect episode URLs from a parsed index page, grouped by season.

    A link counts as an episode link when its stripped text looks like
    ``<season>x<episode>`` (e.g. "1x01"): the part before the first "x" is at
    most two characters long and converts to an integer, and the part after
    it consists only of digits.

    Args:
        soup: Parsed page exposing ``find_all('a', href=True)``; each returned
            link must provide ``.text`` and ``link['href']``.

    Returns:
        dict mapping season number (int) to the list of episode URLs, in the
        order the links appear on the page.
    """
    # Local import so the function does not depend on a top-of-notebook import.
    from urllib.parse import urljoin

    base_url = 'https://transcripts.foreverdreaming.org'
    topic_path = '/viewtopic.php'
    episode_links = {}

    # Find all 'a' tags with href attribute
    for link in soup.find_all('a', href=True):
        link_text = link.text.strip()
        if 'x' not in link_text:
            continue

        parts = link_text.split('x')
        season_text, episode_text = parts[0], parts[1]
        if len(season_text) > 2 or not episode_text.isdigit():
            continue

        # Text such as "Max2" passes the checks above but has no numeric
        # season; skip it instead of letting int() raise ValueError.
        try:
            season_number = int(season_text)
        except ValueError:
            continue

        href = link['href']
        topic_start = href.find(topic_path)
        if topic_start != -1:
            # Keep only the part from '/viewtopic.php' onwards and prefix the site root.
            url = base_url + href[topic_start:]
        else:
            # find() returned -1: slicing with it would keep just the last
            # character of the href, so resolve the href against the site root.
            url = urljoin(base_url + '/', href)

        # Organize links by season
        episode_links.setdefault(season_number, []).append(url)

    return episode_links

# Extracting the episode links
episode_links = extract_episode_links(soup)

# Display the first few links to verify. A bare expression is only echoed when it
# is the last statement of a cell, so it is printed explicitly here.
print({season: links[:2] for season, links in episode_links.items()})  # first two links of each season for brevity

# Folder (relative path) under which one sub-folder per season is created;
# change it here to store the transcripts somewhere else.
base_directory_path = 'files/Game_of_Thrones_Transcripts'

# This function will take the base directory and the links dictionary to download and organize the transcripts.
def download_transcripts(base_directory, episode_links, timeout=30):
    """Download every episode page and save it as HTML, one folder per season.

    Args:
        base_directory: Folder under which the ``Season_<n>`` folders are created.
        episode_links: Mapping of season number to a list of episode URLs.
        timeout: Seconds to wait on each HTTP request before giving up.

    Files are named ``season_<n>_episode_<k>.html``, where ``k`` is the 1-based
    position of the URL in its season's list. A URL that cannot be fetched
    (connection error, timeout or HTTP error status) is reported and skipped;
    it still uses up its episode number, so the other files keep their names.
    """
    for season, links in episode_links.items():
        # Create a directory for the season
        season_directory = os.path.join(base_directory, f"Season_{season}")
        os.makedirs(season_directory, exist_ok=True)

        for episode_count, link in enumerate(links, start=1):
            # Generate the filename using season and episode count
            filename = f"season_{season}_episode_{episode_count}.html"
            file_path = os.path.join(season_directory, filename)

            try:
                response = requests.get(link, timeout=timeout)
                # Do not store an HTTP error page as if it were a transcript.
                response.raise_for_status()
            except requests.RequestException as error:
                print(f"Failed to download {filename} from {link}: {error}")
                continue

            # Write the content to the file
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"Downloaded {filename} in {season_directory}")
            
# episode_links was already built above by extract_episode_links(soup), so it is reused here.

# Download every linked page into base_directory_path, one "Season_<n>" folder per season.
download_transcripts(base_directory_path, episode_links)

<!DOCTYPE html>

<html dir="ltr" lang="en-gb">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport">
<!-- Chrome, Firefox OS and Opera -->
<meta content="#107c10" name="theme-color"/>
<!-- Windows Phone -->
<meta content="#107c10" name="msapplication-navbutton-color"/>
<!-- iOS Safari -->
<meta content="#107c10" name="apple-mobile-web-app-status-bar-style"/>
<title>Game of Thrones Transcripts Index - Transcripts - Forever Dreaming</title>
<link href="/app.php/feed?sid=806d6e0f499d4ca0b5459f04b15a623e" rel="alternate" title="Feed - Transcripts - Forever Dreaming" type="application/atom+xml"/>
<link href="https://transcripts.foreverdreaming.org/viewtopic.php?t=7739" rel="canonical"/>
<!--
	phpBB style name: Project Durango
	Based on style:   prosilver (this is the default phpBB3 style)
	Original author:  Tom Beddard ( http://www.subBlue.com/ )
	Modified by:	MannixMD @MannixMD
-->
<link hr

In [13]:
import os
import re
import pandas as pd

 
def read_csv(file_path):
    """Load the CSV file at ``file_path`` into a pandas DataFrame, decoding it as UTF-8."""
    frame = pd.read_csv(file_path, encoding='utf-8')
    return frame

texts = read_csv("../Game_of_Thrones_Script.csv")

# Maps each character name to the list of lines they speak, in file order.
character_dict = {}

# Walk the needed columns in parallel rather than with DataFrame.iterrows(),
# which builds a new Series object for every row.
# NOTE(review): a row with a missing Name would be keyed by NaN — confirm the CSV has none.
dialogue_columns = zip(
    texts['Name'],
    texts['Season'],
    texts['Episode'],
    texts['Episode Title'],
    texts['Sentence'],
)
for name, season, episode, title, sentence in dialogue_columns:
    # setdefault creates the character's list the first time the name is seen
    character_dict.setdefault(name, []).append({
        'Season': season,
        'Episode': episode,
        'Episode Title': title,
        'Sentence': sentence
    })

print(len(character_dict))
print(character_dict.keys())

# Save the character names as a txt file, one name per line. The encoding is
# given explicitly so the output matches the UTF-8 input instead of depending
# on the platform's default encoding.
with open('character_names.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{character}\n" for character in character_dict)



565
dict_keys(['waymar royce', 'will', 'gared', 'royce', 'jon snow', 'septa mordane', 'sansa stark', 'eddard stark', 'robb stark', 'jonrobb', 'cassel', 'catelyn stark', 'bran stark', 'theon greyjoy', 'jaime lannister', 'cersei lannister', 'maester luwin', 'luwin', 'arya stark', 'robert baratheon', 'tyrion lannister', 'ros', 'viserys', 'daenerys targaryen', 'maid', 'illyrio', 'benjen', 'a voice', 'jorah mormont', 'khal drogo', 'sandor clegane', 'doreah', 'irri', 'viserys targaryen', 'joffrey lannister', 'myrcella baratheon', 'benjen stark', 'assassin', 'jhiqui', 'rodrick cassel', 'mycah', 'soldier', 'jory cassel', 'kings landing guard', 'varys', 'renly baratheon', 'petyr baelish', 'grand maester pycelle', 'old nan', 'guard', 'alliser thorne', 'jeor mormont', 'grenn', 'pyp', 'rast', 'barristan selmy', 'lancel lannister', 'rakharo', 'yoren', 'rhakaro', 'maester aemon', 'syrio forel', 'sam tarly', 'janos slynt', 'hugh of vale', 'tobho mott', 'gendry baratheon', 'marillion', 'masha heddle',