In [1]:
import requests
import json
from bs4 import BeautifulSoup
import re
from os import path, makedirs

In [2]:
# Root URL of the per-episode script pages; fetch_script appends "<season><episode>.html" to it.
baseurl = "https://edersoncorbari.github.io/friends-scripts/season/"
# Seasons 1 through 10 (the range end is exclusive).
seasons = range(1, 11)

In [62]:
def fetch_script(baseurl, season, episode, check_combine=False, unprocessed=True, timeout=30.0) -> tuple[list[str], int]:
    """Download one script page and return the text of its <p> elements.

    Args:
        baseurl: URL prefix; the zero-padded season/episode numbers and
            ".html" are appended to it.
        season: Season number, formatted as two digits in the URL.
        episode: Episode number, formatted as two digits in the URL.
        check_combine: When True, request the combined page named
            "<season><episode>-<season><episode+1>.html" instead of the
            single-episode page.
        unprocessed: When True (default), return each paragraph stripped of
            surrounding whitespace but otherwise untouched (no trailing
            newline). When False, return cleaned single-line strings, each
            terminated by "\n".
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        ``(lines, status)`` where ``status`` is 0 on success and 1 when the
        server answered with a non-OK status code (``lines`` is then empty).

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    if check_combine:
        url = f"{baseurl}{season:02d}{episode:02d}-{season:02d}{episode+1:02d}.html"
    else:
        url = f"{baseurl}{season:02d}{episode:02d}.html"

    response = requests.get(url, timeout=timeout)
    if response.status_code != requests.codes.ok:
        print(f"Failed to fetch {url}")
        return ([], 1)

    soup = BeautifulSoup(response.text, 'html.parser')
    # html.parser does not synthesize a <body>; fall back to the whole document
    # instead of failing with AttributeError on None.
    root = soup.body if soup.body is not None else soup

    # Strip first, then filter, so whitespace-only paragraphs are dropped too.
    paragraphs = [ele.get_text().strip() for ele in root.find_all('p')]
    paragraphs = [text for text in paragraphs if text]

    if unprocessed:
        return (paragraphs, 0)

    # Basic preprocessing to clean up the text
    lines = []
    for text in paragraphs:
        flat = text.replace('\n', ' ').replace('\r', ' ')
        # Very long paragraphs are skipped (presumably non-dialogue blobs — TODO confirm).
        if len(flat) >= 2000:
            continue
        # Remove parenthesised text, then collapse runs of whitespace.
        without_parens = re.sub(r'\([^)]*\)', '', flat)
        lines.append(re.sub(r'\s+', ' ', without_parens).strip() + "\n")
    return (lines, 0)
    

In [63]:
def fetch_all_scripts(baseurl: str, seasons: range, output_dir: str) -> None:
    """Download every available script page and save each one as a text file.

    For each season, episode pages are requested in increasing order. When the
    single-episode page is missing, the combined page ("NN-NN+1") is tried;
    if that is missing too, the season is treated as finished.

    Args:
        baseurl: URL prefix handed to ``fetch_script``.
        seasons: Season numbers to download.
        output_dir: Directory (created if needed) receiving one file per
            fetched page, named ``s<season>e<episode>.txt``. A combined page
            is stored under its first episode number.
    """
    makedirs(output_dir, exist_ok=True)

    for season in seasons:
        print(f"Fetching scripts for season {season}")
        # Number of episodes swallowed by combined pages so far. It must start
        # at zero for every season, otherwise later seasons begin past episode 1.
        offset = 0
        for index in range(1, 25):  # At most 24 script pages are tried per season
            episode = index + offset
            print(f"Episode {episode}")

            # Try normal episode first
            lines, status = fetch_script(baseurl, season, episode, check_combine=False)
            if status == 1:
                # Try combined episode if normal not found
                lines, status = fetch_script(baseurl, season, episode, check_combine=True)
                if status == 1:
                    # No script found, likely end of season
                    break
                # The combined page also covers episode + 1, so skip that number.
                offset += 1

            # Save the script to a file. fetch_script only terminates lines with
            # "\n" in its cleaned mode, so add the newline where it is missing.
            filename = f"{output_dir}/s{season:02d}e{episode:02d}.txt"
            with open(filename, 'w', encoding='utf-8') as f:
                f.writelines(text if text.endswith("\n") else text + "\n" for text in lines)

In [64]:
# Uncomment the call below to download all scripts into the "scripts" directory (requires network access).
# fetch_all_scripts(baseurl, seasons, "scripts")

In [91]:
def _split_speakers(speaker_field: str) -> list[str]:
    """Split the text before a line's first ':' into speaker names, in order of appearance."""
    names = []
    # Split on commas and on the whole word "and" only, so names that merely
    # contain those letters (e.g. "Chandler") stay intact.
    for part in re.split(r'\s*(?:,|\band\b)\s*', speaker_field):
        name = part.strip()
        if name and name not in names:
            names.append(name)
    return names


def _episode_files(input_dir: str, season: int) -> list[tuple[int, str]]:
    """Return (episode number, file path) pairs for one season, ordered by episode number."""
    from os import listdir  # local import: the notebook only imports path and makedirs from os

    pattern = re.compile(rf"s{season:02d}e(\d+)\.txt")
    found = []
    for name in listdir(input_dir):
        match = pattern.fullmatch(name)
        if match:
            found.append((int(match.group(1)), f"{input_dir}/{name}"))
    return sorted(found)


def post_process_scripts(seasons: range, input_dir: str, output_dir: str) -> None:
    """Turn saved script files into next-speaker samples and write them as JSON.

    Every ``s<season>e<episode>.txt`` file in ``input_dir`` is read line by
    line. Text before an episode's first "[Scene:" marker is ignored. For each
    subsequent line of the form "<speakers>: <text>" whose speakers include a
    main character, one sample per named main character is emitted:

        scene_id      "s<season>e<episode>sc<scene counter>"
        lines         up to three preceding kept lines from the same scene
        next_speaker  the main character speaking
        next_line     the text after the first ':' (surrounding whitespace removed)

    Lines spoken by "All" are kept as context but produce no sample; lines by
    other speakers are dropped entirely. The scene counter runs across the
    whole corpus and is not advanced by the first marker of an episode.

    Args:
        seasons: Season numbers to process.
        input_dir: Directory containing the script text files; must exist.
        output_dir: Directory (created if needed) that receives
            ``processed_scripts.json``.
    """
    assert path.exists(input_dir), f"Input directory {input_dir} does not exist."
    makedirs(output_dir, exist_ok=True)

    scene_number = 0
    last_three_lines = []
    main_characters = ['Ross', 'Rachel', 'Chandler', 'Monica', 'Joey', 'Phoebe', 'All']
    processed_data = []

    for season in seasons:
        print(f"Post-processing scripts for season {season}")
        # Discover the files instead of assuming episodes 1-24, so higher
        # episode numbers present on disk are not silently skipped.
        for episode, filename in _episode_files(input_dir, season):
            with open(filename, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            episode_started = False
            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                if line.startswith('[Scene:') or line.startswith('[scene:'):
                    if episode_started:
                        scene_number += 1
                    episode_started = True
                    # Every scene, including an episode's first, starts with an
                    # empty context so lines never leak in from the previous episode.
                    last_three_lines = []
                    continue

                if not episode_started:
                    continue

                # Remove lines which are not dialogue lines
                if ':' not in line:
                    continue

                # Split on the first ':' only so dialogue such as "5:30" is kept whole.
                speaker_field, spoken = line.split(':', 1)
                speakers = _split_speakers(speaker_field)

                # If the current line is not by one of the main characters, skip it
                if not any(name in main_characters for name in speakers):
                    continue

                for name in speakers:
                    # 'All' (the last entry) counts as a main speaker but gets no sample.
                    if name not in main_characters[:-1]:
                        continue

                    processed_data.append({
                        'scene_id': f's{season:02d}e{episode:02d}sc{scene_number:02d}',
                        'lines': list(last_three_lines),
                        'next_speaker': name,
                        'next_line': spoken.strip(),
                    })

                # Hopefully, only dialogue is left now
                last_three_lines = last_three_lines[-2:] + [line]

    # Save the processed data to a JSON file
    output_filename = f"{output_dir}/processed_scripts.json"
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4, ensure_ascii=False)

In [92]:
# Build processed_scripts/processed_scripts.json from the text files in "scripts" (that directory must already exist).
post_process_scripts(seasons, "scripts", "processed_scripts")

Post-processing scripts for season 1
Post-processing scripts for season 2
Post-processing scripts for season 3
Post-processing scripts for season 4
Post-processing scripts for season 5
Post-processing scripts for season 6
Post-processing scripts for season 7
Post-processing scripts for season 8
Post-processing scripts for season 9
Post-processing scripts for season 10


In [21]:
# Scratch check: split the speaker field (text before the first ':') on commas.
# `line` is defined here so the cell also runs on a fresh kernel; the sample
# value is illustrative — the original value came from earlier kernel state.
line = "Ross: Hi"
line.split(':')[0].split(',')

['Ross']