In [None]:
import re
import chardet
import os
import csv

In [None]:
# function to detect file encoding

def detect_encoding(file_path):
    """Guess the text encoding of a file with chardet.

    The file is read in binary mode and handed to ``chardet.detect``. When
    chardet reports no encoding, or reports ASCII (in any letter case),
    ``'utf-8'`` is returned instead; otherwise the reported name is returned
    unchanged. A one-line message describing the outcome is printed.

    Args:
        file_path: path of the file to inspect.

    Returns:
        str: the encoding name to use when opening the file as text.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()
    guess = chardet.detect(payload).get('encoding')

    # guard clauses: both "nothing detected" and "ascii" are mapped to utf-8
    if guess is None:
        print("No encoding detected, defaulting to utf-8")
        return 'utf-8'
    if guess.lower() == 'ascii':
        print("Detected ASCII encoding, defaulting to utf-8")
        return 'utf-8'

    print(f"Detected encoding: {guess}")
    return guess

In [None]:
# function to read screenplay

def read_screenplay(file_path):
    """Load a screenplay file as text using its detected encoding.

    The encoding comes from ``detect_encoding``. Bytes that cannot be decoded
    are dropped (``errors='ignore'``). If the detected encoding name is one
    Python has no codec for, the file is read as utf-8 instead of failing.

    Args:
        file_path: path of the screenplay text file.

    Returns:
        str: the decoded screenplay text.
    """
    encoding = detect_encoding(file_path)
    if encoding.lower() == 'ascii':
        encoding = 'utf-8'  # fallback to UTF-8 if ASCII is detected

    try:
        with open(file_path, 'r', encoding=encoding, errors='ignore') as file:
            text = file.read()
    except LookupError:
        # open() raises LookupError for an encoding name without a Python
        # codec; a detector can report such names, so retry with utf-8
        print(f"Unsupported encoding {encoding}, defaulting to utf-8")
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()

    print("Screenplay text loaded.")
    return text

In [None]:
# function to identify scenes

def identify_scenes(text, title, min_matches=150):
    """Find the lines of a screenplay that act as scene headings.

    Every line is left-stripped and then checked in up to three passes. A
    further pass only runs while fewer than ``min_matches`` headings have
    been collected:

    1. lines containing an ``EXT`` or ``INT`` marker followed by one of
       ``. : - –`` or whitespace,
    2. lines consisting only of capitals, digits, whitespace and the
       characters ``: ( ) - .`` that have at least three words,
    3. lines containing ``FADE OUT``, ``CUT TO``, ``DISSOLVE`` or
       ``SMASH CUT``, or that are a ``[Scene ...]`` tag.

    Args:
        text: full screenplay text.
        title: movie title; used only in the printed progress message.
        min_matches: number of headings below which the next, looser pass
            is tried. Defaults to 150.

    Returns:
        list[str]: one ``"<heading line> SCENE<nnn>"`` entry per heading, in
        document order, numbered from 001 so that repeated headings stay
        distinguishable.
    """
    # compile the regular expressions
    ext_pattern = re.compile(r'\bEXT[.\:\s\-\–]', re.MULTILINE)
    # \b keeps words such as "POINT." or "PAINT " from counting as INT
    int_pattern = re.compile(r'\bINT[.\:\s\-\–]', re.MULTILINE)
    uppercase_pattern = re.compile(r'^[A-Z0-9\s:\(\)\-\.\:]+$', re.MULTILINE)
    transition_patterns = (
        re.compile(r'\bFADE OUT[.\:\s\-\–]', re.MULTILINE),
        re.compile(r'\bCUT TO[.\:\s\-\–]', re.MULTILINE),
        re.compile(r'\bDISSOLVE[.\:\s\-\–]', re.MULTILINE),
        re.compile(r'\bSMASH CUT[.\:\s\-\–]', re.MULTILINE),
        re.compile(r'(?m)^\[Scene:?\s.*?\]$', re.MULTILINE),
    )

    # split text into lines and remove leading whitespace from each line
    lines = [line.lstrip() for line in text.splitlines()]

    # Indices (not texts) of accepted lines: a physical line is counted once
    # even if several passes match it, while the same heading text occurring
    # on different lines is still counted once per occurrence.
    matched = set()

    def collect(is_heading):
        for index, line in enumerate(lines):
            if index not in matched and is_heading(line):
                matched.add(index)

    # pass 1: EXT and INT headings
    collect(lambda line: ext_pattern.search(line) or int_pattern.search(line))

    # pass 2: uppercase lines of at least three words
    if len(matched) < min_matches:
        collect(lambda line: uppercase_pattern.match(line)
                and len(line.split()) >= 3)

    # pass 3: fade, cut, dissolve, smash and [Scene ...] lines
    if len(matched) < min_matches:
        collect(lambda line: any(p.search(line) for p in transition_patterns))

    # number in document order so consecutive entries are consecutive scenes
    matches = [
        f"{lines[index]} SCENE{number:03d}"
        for number, index in enumerate(sorted(matched), start=1)
    ]

    print(f"{title}: {len(matches)} matches found")

    return matches

In [None]:
# function to extract the text belonging to each identified scene

def extract_scenes(text, matches):
    """Cut the screenplay text into one chunk per scene heading.

    Each entry of ``matches`` has the form ``"<heading> SCENE<id>"``. The
    heading is located in ``text`` and the scene runs from there up to the
    start of the next located heading (or the end of the text).

    Args:
        text: full screenplay text.
        matches: heading entries in the ``"<heading> SCENE<id>"`` format.

    Returns:
        dict[str, str]: ``"<id> <heading>"`` mapped to the stripped scene
        text, in the order of ``matches``. Headings that cannot be found in
        ``text`` are reported and skipped.

    Raises:
        ValueError: if an entry does not contain the ``" SCENE"`` separator.
    """
    # split up scene heading and number again; rsplit with maxsplit=1 so a
    # heading that itself contains " SCENE" (e.g. "EXT. CRIME SCENE - NIGHT")
    # is not cut in the middle
    parsed = [match.rsplit(' SCENE', 1) for match in matches]

    # locate every heading, searching forward from the previous one so that a
    # repeated heading resolves to its next occurrence, not always the first
    starts = []
    cursor = 0
    for heading, _scene_id in parsed:
        position = text.find(heading, cursor)
        if position == -1:
            # heading lies before the cursor (matches not in document order)
            position = text.find(heading)
        starts.append(position)
        if position != -1:
            cursor = position + len(heading)

    # a scene ends where the next located heading begins, whatever the order
    # of `matches`; the last scene runs to the end of the text
    boundaries = sorted({position for position in starts if position != -1})
    next_boundary = dict(zip(boundaries, boundaries[1:] + [len(text)]))

    # dictionary to store scene heading (key) and scene content (value)
    scenes = {}
    for (heading, scene_id), start_pos in zip(parsed, starts):
        if start_pos == -1:
            print(f"Scene heading not found in text, skipped: {heading}")
            continue
        end_pos = next_boundary[start_pos]
        scenes[f"{scene_id} {heading}"] = text[start_pos:end_pos].strip()

    print(f"Extracted {len(scenes)} scenes")  # debugging statement

    return scenes

In [None]:
# function to clean scene text

def clean_scene_text(scene_text):
    """Normalise the whitespace of a scene, line by line.

    Each line is stripped of leading and trailing whitespace and every
    remaining run of whitespace inside it is collapsed to a single space.
    Line breaks are kept (blank lines become empty lines) and the lines are
    re-joined with ``"\\n"``.

    Args:
        scene_text: raw text of one scene.

    Returns:
        str: the scene text with normalised whitespace.
    """
    normalised = []
    for raw_line in scene_text.splitlines():
        trimmed = raw_line.strip()
        normalised.append(re.sub(r'\s+', ' ', trimmed))
    return "\n".join(normalised)

In [None]:
# function to save scene separated text to new file

def save_scenes_to_file(file_path, scenes,
                        output_dir='data/screenplay_data/data/scene_separated_texts'):
    """Write the scenes of one screenplay to a scene-separated text file.

    The output file gets the same file name as ``file_path`` and is placed in
    ``output_dir`` (created if missing). It starts with a ``Scene count``
    line; each scene follows as a line of 50 ``=`` characters and the scene
    text cleaned with ``clean_scene_text``. Scene titles (the dict keys) are
    not written.

    Args:
        file_path: path of the original screenplay; only its base name is
            used.
        scenes: mapping of scene title to scene text, written in iteration
            order.
        output_dir: directory for the output file. Defaults to the project's
            ``scene_separated_texts`` folder.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    new_file_path = os.path.join(output_dir, os.path.basename(file_path))
    separator = "=" * 50  # 50 * = as scene separation

    # write scenes to new file
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(f"Scene count: {len(scenes)}\n\n")

        for scene_content in scenes.values():
            file.write(f"{separator}\n")
            file.write(f"{clean_scene_text(scene_content)}\n\n")

    print(f"Scene-separated screenplay saved to {new_file_path}")

In [None]:
# function that combines previous functions

def process_screenplay(file_path):
    """Run the full scene-splitting pipeline on a single screenplay file.

    The movie title is the part of the file name before the first
    underscore. The screenplay is read, its scene headings are identified,
    the scenes are extracted and the result is saved to a new file.

    Args:
        file_path: path of the screenplay text file.

    Returns:
        tuple: ``(title, heading_count)`` where ``heading_count`` is the
        number of identified scene headings, for use in the summary.
    """
    # everything before the first "_" in the file name is the movie title
    title, _, _ = os.path.basename(file_path).partition('_')

    screenplay_text = read_screenplay(file_path)
    headings = identify_scenes(screenplay_text, title)
    scene_map = extract_scenes(screenplay_text, headings)
    save_scenes_to_file(file_path, scene_map)

    return title, len(headings)

In [None]:
# function to process all screenplays in the raw_texts folder and create summary file

def process_all_screenplays(folder_path, summary_file_path):
    """Process every ``.txt`` screenplay in a folder and write a CSV summary.

    Files are handled in sorted name order. A file whose processing raises is
    reported and skipped so that one bad screenplay does not stop the batch.
    The summary CSV has a ``Title, Scene Count`` header followed by one row
    per successfully processed file; its parent directory is created if
    needed. A failure while writing the CSV is reported, not raised.

    Args:
        folder_path: directory containing the raw screenplay ``.txt`` files.
        summary_file_path: path of the CSV summary file to write.
    """
    summary = []

    # os.listdir gives entries in arbitrary order; sorting keeps the log and
    # the CSV rows reproducible between runs and machines
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        try:
            # process screenplay and get title and scene count
            title, scene_count = process_screenplay(file_path)
            summary.append((title, scene_count))
        except Exception as e:
            # deliberate best effort: report and continue with the next file
            print(f"Error processing file {filename}: {e}")

    print("Summary list:", summary)  # debugging addition

    try:
        # the target folder may not exist yet, e.g. when no screenplay was
        # processed successfully and nothing has created it
        summary_dir = os.path.dirname(summary_file_path)
        if summary_dir:
            os.makedirs(summary_dir, exist_ok=True)

        # save summary to csv
        with open(summary_file_path, 'w', encoding='utf-8', newline='') as summary_file:
            writer = csv.writer(summary_file)
            writer.writerow(['Title', 'Scene Count'])
            writer.writerows(summary)

        print(f"Summary of processed files saved to {summary_file_path}")
    except Exception as e:
        print(f"Error saving summary to CSV: {e}")

In [None]:
# run the whole pipeline: read every raw screenplay, write the scene-separated
# texts and store the per-movie summary CSV next to them

folder_path, summary_file_path = (
    'data/screenplay_data/data/raw_texts',
    'data/screenplay_data/data/scene_separated_texts/processed_files_summary.csv',
)
process_all_screenplays(folder_path=folder_path, summary_file_path=summary_file_path)