In [None]:
import csv
import os
import re

import chardet

In [None]:
# function to detect file encoding

def detect_encoding(file_path):
    """Guess the text encoding of a file with chardet.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` when chardet
        reports no encoding or reports ASCII. The decision is also printed.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()
    detected = chardet.detect(payload).get('encoding')

    # Guard clauses: both "nothing detected" and "plain ASCII" fall back to UTF-8.
    if detected is None:
        print("No encoding detected, defaulting to utf-8")
        return 'utf-8'
    if detected.lower() == 'ascii':
        print("Detected ASCII encoding, defaulting to utf-8")
        return 'utf-8'

    print(f"Detected encoding: {detected}")
    return detected

In [None]:
# function to read screenplay

def read_screenplay(file_path):
    """Load a screenplay file as text using its detected encoding.

    Args:
        file_path: Path of the screenplay file.

    Returns:
        The decoded file contents. Bytes that cannot be decoded with the
        chosen encoding are dropped (``errors='ignore'``).
    """
    detected = detect_encoding(file_path)
    # ASCII is a subset of UTF-8, so an ASCII verdict is read as UTF-8.
    codec = 'utf-8' if detected.lower() == 'ascii' else detected

    with open(file_path, 'r', encoding=codec, errors='ignore') as handle:
        contents = handle.read()

    print("Screenplay text loaded.")
    return contents

In [None]:
# function to identify scenes in screenplay

def identify_scenes(text, title, min_heading_matches=30):
    """Collect the lines of a screenplay that look like scene headings.

    A line is a heading when it contains ``INT`` or ``EXT`` (as a whole
    word start) immediately followed by ``.`` or ``:``. If fewer than
    ``min_heading_matches`` such lines exist, lines made up only of uppercase
    letters, digits, whitespace, ``:``, ``(``, ``)`` and ``-`` that contain
    at least three words are accepted as headings as well.

    Args:
        text: Full screenplay text.
        title: Label used only in the printed match count.
        min_heading_matches: Minimum number of INT/EXT headings required to
            skip the uppercase-line fallback. Defaults to 30.

    Returns:
        List of matching lines in the order they occur in ``text``; each
        line of the text appears at most once per occurrence.
    """
    # INT or EXT followed by "." or ":". Inside a character class "|" is a
    # literal, so the set must be [.:] and not [.|:].
    slugline_pattern = re.compile(r'\b(?:EXT|INT)[.:]', re.MULTILINE)
    # lines with only uppercase letters, numbers and a few symbols
    uppercase_pattern = re.compile(r'^[A-Z0-9\s:\(\)\-]+$', re.MULTILINE)

    lines = text.splitlines()

    matches = [line for line in lines if slugline_pattern.search(line)]

    # Too few INT/EXT headings: also accept uppercase lines of >= 3 words.
    # The list is rebuilt in a single pass so that headings stay in document
    # order (scene extraction walks them sequentially) and a line that
    # satisfies both rules is only recorded once.
    if len(matches) < min_heading_matches:
        matches = [
            line for line in lines
            if slugline_pattern.search(line)
            or (uppercase_pattern.match(line) and len(line.split()) >= 3)
        ]

    # print the title and number of matches found
    print(f"{title}: {len(matches)} matches found")

    return matches

In [None]:
# function to extract scenes

def extract_scenes(text, matches):
    """Slice a screenplay into scenes, one per heading.

    Each heading is located in ``text`` starting after the previously located
    heading, so a heading that occurs several times (e.g. a recurring
    location) resolves to successive occurrences instead of always the
    first one. A scene runs from its heading up to the next located heading,
    or to the end of the text for the last one.

    Args:
        text: Full screenplay text.
        matches: Heading strings, expected in document order.

    Returns:
        Dict mapping heading -> stripped scene text, inserted in document
        order. When the same heading starts more than one scene, later
        scenes are stored under ``"<heading> (2)"``, ``"<heading> (3)"``, ...
        so that no scene is overwritten. Headings that do not occur in
        ``text`` are skipped.
    """
    # start offset -> heading; keyed by offset so one text position can only
    # ever open one scene
    starts = {}
    cursor = 0
    for heading in matches:
        pos = text.find(heading, cursor)
        if pos == -1:
            # Not found after the previous heading: the list is not strictly
            # in document order, so retry from the beginning of the text.
            pos = text.find(heading)
        if pos == -1:
            continue  # heading is not part of the text at all
        starts.setdefault(pos, heading)
        cursor = pos + len(heading)

    # Sorting by offset makes the scene boundaries correct even if the
    # headings were supplied out of order.
    ordered = sorted(starts.items())

    scenes = {}
    for index, (start_pos, heading) in enumerate(ordered):
        if index + 1 < len(ordered):
            end_pos = ordered[index + 1][0]
        else:
            end_pos = len(text)

        # keep repeated headings as separate entries
        key = heading
        occurrence = 2
        while key in scenes:
            key = f"{heading} ({occurrence})"
            occurrence += 1

        scenes[key] = text[start_pos:end_pos].strip()

    return scenes

In [None]:
# function to clean scene text

def clean_scene_text(scene_text):
    """Normalise the whitespace of a scene, line by line.

    Every line is stripped of leading/trailing whitespace and each internal
    run of whitespace is collapsed to a single space. Lines are then joined
    back together with ``"\\n"``.

    Args:
        scene_text: Raw text of one scene.

    Returns:
        The whitespace-normalised scene text.
    """
    normalised = []
    for raw_line in scene_text.splitlines():
        trimmed = raw_line.strip()
        normalised.append(re.sub(r'\s+', ' ', trimmed))
    return "\n".join(normalised)

In [None]:
# function to save scene separated text to new file

def save_scenes_to_file(file_path, scenes,
                        output_dir='data/screenplay_data/data/scene_separated_texts'):
    """Write the scenes of one screenplay to a scene-separated text file.

    The output file has the same base name as ``file_path`` and is placed in
    ``output_dir``. It starts with a ``Scene count: N`` line, followed by
    each cleaned scene preceded by a line of 50 ``=`` characters.

    Args:
        file_path: Path of the source screenplay; only its base name is used.
        scenes: Dict mapping scene heading -> scene text. Only the values are
            written, in the dict's iteration order.
        output_dir: Directory for the output file; created if missing.
            Defaults to the notebook's scene_separated_texts folder.
    """
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    new_file_path = os.path.join(output_dir, os.path.basename(file_path))

    # write scenes to the new file
    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(f"Scene count: {len(scenes)}\n\n")

        for scene_content in scenes.values():
            cleaned_scene_content = clean_scene_text(scene_content)
            file.write("=" * 50 + "\n")  # 50 * = as scene separation
            file.write(f"{cleaned_scene_content}\n\n")

    print(f"Scene-separated screenplay saved to {new_file_path}")

In [None]:
# function that combines previous functions

def process_screenplay(file_path):
    """Run the full scene-separation pipeline on one screenplay file.

    Reads the file, identifies scene headings, slices the text into scenes
    and saves the scene-separated version.

    Args:
        file_path: Path of the raw screenplay file.

    Returns:
        Tuple ``(title, scene_count)`` where ``title`` is the part of the
        file's base name before the first underscore and ``scene_count`` is
        the number of extracted scenes.
    """
    # everything before the first "_" in the file name is the movie title
    title, _, _ = os.path.basename(file_path).partition('_')

    screenplay_text = read_screenplay(file_path)
    headings = identify_scenes(screenplay_text, title)
    scene_map = extract_scenes(screenplay_text, headings)

    # persist the scene-separated text
    save_scenes_to_file(file_path, scene_map)

    scene_total = len(scene_map)
    return title, scene_total

In [None]:
# function to process all screenplays in the raw_texts folder and create summary file

def process_all_screenplays(folder_path, summary_file_path):
    """Process every ``.txt`` screenplay in a folder and write a CSV summary.

    Each file whose name ends in ``.txt`` is passed to ``process_screenplay``.
    The resulting ``(title, scene_count)`` pairs are written to
    ``summary_file_path`` under a ``Title,Scene Count`` header row.

    Args:
        folder_path: Directory containing the raw screenplay files.
        summary_file_path: Destination path of the CSV summary. Its parent
            directory is created if it does not exist.
    """
    # (title, scene_count) rows for the summary
    summary = []

    # os.listdir returns entries in arbitrary order; sort so the summary is
    # reproducible from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        # process screenplay and get title and scene count
        title, scene_count = process_screenplay(file_path)
        summary.append((title, scene_count))

    # The summary folder is otherwise only created as a side effect of saving
    # a screenplay, so make sure it exists even when nothing was processed.
    summary_dir = os.path.dirname(summary_file_path)
    if summary_dir:
        os.makedirs(summary_dir, exist_ok=True)

    # save summary to csv file (newline='' as required by the csv module)
    with open(summary_file_path, 'w', encoding='utf-8', newline='') as summary_file:
        writer = csv.writer(summary_file)
        writer.writerow(['Title', 'Scene Count'])
        writer.writerows(summary)

    print(f"Summary of processed files saved to {summary_file_path}")

In [None]:
# Run the whole pipeline: process every .txt screenplay found in folder_path
# and write the per-title scene counts to summary_file_path.
# Both paths are relative, so they resolve against the current working directory.

# input folder that is scanned for raw screenplay .txt files
folder_path = 'data/screenplay_data/data/raw_texts/raw_texts'
# output CSV with a 'Title' / 'Scene Count' row per processed screenplay
summary_file_path = 'data/screenplay_data/data/scene_separated_texts/processed_files_summary.csv'
process_all_screenplays(folder_path, summary_file_path)