In [3]:
import re
import pandas as pd
import os
import csv
from charset_normalizer import from_path

# Function to read screenplay

def read_screenplay(file_path):
    """Read a screenplay text file using an automatically detected encoding.

    The encoding is guessed with ``charset_normalizer.from_path``. When the
    detector yields no candidate (``best()`` returns ``None``) the file is
    read as UTF-8 with undecodable bytes replaced, instead of failing with an
    ``AttributeError`` on ``None.encoding``.

    Args:
        file_path: Path of the screenplay file to read.

    Returns:
        The decoded contents of the file as one string.
    """
    best_guess = from_path(file_path).best()
    if best_guess is None:
        # No plausible encoding found: prefer a lossy read over aborting on this file.
        encoding, errors = 'utf-8', 'replace'
    else:
        encoding, errors = best_guess.encoding, 'strict'

    with open(file_path, 'r', encoding=encoding, errors=errors) as file:
        text = file.read()
    print("Screenplay text loaded.")
    return text

# Function to identify scenes using DataFrame
def identify_scenes(text, title, min_matches=150):
    """Find the lines of a screenplay that look like scene boundaries.

    Lines are classified in up to three passes:

    1. Lines containing ``EXT``/``INT`` headings, ``[Scene: ...]`` markers,
       ``FADE OUT`` or ``CUT TO``.
    2. If fewer than ``min_matches`` lines matched: still-unmatched lines made
       only of uppercase letters, digits, whitespace and ``: ( ) - . ,`` that
       contain at least three words.
    3. If still fewer than ``min_matches``: still-unmatched lines containing
       ``DISSOLVE`` or ``SMASH CUT``.

    Args:
        text: Full screenplay text.
        title: Screenplay title; only used in the progress message.
        min_matches: Match count below which the fallback passes run.

    Returns:
        A list with one string per matched line, in document order, formatted
        as ``"<line> SCENE<NNN>"`` where ``NNN`` is a 1-based, zero-padded
        counter. An empty list is returned for empty text.
    """
    # Characters accepted right after a keyword: ".", ":", whitespace, "-", en dash, ",".
    separator = r'[.\:\s\-\–\,]'

    def keyword(word):
        # \b keeps the keyword from matching inside a longer word
        # (e.g. "INT" inside "POINT." or "CLINT ").
        return re.compile(r'\b' + word + separator)

    uppercase_pattern = re.compile(r'^[A-Z0-9\s:\(\)\-\.\:\,]+$', re.MULTILINE)
    # NOTE(review): this only matches markers that end in ",]" -- confirm that the
    # comma before the closing bracket is intended.
    scene_pattern = re.compile(r'(?m)^\[Scene:?\s.*?\,\]$', re.MULTILINE)

    primary_rules = [
        (keyword('EXT'), 'EXT'),
        (keyword('INT'), 'INT'),
        (scene_pattern, 'SCENE'),
        (keyword('FADE OUT'), 'FADE OUT'),
        (keyword('CUT TO'), 'CUT TO'),
    ]
    # FADE OUT, CUT TO and SCENE are already covered by the first pass, so the
    # last pass only needs the two remaining transitions.
    fallback_rules = [
        (keyword('DISSOLVE'), 'DISSOLVE'),
        (keyword('SMASH CUT'), 'SMASH CUT'),
    ]

    def first_label(line, rules):
        # Label of the first rule whose pattern occurs in the line, else None.
        for pattern, label in rules:
            if pattern.search(line):
                return label
        return None

    def is_uppercase_heading(line):
        return bool(uppercase_pattern.match(line)) and len(line.split()) >= 3

    lines = text.splitlines()
    if not lines:
        # Nothing to scan; also avoids building and updating an empty DataFrame.
        print(f"{title}: 0 matches found")
        return []

    df = pd.DataFrame(lines, columns=['line'])

    # First pass: explicit headings and transitions.
    df['match'] = df['line'].apply(lambda line: first_label(line, primary_rules))

    # Second pass: all-uppercase lines, only for lines that are still unlabeled.
    if df['match'].count() < min_matches:
        uppercase_rows = df['line'].apply(is_uppercase_heading).astype(bool)
        df.loc[df['match'].isna() & uppercase_rows, 'match'] = 'UPPERCASE'

    # Third pass: remaining transition keywords, only for unlabeled lines.
    if df['match'].count() < min_matches:
        fallback = df['line'].apply(lambda line: first_label(line, fallback_rules))
        df['match'] = df['match'].where(df['match'].notna(), fallback)

    # Number the matched lines in document order.
    headings = df.loc[df['match'].notna(), 'line']
    matches = [
        f"{line} SCENE{number:03d}"
        for number, line in enumerate(headings, start=1)
    ]

    print(f"{title}: {len(matches)} matches found")

    return matches

# Function to extract scenes
def extract_scenes(text, matches):
    """Cut the screenplay into scenes at the given heading lines.

    Each entry of ``matches`` has the form ``"<heading line> SCENE<NNN>"``.
    Headings are located in ``text`` strictly in order: the search for each
    heading starts right after the previous one, so a heading that occurs
    several times (e.g. the same location used twice) resolves to the correct
    occurrence. A scene runs from its heading up to the next heading, or to
    the end of the text for the last one.

    Args:
        text: Full screenplay text.
        matches: Heading strings in document order.

    Returns:
        A dict mapping ``"<NNN> <heading line>"`` to the stripped scene text,
        in the order of ``matches``.
    """
    located = []  # (dict key, start offset) per heading, in order
    cursor = 0
    for position, entry in enumerate(matches, start=1):
        # Split on the LAST " SCENE" so headings that contain the word
        # themselves ("INT. CRIME SCENE - DAY") are kept intact.
        heading, marker, scene_id = entry.rpartition(' SCENE')
        if not marker:
            # Entry without a numbering suffix: use it whole and number by position.
            heading, scene_id = entry, f"{position:03d}"

        start_pos = text.find(heading, cursor)
        if start_pos == -1:
            # Heading not found after the previous one: anchor it at the cursor
            # instead of letting -1 be used as a slice index.
            start_pos = cursor
        else:
            cursor = start_pos + len(heading)

        located.append((f"{scene_id} {heading}", start_pos))

    scenes = {}
    for index, (scene_key, start_pos) in enumerate(located):
        if index + 1 < len(located):
            end_pos = located[index + 1][1]
        else:
            end_pos = len(text)
        scenes[scene_key] = text[start_pos:end_pos].strip()

    print(f"Extracted {len(scenes)} scenes")

    return scenes

# Function to clean scene text
def clean_scene_text(scene_text):
    """Normalize the whitespace of a scene while keeping its line structure.

    Every line is stripped of leading and trailing whitespace and each inner
    run of whitespace is collapsed to a single space. Lines are then rejoined
    with ``"\\n"``; blank lines are kept as empty lines.

    Args:
        scene_text: Raw text of one scene.

    Returns:
        The scene text with normalized whitespace.
    """
    normalized = []
    for raw_line in scene_text.splitlines():
        trimmed = raw_line.strip()
        normalized.append(re.sub(r'\s+', ' ', trimmed))
    return "\n".join(normalized)

# Function to save scene separated text to new file
def save_scenes_to_file(file_path, scenes, output_dir='data/screenplay_data/data/scene_separated_texts'):
    """Write the scenes of one screenplay to a scene-separated text file.

    The output file gets the same base name as ``file_path`` and is placed in
    ``output_dir``, which is created if missing. The file starts with a
    ``Scene count`` line; each scene then follows a 50-character ``=`` rule,
    with its whitespace normalized by ``clean_scene_text``.

    Args:
        file_path: Path of the source screenplay; only its base name is used.
        scenes: Mapping of scene key to scene text; written in iteration order.
        output_dir: Directory that receives the output file.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_dir, exist_ok=True)

    new_file_path = os.path.join(output_dir, os.path.basename(file_path))

    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(f"Scene count: {len(scenes)}\n\n")

        # The scene keys are not written; the heading is already the first line of each scene.
        for scene_content in scenes.values():
            cleaned_scene_content = clean_scene_text(scene_content)
            file.write("=" * 50 + "\n")
            file.write(f"{cleaned_scene_content}\n\n")

    print(f"Scene-separated screenplay saved to {new_file_path}")

# Function that combines previous functions
def process_screenplay(file_path):
    """Run the whole scene-separation pipeline on a single screenplay file.

    The title is the part of the file's base name before the first
    underscore. The file is read, its scene headings are identified, the
    scenes are extracted and then written to the scene-separated output file.

    Args:
        file_path: Path of the screenplay text file.

    Returns:
        A ``(title, heading_count)`` tuple, where ``heading_count`` is the
        number of scene headings that were identified.
    """
    base_name = os.path.basename(file_path)
    title, _, _ = base_name.partition('_')

    screenplay_text = read_screenplay(file_path)
    headings = identify_scenes(screenplay_text, title)
    separated = extract_scenes(screenplay_text, headings)
    save_scenes_to_file(file_path, separated)

    return title, len(headings)

# Function to process all screenplays in the raw_texts folder and create summary file
def process_all_screenplays(folder_path, summary_file_path):
    """Process every ``.txt`` screenplay in a folder and write a summary CSV.

    Files are handled in sorted name order so that the log and the summary
    are reproducible. A file that fails is reported and skipped so the rest
    of the batch still runs. The summary CSV has a ``Title, Scene Count``
    header followed by one row per successfully processed file.

    Args:
        folder_path: Directory containing the raw screenplay ``.txt`` files.
        summary_file_path: Path of the CSV summary file to write.
    """
    summary = []

    # os.listdir returns entries in arbitrary order; sort for a stable result.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        try:
            title, scene_count = process_screenplay(file_path)
            summary.append((title, scene_count))
        except Exception as e:
            # Deliberate best effort: one bad file must not stop the batch.
            print(f"Error processing file {filename}: {e}")

    print("Summary list:", summary)

    try:
        # The target directory is otherwise only created as a side effect of
        # saving a scene file, so make sure it exists before writing.
        summary_dir = os.path.dirname(summary_file_path)
        if summary_dir:
            os.makedirs(summary_dir, exist_ok=True)

        with open(summary_file_path, 'w', encoding='utf-8', newline='') as summary_file:
            writer = csv.writer(summary_file)
            writer.writerow(['Title', 'Scene Count'])
            writer.writerows(summary)

        print(f"Summary of processed files saved to {summary_file_path}")
    except Exception as e:
        print(f"Error saving summary to CSV: {e}")

In [4]:
# Run the full pipeline: process every .txt screenplay found in folder_path and
# write the per-title scene counts to the CSV at summary_file_path.
# NOTE(review): both paths are relative, so they resolve against the kernel's
# current working directory.
folder_path = 'data/screenplay_data/data/raw_texts/raw_texts'
summary_file_path = 'data/screenplay_data/data/scene_separated_texts/processed_files_summary.csv'
process_all_screenplays(folder_path, summary_file_path)

Processing file: 10 Cloverfield Lane_1179933.txt
Screenplay text loaded.
10 Cloverfield Lane: 133 matches found
Extracted 133 scenes
Scene-separated screenplay saved to data/screenplay_data/data/scene_separated_texts\10 Cloverfield Lane_1179933.txt
Processing file: 10 Things I Hate About You_0147800.txt
Screenplay text loaded.
10 Things I Hate About You: 122 matches found
Extracted 122 scenes
Scene-separated screenplay saved to data/screenplay_data/data/scene_separated_texts\10 Things I Hate About You_0147800.txt
Processing file: 12 Monkeys_0114746.txt
Screenplay text loaded.
12 Monkeys: 176 matches found
Extracted 176 scenes
Scene-separated screenplay saved to data/screenplay_data/data/scene_separated_texts\12 Monkeys_0114746.txt
Processing file: 12 Years a Slave_2024544.txt
Screenplay text loaded.
12 Years a Slave: 155 matches found
Extracted 155 scenes
Scene-separated screenplay saved to data/screenplay_data/data/scene_separated_texts\12 Years a Slave_2024544.txt
Processing file: 12