In [1]:
import os
import re
import numpy as np
import pandas as pd
from charset_normalizer import from_path

In [2]:
# function to extract the number of words in each scene of a screenplay file
def extract_scene_lengths(file_path):
    """Return the word count of every non-blank scene in a screenplay file.

    The encoding is detected with charset_normalizer. The first line of the
    file is discarded (presumably a header -- confirm against the data), the
    remaining text is split on a separator of 50 '=' characters, and each
    non-blank chunk is counted as whitespace-separated words.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the text file to read.

    Returns
    -------
    list of int
        One word count per non-blank scene, in file order. Empty when the
        file holds no non-blank text after its first line.
    """
    # best() returns None when no plausible encoding was detected; fall back
    # to UTF-8 with replacement characters instead of failing with
    # AttributeError on `None.encoding`.
    result = from_path(file_path).best()
    if result is None:
        encoding, errors = 'utf-8', 'replace'
    else:
        encoding, errors = result.encoding, 'strict'

    with open(file_path, 'r', encoding=encoding, errors=errors) as file:
        lines = file.readlines()

    # skip the first line, then cut the remaining text at the scene separators
    text = ''.join(lines[1:])
    scenes = text.split('=' * 50)

    # chunks that are empty or whitespace-only are not counted as scenes
    scene_lengths = [len(scene.split()) for scene in scenes if scene.strip()]
    return scene_lengths

# function to summarise scene lengths by their mean and standard deviation
def analyze_scene_lengths(scene_lengths):
    """Return ``(mean, standard deviation)`` of the given scene lengths.

    Both values come from NumPy (``np.mean`` and ``np.std``); ``np.std`` is
    called with its defaults, i.e. the population standard deviation.
    """
    return np.mean(scene_lengths), np.std(scene_lengths)

# function to calculate the coefficient of variation
def coherence_classifier(mean_length, std_length):
    """Return the coefficient of variation, ``std_length / mean_length``."""
    return std_length / mean_length

# function to process all screenplays in a folder
def process_scene_lengths(folder_path):
    """Compute the scene-length coefficient of variation per screenplay.

    Only files whose name ends in ``_<digits>.txt`` are considered; the digits
    (leading zeros dropped) are converted to an int and used as the key.
    Files that yield no scenes are left out of the result.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Folder containing the screenplay ``.txt`` files.

    Returns
    -------
    dict
        Maps the integer id parsed from each file name to that file's
        coefficient of variation of scene lengths.
    """
    coefficients_of_variation = {}

    # os.listdir returns names in arbitrary order; sorting makes the run
    # reproducible, including which file wins if two names share an id
    files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))

    for filename in files:
        # check the name first so files that cannot contribute to the result
        # are never read or decoded
        match = re.search(r'_0*(\d+)\.txt$', filename)
        if not match:
            continue
        imdbid = int(match.group(1))

        # extract scene lengths
        file_path = os.path.join(folder_path, filename)
        scene_lengths = extract_scene_lengths(file_path)
        if not scene_lengths:
            # no scenes means no statistics to compute
            continue

        # analyze scene lengths
        mean_length, std_length = analyze_scene_lengths(scene_lengths)
        coefficients_of_variation[imdbid] = coherence_classifier(mean_length, std_length)

    return coefficients_of_variation

In [3]:
# load the movie metadata
df = pd.read_csv('data/01_movie_metadata.csv')

# the scene-separated screenplay texts live in this folder
screenplay_folder = 'data/screenplay_data/data/scene_separated_texts'

# coefficient of variation of scene length for each screenplay, keyed by id
coefficients_of_variation = process_scene_lengths(screenplay_folder)

# attach the coefficient to each movie via its imdbid; ids missing from the
# mapping end up as NaN
df = df.assign(scene_length_cv=df['imdbid'].map(coefficients_of_variation))

In [4]:
# number of missing values in each column
df.isna().sum(axis=0)

imdbid                                0
title                                 0
year                                  0
age_rating                            0
genre                                 0
description                           0
director                              0
runtime_minutes                       0
production_budget                     0
domestic_gross                        0
worldwide_gross                       0
financial_success                     0
ROI                                   0
age_rating_number                     0
genre_action                          0
genre_adventure                       0
genre_animation                       0
genre_biography                       0
genre_comedy                          0
genre_crime                           0
genre_drama                           0
genre_family                          0
genre_fantasy                         0
genre_film-noir                       0
genre_history                         0


In [5]:
# keep only the rows that received a scene-length coefficient of variation
df = df.loc[df['scene_length_cv'].notna()].copy()
df.shape

(1211, 41)

In [6]:
# write the filtered metadata, without the index column, to the next-stage CSV
df.to_csv(path_or_buf='data/02_movie_metadata.csv', index=False)