In [3]:
import re
import pandas as pd
import networkx as nx
from charset_normalizer import from_path

In [4]:
def calculate_screenplay_metrics(file_path):
    try:
        # Load the screenplay
        result = from_path(file_path).best()
        with open(file_path, 'r', encoding=result.encoding) as file:
            screenplay = file.read()
    
        # Adjust the regex pattern to better capture character dialogues
        character_dialogue_pattern = re.compile(r'\n\s*([A-Z][A-Z\s]+)\s*\n\s*([^\n]+)')
        dialogues = character_dialogue_pattern.findall(screenplay)

        # Convert to DataFrame
        dialogue_df = pd.DataFrame(dialogues, columns=['Character', 'Dialogue'])

        # Filter out non-character entries from dialogues
        character_name_pattern = re.compile(r'\n\s*([A-Z][A-Z\s]+)\s*\n')
        potential_characters = character_name_pattern.findall(screenplay)
        character_counts = pd.Series(potential_characters).value_counts()
        character_threshold = 5  # Adjust this threshold as needed
        characters = character_counts[character_counts > character_threshold].index.tolist()
        dialogue_df = dialogue_df[dialogue_df['Character'].isin(characters)]

        # Create an interaction matrix for all characters
        all_characters = dialogue_df['Character'].unique()
        interaction_matrix_all = pd.DataFrame(0, index=all_characters, columns=all_characters)

        # Populate the interaction matrix by considering adjacent dialogues
        for i in range(len(dialogue_df) - 1):
            char1 = dialogue_df.iloc[i]['Character']
            char2 = dialogue_df.iloc[i + 1]['Character']
            if char1 != char2:
                interaction_matrix_all.loc[char1, char2] += 1
                interaction_matrix_all.loc[char2, char1] += 1

        # Create a NetworkX graph from the interaction matrix
        G_all = nx.from_pandas_adjacency(interaction_matrix_all)

        # Calculate degree centrality
        degree_centrality = nx.degree_centrality(G_all)
        average_degree_centrality = sum(degree_centrality.values()) / len(degree_centrality)

        # Calculate closeness centrality
        closeness_centrality = nx.closeness_centrality(G_all)
        average_closeness_centrality = sum(closeness_centrality.values()) / len(closeness_centrality)

        # Calculate betweenness centrality
        betweenness_centrality = nx.betweenness_centrality(G_all)
        average_betweenness_centrality = sum(betweenness_centrality.values()) / len(betweenness_centrality)

        # Interaction diversity: number of unique characters each character interacts with
        interaction_diversity = (interaction_matrix_all > 0).sum(axis=1)
        average_interaction_diversity = interaction_diversity.mean()

        # Normalized interaction coefficient (by total number of interactions)
        total_interactions = interaction_matrix_all.sum().sum()
        normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))

        # Create a DataFrame to store these coefficients
        screenplay_metrics = {
            'average_degree_centrality': average_degree_centrality,
            'average_closeness_centrality': average_closeness_centrality,
            'average_betweenness_centrality': average_betweenness_centrality,
            'average_interaction_diversity': average_interaction_diversity,
            'normalized_interaction_coefficient': normalized_interaction_coefficient
        }

    except ZeroDivisionError:
        print(f"ZeroDivisionError for file: {file_path}")
        screenplay_metrics = {
            'average_degree_centrality': 0,
            'average_closeness_centrality': 0,
            'average_betweenness_centrality': 0,
            'average_interaction_diversity': 0,
            'normalized_interaction_coefficient': 0
        }
    
    return screenplay_metrics

In [5]:
import os
import pandas as pd

# Read csv file
df = pd.read_csv('data/01_movie_metadata.csv')

# Folder containing screenplay files
screenplay_folder = 'data/screenplay_data/data/raw_texts/raw_texts'

# Get all .txt files in folder
files = [f for f in os.listdir(screenplay_folder) if f.endswith('.txt')]

# Initialize dictionaries to store metrics for each screenplay
metrics_dict = {
    'average_degree_centrality': {},
    'average_closeness_centrality': {},
    'average_betweenness_centrality': {},
    'average_interaction_diversity': {},
    'normalized_interaction_coefficient': {}
}

In [6]:
df.head()

Unnamed: 0,imdbid,title,year,age_rating,genre,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross,...,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_sport,genre_thriller,genre_war,genre_western
0,120770,A Night at the Roxbury,1998,PG-13,"Comedy, Music, Romance",John Fortenberry,82.0,17000000.0,30331165.0,30331160.0,...,0,1,0,0,1,0,0,0,0,0
1,132512,At First Sight,1999,PG-13,"Drama, Romance",Irwin Winkler,128.0,40000000.0,22365133.0,22365130.0,...,0,0,0,0,1,0,0,0,0,0
2,118661,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Jeremiah S. Chechik,143.0,225000000.0,623357910.0,1515100000.0,...,0,0,0,0,0,1,0,0,0,0
3,118715,The Big Lebowski,1998,R,"Comedy, Crime","Joel Coen, \nEthan Coen",117.0,15000000.0,17498804.0,46189570.0,...,0,0,0,0,0,0,0,0,0,0
4,112573,Braveheart,1995,R,"Biography, Drama, History",Mel Gibson,178.0,72000000.0,75545647.0,209045200.0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
for filename in files:
    file_path = os.path.join(screenplay_folder, filename)

    # Check if filename matches pattern
    match = re.search(r'_0*(\d+)\.txt$', filename)
    if match:
        # Extract imdbid from filename
        imdbid = int(match.group(1))

        # Calculate metrics for the screenplay
        metrics = calculate_screenplay_metrics(file_path)

        # Store metrics in dictionary
        for key, value in metrics.items():
            metrics_dict[key][imdbid] = value

# Add new columns to the dataframe
for key in metrics_dict:
    df[key] = df['imdbid'].map(metrics_dict[key])

ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\29th Street Highlander 2 The Quickening Billy Bathgate Year of the Gun_6099126.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Aladdin_0103639.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\All Is Lost_2017038.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\An American Tragedy_0021607.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Anchorman The Legend of Ron Burgundy_0357413.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Armageddon_0120591.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Battleship Potemkin_0015648.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Benny Joon_0106387.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Blue Jasmine_2334873.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Bram Stoker s Dracula_0103874.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Breakin_0086998.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Bringing Ashley Home_1765730.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Charlie Chan Carries On_0021733.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Charlie Chan s Chance_0022755.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Charlie Chan s Courage_0024969.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Charlie Chan s Greatest Case_0023881.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Count Dracula_0065569.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Dawn of the Dead_0077402.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Enter the Void_1191111.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Every I Know What You Did Last Summer Movie Ranked_13256474.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Everybody Comes to Nick s_1218598.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Fantasia 2000_0120910.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Fletch_0089155.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Flightplan_0408790.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Fogg s Millions_0221183.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Friday the 13th The Final Chapter_0087298.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Frost Portrait of a Vampire_0212277.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Gentleman in Mufti_0258525.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Going for the Gold The Bill Johnson Story_0089212.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Gone with the Wind_0031381.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Halloween_0296665.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Halo_2934286.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Happy End_5304464.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Highway_0165361.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Horror of Dracula_0051554.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Hot romance_6048960.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Inside Llewyn Davis_2042568.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Johnny M

  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Legend_0089469.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\London After Midnight_0018097.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Moneyball_3024352.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Mulan_0120762.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Napoleon Dynamite_0374900.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\National Lampoon s Vacation_0085995.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Nightbreed_0100260.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Nightcrawler_1964955.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Norbit_0477051.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Now and Then_0114011.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\One Eight Seven_0118531.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Poetic Justice_0107840.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Pok mon Mewtwo Returns_0304564.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Practical Magic_0120791.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Pride and Prejudice_1332374.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Pump Up the Volume_0100436.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Pygmalion_0030637.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Romy and Michele s High School Reunion_0120032.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Saboteur_0035279.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Sabrina_0047437.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Shock Treatment_0083067.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Sidewalks of New York_0164167.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Singin in the Rain_0045152.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Southgate to Brighton_7547872.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Spaceballs_0094012.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Spitfire_5913184.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Spring Breakers_2101441.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Star Wars Ep 1 The Phantom Menace w Ryan Paul from Cold Callers Comedy Podcast_12764338.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Star Wars Episode I The Phantom Menace_0120915.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Star Wars Episode III Revenge of the Sith_0121766.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Strange Brew_0086373.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Stranger Than Paradise_0088184.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Streetwise_0088196.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Suburbicon_0491175.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Sweeney Todd_0479760.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Tarzan

  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Messenger_0790712.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The NeverEnding Story_0088323.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Ninth Gate_0142688.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Old Dark House_0057379.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Passion of Joan of Arc_0019254.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Peanut Butter Falcon_4364194.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Phantom Menace_6892482.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Phantom of the Opera_0293508.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Reason We Are Not Detectives_4408712.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Red Turtle_3666024.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Rescuers Down Under_0100477.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Road_0898367.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Spectacular Now_1714206.txt


  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Thin Man_0025878.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Thin Man_0820597.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Thomas Crown Affair_0155267.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The Treasure of the Sierra Madre_0040897.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\The White Ribbon_1149362.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Things to Come_0028358.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\This Is Spinal Tap_0088258.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Three Wishes_0310227.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\To Catch a Thief_0048728.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_text

  normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))


ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Witchery_0096453.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Withnail I_0094336.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Wonderland_0335563.txt
ZeroDivisionError for file: data/screenplay_data/data/raw_texts/raw_texts\Working Girl_0096463.txt


In [8]:
print("Updated DataFrame with metrics:")
print(df.head())

Updated DataFrame with metrics:
   imdbid                   title  year age_rating                      genre  \
0  120770  A Night at the Roxbury  1998      PG-13     Comedy, Music, Romance   
1  132512          At First Sight  1999      PG-13             Drama, Romance   
2  118661            The Avengers  1998      PG-13  Action, Adventure, Sci-Fi   
3  118715        The Big Lebowski  1998          R              Comedy, Crime   
4  112573              Braveheart  1995          R  Biography, Drama, History   

                  director  runtime_minutes  production_budget  \
0         John Fortenberry             82.0         17000000.0   
1            Irwin Winkler            128.0         40000000.0   
2      Jeremiah S. Chechik            143.0        225000000.0   
3  Joel Coen, \nEthan Coen            117.0         15000000.0   
4               Mel Gibson            178.0         72000000.0   

   domestic_gross  worldwide_gross  ...  genre_sci-fi  genre_sport  \
0      3033116

In [9]:
df = df[df['average_degree_centrality'] != 0]

In [10]:
df.shape

(1213, 39)

In [11]:
# Save the updated dataframe
df.to_csv('data/01_movie_metadata.csv', index=False)