In [None]:
import re
import pandas as pd
import networkx as nx
from charset_normalizer import from_path

In [None]:
def calculate_screenplay_metrics(file_path, character_threshold=5):
    """Compute averaged character-interaction network metrics for one screenplay.

    The text is scanned for an upper-case line followed by a line of text; each
    such pair is treated as (character name, dialogue line). Names that occur
    more than ``character_threshold`` times are kept. Two consecutive kept
    dialogue lines spoken by different characters count as one interaction
    between them. The resulting symmetric interaction matrix is converted to a
    NetworkX graph from which the centrality averages are taken.

    Parameters
    ----------
    file_path : str or path-like
        Location of the screenplay text file.
    character_threshold : int, default 5
        A name must appear strictly more than this many times (as an
        upper-case line of its own) to be treated as a character.

    Returns
    -------
    dict
        Keys ``average_degree_centrality``, ``average_closeness_centrality``,
        ``average_betweenness_centrality``, ``average_interaction_diversity``
        and ``normalized_interaction_coefficient``. Every value is 0 when
        fewer than two characters pass the threshold, because no interaction
        network can be built in that case.
    """
    zero_metrics = {
        'average_degree_centrality': 0,
        'average_closeness_centrality': 0,
        'average_betweenness_centrality': 0,
        'average_interaction_diversity': 0,
        'normalized_interaction_coefficient': 0
    }

    try:
        # Detect the file encoding. best() returns None when no candidate
        # encoding fits, so fall back to UTF-8 instead of failing on `.encoding`.
        best_guess = from_path(file_path).best()
        encoding = best_guess.encoding if best_guess is not None else 'utf-8'
        # errors='replace' keeps a stray undecodable byte from aborting the run
        with open(file_path, 'r', encoding=encoding, errors='replace') as file:
            screenplay = file.read()

        # (name, first following line) pairs. The name group allows \s, so a
        # raw capture can carry trailing spaces or line breaks; collapsing the
        # whitespace makes "JOHN" and "JOHN  " the same character.
        character_dialogue_pattern = re.compile(r'\n\s*([A-Z][A-Z\s]+)\s*\n\s*([^\n]+)')
        dialogues = [
            (' '.join(name.split()), line)
            for name, line in character_dialogue_pattern.findall(screenplay)
        ]
        dialogue_df = pd.DataFrame(dialogues, columns=['Character', 'Dialogue'])

        # Count every upper-case line (normalized the same way) and keep only
        # the names frequent enough to be real characters rather than headings
        character_name_pattern = re.compile(r'\n\s*([A-Z][A-Z\s]+)\s*\n')
        potential_characters = [
            ' '.join(name.split())
            for name in character_name_pattern.findall(screenplay)
        ]
        character_counts = pd.Series(potential_characters, dtype=object).value_counts()
        characters = character_counts[character_counts > character_threshold].index.tolist()
        dialogue_df = dialogue_df[dialogue_df['Character'].isin(characters)]

        all_characters = dialogue_df['Character'].unique()

        # With zero characters the averages below divide by zero; with exactly
        # one, the n * (n - 1) denominator is 0 and the NumPy division yields
        # nan instead of raising. Both cases get the all-zero result.
        if len(all_characters) < 2:
            print(f"Fewer than two recurring characters for file: {file_path}")
            return zero_metrics

        # Symmetric count matrix: one increment in both directions whenever two
        # different characters speak on consecutive kept dialogue lines
        interaction_matrix_all = pd.DataFrame(0, index=all_characters, columns=all_characters)
        speakers = dialogue_df['Character'].tolist()
        for char1, char2 in zip(speakers, speakers[1:]):
            if char1 != char2:
                interaction_matrix_all.at[char1, char2] += 1
                interaction_matrix_all.at[char2, char1] += 1

        # Build the graph and average each centrality over all characters
        G_all = nx.from_pandas_adjacency(interaction_matrix_all)

        degree_centrality = nx.degree_centrality(G_all)
        average_degree_centrality = sum(degree_centrality.values()) / len(degree_centrality)

        closeness_centrality = nx.closeness_centrality(G_all)
        average_closeness_centrality = sum(closeness_centrality.values()) / len(closeness_centrality)

        betweenness_centrality = nx.betweenness_centrality(G_all)
        average_betweenness_centrality = sum(betweenness_centrality.values()) / len(betweenness_centrality)

        # Interaction diversity: number of distinct partners per character
        interaction_diversity = (interaction_matrix_all > 0).sum(axis=1)
        average_interaction_diversity = interaction_diversity.mean()

        # Sum of the whole symmetric matrix (each interaction appears twice)
        # divided by the number of ordered character pairs
        total_interactions = interaction_matrix_all.sum().sum()
        normalized_interaction_coefficient = total_interactions / (len(all_characters) * (len(all_characters) - 1))

        return {
            'average_degree_centrality': average_degree_centrality,
            'average_closeness_centrality': average_closeness_centrality,
            'average_betweenness_centrality': average_betweenness_centrality,
            'average_interaction_diversity': average_interaction_diversity,
            'normalized_interaction_coefficient': normalized_interaction_coefficient
        }

    except ZeroDivisionError:
        # Safety net kept from the original implementation
        print(f"ZeroDivisionError for file: {file_path}")
        return zero_metrics

In [None]:
import os
import pandas as pd

# Load the movie metadata table from disk
df = pd.read_csv('data/movie_metadata_final.csv')

# Directory that holds the raw screenplay text files
screenplay_folder = 'data/screenplay_data/data/raw_texts/raw_texts'

# Names of the directory entries ending in .txt, in the order os.listdir returns them
files = list(filter(lambda entry: entry.endswith('.txt'), os.listdir(screenplay_folder)))

# One empty {imdbid: value} mapping per metric; the processing loop fills them in
metrics_dict = {
    metric_name: {}
    for metric_name in (
        'average_degree_centrality',
        'average_closeness_centrality',
        'average_betweenness_centrality',
        'average_interaction_diversity',
        'normalized_interaction_coefficient',
    )
}

In [None]:
# Preview the first rows of the metadata table loaded from the CSV
df.head()

In [None]:
# Screenplay file names are expected to end in "_<digits>.txt"; the digits,
# without leading zeros, are read as the movie's imdbid
imdbid_pattern = re.compile(r'_0*(\d+)\.txt$')

for filename in files:
    file_path = os.path.join(screenplay_folder, filename)

    match = imdbid_pattern.search(filename)
    if match is None:
        # No id suffix in the name, so this file is left out
        continue

    imdbid = int(match.group(1))
    metrics = calculate_screenplay_metrics(file_path)

    # File every metric of this screenplay under the movie's id
    for key, value in metrics.items():
        metrics_dict[key][imdbid] = value

# Attach one column per metric, looked up through each row's imdbid
for key, per_movie_values in metrics_dict.items():
    df[key] = df['imdbid'].map(per_movie_values)

In [None]:
# Header line, then the first rows of the table including the new metric columns
print("Updated DataFrame with metrics:", df.head(), sep="\n")

In [None]:
# Keep only the rows whose average degree centrality is not exactly 0
# (this removes, for example, screenplays that received the all-zero fallback)
df = df.loc[df['average_degree_centrality'].ne(0)]

In [None]:
# (rows, columns) of the table after the zero-centrality rows were dropped
df.shape

In [None]:
# Write the table with the metric columns to a new CSV file, without the row index
df.to_csv(path_or_buf='data/01_movie_metadata.csv', index=False)