# In [None]:
import pandas as pd
import os
import re
import spacy
import logging
from charset_normalizer import from_path
from gensim.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Set up logging: INFO level and above, each record formatted as
# "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the spaCy English pipeline once at import time; preprocess_text()
# reuses this module-level object rather than reloading the model per call
nlp = spacy.load('en_core_web_sm')

def read_screenplay(file_path):
    """Read a screenplay file and return its text without the first line.

    The encoding is detected with ``charset_normalizer`` and the file is then
    opened in text mode using the detected encoding.

    Args:
        file_path: Path of the screenplay text file.

    Returns:
        The file content with its first line dropped, or ``None`` when the
        encoding cannot be detected or the file cannot be read or decoded
        (the failure is logged, never raised).
    """
    try:
        best_match = from_path(file_path).best()
        if best_match is None:
            # best() yields None when no candidate encoding fits the bytes;
            # report that explicitly instead of tripping over None.encoding.
            logging.error(f"Error reading file {file_path}: could not detect text encoding")
            return None
        with open(file_path, 'r', encoding=best_match.encoding) as file:
            lines = file.readlines()
        logging.info(f"Successfully read file {file_path}")
        # The first line is skipped (presumably a header line - confirm
        # against the data files).
        return ''.join(lines[1:])
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {str(e)}")
        return None

def separate_scenes(text):
    """Split screenplay text into scene bodies.

    Scenes are delimited by a run of exactly fifty ``=`` characters. Each
    non-blank chunk is stripped, its first line is discarded, and the
    remaining lines (stripped again) become the scene text.

    Args:
        text: Full screenplay text.

    Returns:
        List of scene strings; an empty list if anything goes wrong (the
        error is logged).
    """
    divider = '=' * 50
    try:
        scenes = []
        for chunk in text.split(divider):
            chunk = chunk.strip()
            if not chunk:
                continue
            # Everything after the first newline is the scene body; a chunk
            # with a single line therefore yields an empty body.
            _first_line, _newline, body = chunk.partition('\n')
            scenes.append(body.strip())
        logging.info(f"Separated text into {len(scenes)} scenes")
        return scenes
    except Exception as e:
        logging.error(f"Error separating scenes: {str(e)}")
        return []

def merge_short_scenes(scenes, min_words=100):
    """Merge consecutive scenes until each merged scene has enough words.

    Scenes are accumulated in order and joined with a single space; as soon
    as the accumulated text reaches ``min_words`` whitespace-separated words
    it is emitted as one merged scene. A short remainder at the end is
    appended to the last emitted scene, so every returned scene has at least
    ``min_words`` words unless the whole input is shorter than that.

    Args:
        scenes: Iterable of scene strings, in screenplay order.
        min_words: Minimum number of words a merged scene should contain.

    Returns:
        List of merged, stripped scene strings. Scenes that already have at
        least ``min_words`` words are returned unchanged (apart from
        stripping); whitespace-only input produces no output scenes.
    """
    merged_scenes = []
    pending = []        # scenes collected for the merged scene being built
    pending_words = 0   # running word count of ``pending``

    for scene in scenes:
        pending.append(scene)
        pending_words += len(scene.split())
        # Flush only once the buffer itself is long enough; testing the
        # combined length *before* merging would cap merged scenes below
        # min_words and leave short scenes unmerged.
        if pending_words >= min_words:
            merged_scenes.append(" ".join(pending).strip())
            pending = []
            pending_words = 0

    if pending:
        leftover = " ".join(pending).strip()
        if leftover:
            if merged_scenes:
                # Too short to stand alone: attach to the previous scene.
                merged_scenes[-1] = merged_scenes[-1] + " " + leftover
            else:
                merged_scenes.append(leftover)

    logging.info(f"Merged scenes into {len(merged_scenes)} longer scenes")
    return merged_scenes

def identify_character_names(text):
    """Collect candidate character names from screenplay text.

    A candidate is a run of capital letters and whitespace (starting with a
    capital letter, at least two characters long) that sits between two
    newlines. Trailing whitespace is removed from every candidate.

    Args:
        text: Screenplay or scene text.

    Returns:
        List of candidate names in order of appearance (duplicates kept).
    """
    cue_pattern = re.compile(r'\n\s*([A-Z][A-Z\s]+)\s*\n')
    trailing_whitespace = re.compile(r'\s+$')

    names = []
    for candidate in cue_pattern.findall(text):
        names.append(trailing_whitespace.sub('', candidate))
    return names

def preprocess_text(text):
    """Normalise scene text into a space-separated string of alphabetic tokens.

    Steps, in order: remove every string reported by
    ``identify_character_names``, lowercase, collapse whitespace runs,
    replace non-word characters with spaces, tokenise with the module-level
    spaCy pipeline and keep only tokens flagged ``is_alpha``.

    Args:
        text: Raw scene text.

    Returns:
        The kept tokens joined by single spaces.
    """
    # Names are detected on the original (still upper-case) text and removed
    # as plain substrings before lowercasing.
    for speaker in identify_character_names(text):
        text = text.replace(speaker, '')

    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    cleaned = re.sub(r'\W', ' ', collapsed)

    alphabetic_tokens = []
    for token in nlp(cleaned):
        if token.is_alpha:
            alphabetic_tokens.append(token.text)
    return ' '.join(alphabetic_tokens)

def remove_stopwords(texts):
    """Tokenise each document and drop gensim stop words.

    Args:
        texts: Iterable of documents; each is converted with ``str()`` and
            tokenised by gensim's ``simple_preprocess``.

    Returns:
        One token list per input document, without any token found in
        gensim's ``STOPWORDS``.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([tok for tok in tokens if tok not in STOPWORDS])
    return filtered_docs

def train_lda_model(texts, num_topics=10, passes=10, random_state=None):
    """Train a gensim LDA model on tokenised documents.

    The vocabulary is pruned before training: tokens appearing in fewer than
    5 documents or in more than 50% of the documents are dropped.

    Args:
        texts: List of token lists, one per document.
        num_topics: Number of latent topics to fit.
        passes: Number of training passes over the corpus.
        random_state: Optional seed forwarded to ``LdaModel``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the fitted topics reproducible between runs.

    Returns:
        Tuple ``(lda_model, dictionary, corpus)`` where ``corpus`` is the
        bag-of-words form of ``texts`` under the pruned dictionary.

    Raises:
        ValueError: If no token survives the vocabulary pruning, so there is
            nothing to train on.
    """
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    if len(dictionary) == 0:
        # LdaModel would raise ValueError for an empty vocabulary anyway;
        # fail earlier with a message that points at the actual cause.
        raise ValueError(
            "No tokens left after filter_extremes(no_below=5, no_above=0.5); "
            "too few documents or too little shared vocabulary to train LDA"
        )
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    return lda_model, dictionary, corpus

def get_dominant_topic(lda_model, text, dictionary):
    """Return the id of the most probable topic for one tokenised document.

    Args:
        lda_model: Trained model exposing ``get_document_topics``.
        text: List of tokens for the document.
        dictionary: Dictionary exposing ``doc2bow`` for the model's vocabulary.

    Returns:
        The topic id with the highest probability for the document (the
        first one in case of ties).
    """
    bow = dictionary.doc2bow(text)
    # Ask for every topic: with the model's default probability cut-off the
    # returned list can be empty when all topics fall below it (e.g. many
    # topics), which would make max() raise ValueError.
    topic_distribution = lda_model.get_document_topics(bow, minimum_probability=0.0)
    dominant_topic, _probability = max(topic_distribution, key=lambda pair: pair[1])
    return dominant_topic

def assign_dominant_topics(texts, lda_model, dictionary):
    """Compute the dominant topic id for every tokenised document.

    Args:
        texts: Iterable of token lists.
        lda_model: Trained LDA model passed through to ``get_dominant_topic``.
        dictionary: Dictionary passed through to ``get_dominant_topic``.

    Returns:
        List of dominant topic ids, in the same order as ``texts``.
    """
    dominant_topics = []
    for tokens in texts:
        dominant_topics.append(get_dominant_topic(lda_model, tokens, dictionary))
    return dominant_topics

def compute_topic_coherence(dominant_topics, lda_model):
    """Average pairwise cosine similarity between scenes' dominant topics.

    Every scene is represented by the topic-term vector (a row of
    ``lda_model.get_topics()``) of its dominant topic. The result is the mean
    cosine similarity over all ordered pairs of distinct scenes.

    Args:
        dominant_topics: Sequence of topic ids, one per scene.
        lda_model: Trained model exposing ``get_topics()``, which returns a
            2-D array with one row per topic.

    Returns:
        The average similarity as a float, or NaN when there are fewer than
        two scenes (no pair exists, so the average is undefined).
    """
    num_scenes = len(dominant_topics)
    if num_scenes < 2:
        # Avoid the 0/0 division (one scene) and the empty-matrix case.
        return float('nan')

    topic_vectors = lda_model.get_topics()
    topic_vectors = topic_vectors / np.linalg.norm(topic_vectors, axis=1, keepdims=True)  # Normalize vectors

    # One unit-length row per scene, picked by that scene's dominant topic
    scene_vectors = topic_vectors[np.asarray(dominant_topics, dtype=int)]

    # Rows are already unit length, so the dot product is the cosine similarity
    similarities = scene_vectors @ scene_vectors.T

    # Exclude self-similarity by removing the actual diagonal
    off_diagonal_sum = np.sum(similarities) - np.trace(similarities)
    return off_diagonal_sum / (num_scenes * (num_scenes - 1))

def process_screenplay(filename, screenplay_folder):
    """Run the full topic-coherence pipeline for one screenplay file.

    The IMDb id is taken from the filename, which must end in
    ``_<digits>.txt`` (leading zeros are ignored). The file is then read,
    split into scenes, short scenes are merged, the text is preprocessed, an
    LDA model is trained on the scenes and the coherence score is computed
    from the scenes' dominant topics.

    Args:
        filename: Name of the screenplay file inside ``screenplay_folder``.
        screenplay_folder: Directory that contains the file.

    Returns:
        ``(imdbid, overall_coherence)`` on success, ``(None, None)`` if the
        id cannot be extracted, the file cannot be read, or any step raises
        (the problem is logged).
    """
    failure = (None, None)
    try:
        file_path = os.path.join(screenplay_folder, filename)

        id_match = re.search(r'_0*(\d+)\.txt$', filename)
        if id_match is None:
            logging.warning(f"Could not extract imdbid from filename: {filename}")
            return failure
        imdbid = id_match.group(1)

        text = read_screenplay(file_path)
        if text is None:
            return failure

        # Text preparation: scenes -> merged scenes -> token lists
        merged_scenes = merge_short_scenes(separate_scenes(text), min_words=100)
        cleaned_scenes = [preprocess_text(scene) for scene in merged_scenes]
        token_lists = remove_stopwords(cleaned_scenes)

        # Topic modelling and scoring
        lda_model, dictionary, _corpus = train_lda_model(token_lists, num_topics=10, passes=10)
        dominant_topics = assign_dominant_topics(token_lists, lda_model, dictionary)
        overall_coherence = compute_topic_coherence(dominant_topics, lda_model)

        logging.info(f"Processed {filename}: imdbid={imdbid}, coherence={overall_coherence}")
        return int(imdbid), overall_coherence
    except Exception as e:
        logging.error(f"Error processing {filename}: {str(e)}")
        return failure

# In [None]:
if __name__ == "__main__":
    try:
        # Movie metadata that the coherence scores are joined onto
        df = pd.read_csv('data/movie_metadata_final.csv')

        # Directory holding one scene-separated .txt file per screenplay
        screenplay_folder = 'data/screenplay_data/data/scene_separated_texts'

        # (imdbid, overall_coherence) pairs for successfully processed files
        results = []

        # Process the screenplay files one at a time
        for filename in os.listdir(screenplay_folder):
            if not filename.endswith('.txt'):
                continue
            imdbid, overall_coherence = process_screenplay(filename, screenplay_folder)
            if imdbid is None or overall_coherence is None:
                continue
            results.append((imdbid, overall_coherence))

        # Tabulate the scores and discard rows with missing values
        coherence_df = pd.DataFrame(results, columns=['imdbid', 'overall_coherence'])
        coherence_df = coherence_df.dropna()

        # Persist the standalone score table
        coherence_df.to_csv('data/movie_coherence_scores.csv', index=False)

        # Left-join so movies without a score are kept in the metadata
        df = df.merge(coherence_df, on='imdbid', how='left')

        # Persist the enriched metadata
        df.to_csv('data/movie_metadata_with_coherence.csv', index=False)

        logging.info("Finished processing all screenplays and saved results.")
    except Exception as e:
        logging.error(f"Error in main execution: {str(e)}")