In [None]:
import pandas as pd
import os
import re
import spacy
from gensim.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
# Load the spaCy pipeline 'en_core_web_sm' once; remove_named_entities() calls this
# shared `nlp` object to find tokens that belong to a named entity.
nlp = spacy.load('en_core_web_sm')

def read_screenplay(file_path):
    """Return the content of a screenplay file with its first line dropped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        str: Everything that follows the first line; an empty string when
        the file holds one line or fewer.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Consume the first line so that only the remainder is returned
        handle.readline()
        remainder = handle.read()

    return remainder

def separate_scenes(text):
    """Split a screenplay into a list of scene bodies.

    The text is cut at every run of 50 '=' characters. Chunks that are blank
    after stripping are discarded, and the first line of each remaining chunk
    is dropped.

    Args:
        text: Full screenplay text.

    Returns:
        list[str]: One stripped string per non-blank chunk (an entry is the
        empty string when its chunk consisted of a single line).
    """
    delimiter = '=' * 50
    scenes = []

    for chunk in text.split(delimiter):
        chunk = chunk.strip()
        if not chunk:
            continue
        # Keep what follows the first newline; empty for a one-line chunk
        _first_line, _newline, body = chunk.partition('\n')
        scenes.append(body.strip())

    return scenes

def merge_short_scenes(scenes, min_words=100):
    """Merge scenes shorter than min_words with the scenes that follow them.

    Scenes are accumulated in order until the accumulated text reaches
    min_words words; only then is it emitted as one merged scene. A short
    scene is therefore always joined with the next scene, even when that
    next scene is long on its own.

    Args:
        scenes: Scene texts in screenplay order.
        min_words: Minimum number of whitespace-separated words a merged
            scene should contain.

    Returns:
        list[str]: Merged scenes. Every entry has at least min_words words
        except possibly the last one, which collects whatever was left over
        at the end of the screenplay. Scenes without any words are skipped,
        so the result never contains empty strings.
    """
    merged_scenes = []
    pending_parts = []
    pending_word_count = 0

    for scene in scenes:
        scene_word_count = len(scene.split())
        if scene_word_count == 0:
            # Nothing to contribute; avoids emitting empty merged scenes
            continue

        pending_parts.append(scene.strip())
        pending_word_count += scene_word_count

        # Emit only once the accumulated text is long enough
        if pending_word_count >= min_words:
            merged_scenes.append(" ".join(pending_parts))
            pending_parts = []
            pending_word_count = 0

    # Trailing short scenes have no successor to merge with; keep them as-is
    if pending_parts:
        merged_scenes.append(" ".join(pending_parts))

    return merged_scenes

def remove_named_entities(text):
    """Return text without the tokens that spaCy tags as named entities.

    The text is run through the module-level `nlp` pipeline; tokens whose
    `ent_type_` is empty are kept and joined back with single spaces.

    Args:
        text: Text to filter.

    Returns:
        str: Space-joined tokens that are not part of any entity.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.ent_type_:
            # Token belongs to a recognised entity; drop it
            continue
        kept_tokens.append(token.text)
    return ' '.join(kept_tokens)

def preprocess_text(text):
    """Normalise a scene for topic modelling.

    Steps, in order:
      1. Remove named entities. This runs first, on the original casing and
         punctuation, because the entity recogniser relies on those cues;
         lowercasing or stripping punctuation beforehand makes it miss
         entities.
      2. Lowercase.
      3. Replace non-word characters with spaces.
      4. Collapse runs of whitespace. This runs last because step 3 itself
         introduces new runs of spaces.

    Args:
        text: Raw scene text.

    Returns:
        str: Lowercased text without named entities or non-word characters.
    """
    text = remove_named_entities(text)  # Remove named entities (needs original case)
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    return text

def remove_stopwords(texts):
    """Tokenise each document and drop gensim stopwords.

    Args:
        texts: Iterable of documents; each one is converted with str() and
            tokenised with gensim's simple_preprocess.

    Returns:
        list[list[str]]: One token list per input document, without any
        token found in gensim's STOPWORDS set.
    """
    filtered_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered_docs.append([token for token in tokens if token not in STOPWORDS])
    return filtered_docs

def train_lda_model(texts, num_topics=10, passes=10, random_state=42):
    """Train an LDA topic model on tokenised documents.

    Args:
        texts: List of token lists, one per document.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        random_state: Seed forwarded to LdaModel so that repeated runs on
            the same input produce the same model. Pass None to get
            unseeded (non-reproducible) training.

    Returns:
        tuple: (lda_model, dictionary, corpus) where dictionary is the gensim
        Dictionary built from texts and corpus is the bag-of-words form of
        every document.
    """
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,  # LDA training is stochastic; fix the seed
    )

    return lda_model, dictionary, corpus

def get_dominant_topic(lda_model, corpus):
    """Pick the most probable topic for every document in the corpus.

    Args:
        lda_model: Object exposing get_document_topics(bow), which returns
            (topic_id, probability) pairs.
        corpus: Iterable of bag-of-words documents.

    Returns:
        list[tuple]: For each document, the (topic_id, probability) pair with
        the highest probability; on ties the first such pair is kept.

    Raises:
        ValueError: If get_document_topics returns no pairs for a document.
    """
    return [
        max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])
        for bow in corpus
    ]

def compute_coherence_for_scene(scene_text, lda_model, dictionary):
    """Compute the c_v coherence of the model's topics against one scene.

    The scene is tokenised with simple_preprocess (accents removed) and used
    as the only reference text of a gensim CoherenceModel.

    Args:
        scene_text: Raw text of the scene.
        lda_model: Trained topic model passed to CoherenceModel.
        dictionary: gensim Dictionary belonging to lda_model.

    Returns:
        The value returned by CoherenceModel.get_coherence().
    """
    tokens = simple_preprocess(scene_text, deacc=True)
    # No bag-of-words corpus is built here: c_v works on the tokenised texts
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=[tokens],
        dictionary=dictionary,
        coherence='c_v',
    )
    return coherence_model.get_coherence()

def compute_overall_coherence(merged_scenes):
    """Average the per-scene topic coherence of a screenplay.

    An LDA model (10 topics, 10 passes) is trained on the preprocessed,
    stopword-free scenes; the coherence of every scene is then computed
    against that model and the mean is returned.

    Args:
        merged_scenes: List of scene texts.

    Returns:
        float: Mean of the per-scene coherence scores, or NaN when there are
        no scenes or none of them yields any token after preprocessing (an
        LDA model cannot be trained on an empty vocabulary).
    """
    # Nothing to score: avoid training on nothing and dividing by zero
    if not merged_scenes:
        return float('nan')

    preprocessed_scenes = [preprocess_text(scene) for scene in merged_scenes]
    preprocessed_scenes = remove_stopwords(preprocessed_scenes)

    # Every scene reduced to an empty token list: no vocabulary to train on
    if not any(preprocessed_scenes):
        return float('nan')

    lda_model, dictionary, _corpus = train_lda_model(preprocessed_scenes, num_topics=10, passes=10)

    scene_coherences = [
        compute_coherence_for_scene(scene_text, lda_model, dictionary)
        for scene_text in merged_scenes
    ]

    overall_coherence = sum(scene_coherences) / len(scene_coherences)
    return overall_coherence

In [None]:
# Load the movie metadata table; the processing loop matches rows on its 'imdbid' column
df = pd.read_csv('data/02_movie_metadata.csv')

# Folder containing the scene-separated screenplay text files
screenplay_folder = 'data/screenplay_data/data/scene_separated_texts'

# Captures the digits between the last '_' and '.txt' at the end of a filename,
# leaving leading zeros out of the captured group (e.g. '..._0012345.txt' -> '12345')
pattern = re.compile(r'_0*(\d+)\.txt$')

In [None]:
# Score every screenplay in the folder and store the result in the metadata table
for filename in os.listdir(screenplay_folder):
    # Only text files are screenplays
    if not filename.endswith('.txt'):
        continue

    # Skip files whose name does not end in an imdb id
    match = pattern.search(filename)
    if match is None:
        continue
    imdbid = match.group(1)

    # Read the file, split it into scenes and merge the short ones
    file_path = os.path.join(screenplay_folder, filename)
    text = read_screenplay(file_path)
    scenes = separate_scenes(text)
    merged_scenes = merge_short_scenes(scenes, min_words=100)
    overall_coherence = compute_overall_coherence(merged_scenes)

    # Write the coherence value into the row(s) of the matching movie
    row_mask = df['imdbid'] == int(imdbid)
    df.loc[row_mask, 'overall_coherence'] = overall_coherence

In [None]:
# Save the updated dataframe back to the CSV it was loaded from (the file is
# overwritten); index=False keeps the row index out of the file
df.to_csv('data/02_movie_metadata.csv', index=False)