## Imports and global variables

In [1]:
import os
import pandas as pd
import numpy as np
import torch

In [2]:
# Input directories of the training split (relative to the working directory).
TIMESTAMPS_PATH = 'train/train/scene_timestamps'  # per-movie scene timestamp CSVs
FEATURES_PATH = 'train/train/features'  # per-movie scene feature CSVs
LABELS_PATH = 'train/train/labels'  # per-movie scene label CSVs
SUBTITLES_PATH = 'train/train/subtitles'  # per-movie .srt subtitle files

In [3]:
# Select the compute device: CUDA when torch reports one as available, else CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load timestamps

In [4]:
# Australia is skipped because its subtitles are in a different format than
# every other movie's.
EXCLUDED_TIMESTAMP_FILES = {"tt0455824_australia_timestamps.csv"}

if not os.path.exists(TIMESTAMPS_PATH):
    # Stop here with a clear message; otherwise `files` would stay undefined and
    # a later cell would fail with an unrelated NameError.
    raise FileNotFoundError(f"PATH DOES NOT EXIST! {TIMESTAMPS_PATH}")

# os.listdir gives no ordering guarantee; sorting keeps every list derived from
# `files` (movies, feature files, subtitle paths) reproducible across machines.
files = sorted(
    name
    for name in os.listdir(TIMESTAMPS_PATH)
    if name not in EXCLUDED_TIMESTAMP_FILES
)

In [5]:
def get_movie_name(file):
    """Return the movie title embedded in a data file name.

    File names are expected to look like ``<id>_<movie name>_timestamps.csv``.
    The id is split off at the first underscore rather than at a fixed width,
    so ids of any length are handled.

    Args:
        file: file name (not a full path).

    Returns:
        The movie name as a string.
    """
    _, separator, remainder = file.partition("_")
    if not separator:
        # No id separator at all: keep the historical fixed-width slicing.
        return file[10:-15]

    suffix = "_timestamps.csv"
    if remainder.endswith(suffix):
        return remainder[:-len(suffix)]

    # Not a timestamps file (e.g. "<id>_<name>.csv"): drop only the extension.
    return remainder.rsplit(".", 1)[0]

def get_movie_id(file):
    """Return the id prefix of a data file name (the text before the first ``_``).

    Falls back to the first nine characters when the name has no underscore,
    which is what the fixed-width version of this helper returned.

    Args:
        file: file name such as ``tt0116695_jerry maguire_timestamps.csv``.

    Returns:
        The id prefix as a string.
    """
    movie_id, separator, _ = file.partition("_")
    return movie_id if separator else file[:9]

In [6]:
# Movie titles, index-aligned with `files`.
movies = [get_movie_name(timestamp_file) for timestamp_file in files]

In [7]:
def prepare_csv(path, movie_name):
    """Read one per-movie CSV and tag every row with its movie.

    The unnamed leading column of the CSV is exposed as ``scene_id`` and a
    constant ``movie`` column is added.

    Args:
        path: location of the CSV file (anything ``pd.read_csv`` accepts).
        movie_name: value stored in the ``movie`` column.

    Returns:
        The loaded DataFrame.
    """
    return (
        pd.read_csv(path)
        .rename(columns={"Unnamed: 0": "scene_id"})
        .assign(movie=movie_name)
    )

In [8]:
# Read every timestamps CSV and stack the per-movie frames into one.
dfs = [
    prepare_csv(os.path.join(TIMESTAMPS_PATH, timestamp_file), movie)
    for timestamp_file, movie in zip(files, movies)
]

timestamps = pd.concat(dfs, ignore_index=True)
print(timestamps.shape)
timestamps.tail()

(3729, 4)


Unnamed: 0,scene_id,start,end,movie
3724,220,6499.655,6571.893,dallas buyers club
3725,221,6574.396,6574.396,dallas buyers club
3726,224,6589.161,6609.264,dallas buyers club
3727,226,6629.701,6677.999,dallas buyers club
3728,227,6682.504,6686.675,dallas buyers club


## Load features

In [9]:
def get_feature_file(file):
    """Turn a timestamps file name into the matching feature/label file name.

    Every occurrence of ``_timestamps`` is removed from ``file``.
    """
    return "".join(file.split("_timestamps"))

In [10]:
# Feature/label file names, index-aligned with `files`.
feature_files = list(map(get_feature_file, files))

In [11]:
# Read every feature CSV and stack the per-movie frames into one.
dfs = [
    prepare_csv(os.path.join(FEATURES_PATH, feature_file), movie)
    for feature_file, movie in zip(feature_files, movies)
]

features = pd.concat(dfs, ignore_index=True)
print(features.shape)
features.tail()

(3729, 9)


Unnamed: 0,scene_id,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,movie
3724,220,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,dallas buyers club
3725,221,5.380333,1,5.380333,0.965066,0.936929,372.1087,0,dallas buyers club
3726,224,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,dallas buyers club
3727,226,52.761333,19,2.776912,0.9869,0.944811,1076.267498,1,dallas buyers club
3728,227,5.672333,4,1.418083,0.991266,0.952336,2085.459362,1,dallas buyers club


## Load labels

In [12]:
# Label files share their names with the feature files.
label_columns = {"Unnamed: 0": "scene_id", "0": "label"}

dfs = []
for labels_file, movie in zip(feature_files, movies):
    labels_path = os.path.join(LABELS_PATH, labels_file)
    # keep_default_na=False stops pandas from turning label strings that look
    # like missing-value markers into NaN.
    movie_labels = (
        pd.read_csv(labels_path, keep_default_na=False)
        .rename(columns=label_columns)
        .assign(movie=movie)
    )
    dfs.append(movie_labels)

labels = pd.concat(dfs, ignore_index=True)
print(labels.shape)
labels.tail()

(3729, 3)


Unnamed: 0,scene_id,label,movie
3724,220,Finale,dallas buyers club
3725,221,Finale,dallas buyers club
3726,224,Final Image,dallas buyers club
3727,226,Final Image,dallas buyers club
3728,227,Final Image,dallas buyers club


## Merge dataframes

In [13]:
# A scene is identified by its id together with the movie it belongs to.
merge_keys = ["scene_id", "movie"]
timestamps_features = timestamps.merge(features, on=merge_keys, how="outer")
full_data = timestamps_features.merge(labels, on=merge_keys, how="outer")

print(full_data.shape)
full_data.tail()

(3729, 12)


Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
3724,220,6499.655,6571.893,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,Finale
3725,221,6574.396,6574.396,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.1087,0,Finale
3726,224,6589.161,6609.264,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,Final Image
3727,226,6629.701,6677.999,dallas buyers club,52.761333,19,2.776912,0.9869,0.944811,1076.267498,1,Final Image
3728,227,6682.504,6686.675,dallas buyers club,5.672333,4,1.418083,0.991266,0.952336,2085.459362,1,Final Image


## Correct end column

In [14]:
# Recompute each scene's end as start + duration, overwriting the `end` values
# loaded from the timestamp files.
full_data["end"] = full_data["start"].add(full_data["s_dur"])
full_data.tail()

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
3724,220,6499.655,6574.354333,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,Finale
3725,221,6574.396,6579.776333,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.1087,0,Finale
3726,224,6589.161,6623.987333,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,Final Image
3727,226,6629.701,6682.462333,dallas buyers club,52.761333,19,2.776912,0.9869,0.944811,1076.267498,1,Final Image
3728,227,6682.504,6688.176333,dallas buyers club,5.672333,4,1.418083,0.991266,0.952336,2085.459362,1,Final Image


## Load subtitles

In [15]:
import srt

In [16]:
# Subtitle files are named like the feature files, with an .srt extension.
subtitles_files = list(map(lambda name: name.replace('.csv', '.srt'), feature_files))
print(subtitles_files[10])

tt0116695_jerry maguire.srt


In [17]:
# Full paths to the subtitle files, index-aligned with `subtitles_files`.
subtitle_paths = list(map(lambda name: os.path.join(SUBTITLES_PATH, name), subtitles_files))
print(subtitle_paths[10])

train/train/subtitles\tt0116695_jerry maguire.srt


In [18]:
# Parse every movie's .srt file into a list of subtitle objects.
movie_subtitles = {}

for movie_name, subtitle_path in zip(movies, subtitle_paths):
    movie_subtitles[movie_name] = []

    # "pretty woman" is the one file read as UTF-16; all others are UTF-8.
    encoding = 'utf-16' if movie_name == "pretty woman" else 'utf-8'
    with open(subtitle_path, 'r', encoding=encoding) as subtitle_file:
        raw_srt = subtitle_file.read()

    movie_subtitles[movie_name] = list(srt.parse(raw_srt))


In [19]:
def print_sub_info(sub, idx=None):
    """Print the number, start/end time (in seconds) and text of one subtitle.

    Args:
        sub: subtitle object exposing ``start`` and ``end`` timedeltas and a
            ``content`` string (as produced by ``srt.parse``).
        idx: optional zero-based position of the subtitle; shown as ``idx + 1``.
            When omitted, the subtitle's own ``index`` attribute is shown, so
            the function no longer depends on a loop variable that happens to
            be named ``idx`` in the notebook namespace.
    """
    number = idx + 1 if idx is not None else getattr(sub, "index", None)
    print(f"Subtitle {number}")
    print(f"Start Time: {sub.start.total_seconds()}")
    print(f"End Time: {sub.end.total_seconds()}")
    print(f"Text: {sub.content}")
    print()

In [20]:
movie_name = movies[0]
print(f"MOVIE: {movie_name}")
# Sanity check: show only the first two subtitles of the first movie.
for idx, sub in enumerate(movie_subtitles[movie_name][:2]):
    print_sub_info(sub)

MOVIE: the lost weekend
Subtitle 1
Start Time: 150.32
End Time: 152.926
Text: You'd better take this along, Don.
It's gonna be cold on the farm.

Subtitle 2
Start Time: 153.04
End Time: 154.963
Text: - OK.
- How many shirts are you taking?



In [21]:
# Assign subtitles to scenes: a subtitle belongs to a scene when it lies inside
# the scene's [start, end] window. Rows of full_data are visited in order and a
# cursor into the current movie's subtitle list avoids rescanning from the top.
last_processed_subtitle_idx = 0
movies_scenes_subtitles = {}
missed_subtitles = 0

for scene_idx, scene_row in full_data.iterrows():
    scene_start = scene_row['start']
    scene_end = scene_row['end']
    scene_id = scene_row['scene_id']
    movie_name = scene_row['movie']

    # First scene seen for this movie: create its dictionary and rewind the
    # subtitle cursor.
    if movie_name not in movies_scenes_subtitles:
        movies_scenes_subtitles[movie_name] = {}
        last_processed_subtitle_idx = 0
    
    # Create an (initially empty) subtitle list for every scene within the movie
    movies_scenes_subtitles[movie_name][scene_id] = []

    current_movie_subtitles = movie_subtitles[movie_name]
    for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
        sub = current_movie_subtitles[idx]
        sub_start = sub.start.total_seconds()
        sub_end = sub.end.total_seconds()
        sub_content = sub.content

        # Some subtitles start just before scene_start, hence the 0.05 s tolerance
        if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
            # Add subtitle content to the dictionary for the current scene
            movies_scenes_subtitles[movie_name][scene_id].append(sub_content)
            
        elif scene_end < sub_end:
            # Subtitle ends after this scene: remember where to resume and move
            # on to the next scene.
            last_processed_subtitle_idx = idx
            break
        else:
            # Subtitle ends within the scene but starts too early to belong to it.
            # NOTE(review): the cursor only advances on `break`; when this loop
            # runs off the end of the list, the next scene rescans the same
            # subtitles and may count them here again -- confirm whether the
            # reported total is inflated.
            missed_subtitles += 1

# I believe that subtitles are ignored for 2 reasons
# 1. some subtitles start just before scene starts
# 2. some scenes are missing from dataset
print(f"Ignored subtitltes: {missed_subtitles}")

Ignored subtitltes: 3018


In [22]:
# Spot check: the subtitles collected for scene 1 of "pretty woman".
movies_scenes_subtitles['pretty woman'][1]

['- And you have all four. Take a look.\n- Oh!',
 "But I wouldn't trust you with\nreal gold. That's why this one's\nonly worth about a penny.",
 '- Hmm.\n- And if you wonder where\nthe other one went, watch.',
 'Penny from the ear.\nHow much for the rest?',
 "- Have you seen Edward?\n- No, I haven't. Great party, Philip.",
 'Well, my wife went to a lot of trouble.\nShe called a caterer.',
 '- Excuse me, Anne. Howard, how are ya?\n- Philip, good.',
 "Hey, I understand Edward's\ntaking over Morse Industries.",
 "- Yeah, well, he's not\nhere to get a suntan.\n- Can I get in on it?",
 '- Yeah, call me.\n- When?\n- Just call me.',
 "Uh, hi. I'm Philip Stuckey,\nEdward Lewis' lawyer.",
 "- Hey, where's\nthe guest of honour anyway?\n- Well, if I know him,"]

## Subtitles cleaning and combining

In [23]:
# Collapse each scene's list of subtitle strings into a single line of text.
for movie_name, scenes in movies_scenes_subtitles.items():
    for scene_id, subtitles in scenes.items():
        if isinstance(subtitles, str):
            # Already combined by an earlier run of this cell. Iterating the
            # string would split it into single characters, so leave it alone.
            continue
        # Replace new line characters with spaces, trim, then join the pieces.
        scenes[scene_id] = " ".join(
            subtitle.replace('\n', ' ').strip() for subtitle in subtitles
        )

movies_scenes_subtitles['pretty woman'][1]

"- And you have all four. Take a look. - Oh! But I wouldn't trust you with real gold. That's why this one's only worth about a penny. - Hmm. - And if you wonder where the other one went, watch. Penny from the ear. How much for the rest? - Have you seen Edward? - No, I haven't. Great party, Philip. Well, my wife went to a lot of trouble. She called a caterer. - Excuse me, Anne. Howard, how are ya? - Philip, good. Hey, I understand Edward's taking over Morse Industries. - Yeah, well, he's not here to get a suntan. - Can I get in on it? - Yeah, call me. - When? - Just call me. Uh, hi. I'm Philip Stuckey, Edward Lewis' lawyer. - Hey, where's the guest of honour anyway? - Well, if I know him,"

## Tokenize and Embeddings

In [None]:
# from transformers import BertTokenizer, BertModel

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased').to(device)

# embeddings = []
# for movie_name, scenes in movies_scenes_subtitles.items():
#     for scene_id, combined_text in scenes.items():
#         inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#         inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to GPU
#         outputs = model(**inputs)
#         embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().tolist() # move outputs to CPU
#         movies_scenes_subtitles[movie_name][scene_id] = embedding

In [None]:
# movies_scenes_subtitles['pretty woman'][1]

## Merge with Full Data

In [None]:
# embeddings = []
# for index, row in full_data.iterrows():
#     movie_name = row['movie']
#     scene_id = row['scene_id']
    
#     embedding = movies_scenes_subtitles.get(movie_name, {}).get(scene_id, None)
    
#     embeddings.append(embedding)

# full_data.insert(full_data.columns.get_loc('label'), 'embedding', embeddings)
# full_data.tail()

## First Model

### Tuning hyperparameters

In [None]:
# from sklearn.model_selection import cross_val_score, RandomizedSearchCV
# from sklearn.ensemble import RandomForestClassifier

# full_data_clean = full_data.dropna()

# embeddings = np.array(full_data_clean['embedding'].tolist())

# X = full_data_clean[['s_dur', 'n_shots', 'ava_shot_dur', 'rel_id_loc', 'rel_t_loc', 'ava_char_score', 'is_prot_appear']]
# X = np.hstack((X, embeddings))
# y = full_data_clean['label']

# param_dist = {
#     'n_estimators': [210, 240, 200, 225, 250],
#     'min_samples_split': [15, 25, 35],
#     'min_samples_leaf': [10, 15, 20],
#     'max_depth': [10, 15, 20, 25, 30, 50]
# }

# random_search = RandomizedSearchCV(RandomForestClassifier(random_state=23), param_distributions=param_dist, n_iter=10, cv=5, random_state=23, n_jobs=-1)
# random_search.fit(X, y)

# print("Best parameters:", random_search.best_params_)

### Cross-Validation

In [None]:
# model = RandomForestClassifier(random_state=23, n_estimators=200, min_samples_split=15, min_samples_leaf=20, max_depth=15)
# model.fit(X, y)
# y_pred = model.predict(X)
# scores = cross_val_score(model, X, y, cv=5)
# print(f"Mean Accuracy from cross-validation: {np.mean(scores)}")

### Scores

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from sklearn.model_selection import cross_val_predict

# _, levels = pd.factorize(full_data_clean['label'])
# print(classification_report(y,y_pred,target_names=levels))

# accuracy = accuracy_score(y, y_pred)
# print()
# print(f"Accuracy: {accuracy}")

In [None]:
# Assuming 'model', 'X', and 'y' are already defined
# predicted = cross_val_predict(model, X, y, cv=5)
# conf_matrix = confusion_matrix(y, y_pred)

# print("Confusion Matrix:")
# print(conf_matrix)

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(figsize=(12,10))
# sns.heatmap(conf_matrix, fmt='g', ax=ax, annot=True)

## Second Model

In [63]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Prepare data

In [58]:
# Flatten {movie: {scene_id: text}} into one row per scene.
scene_records = [
    {"movie": movie, "scene": scene_number, "text": text}
    for movie, scenes in movies_scenes_subtitles.items()
    for scene_number, text in scenes.items()
]
scenes_df = pd.DataFrame(scene_records, columns=["movie", "scene", "text"])

# Attach labels by (movie, scene) key instead of by row position, so the pairing
# does not depend on the dictionary order matching the row order of full_data.
# validate= makes pandas raise if a key is duplicated on either side.
scene_labels = full_data[["movie", "scene_id", "label"]].rename(columns={"scene_id": "scene"})
combined_df = scenes_df.merge(
    scene_labels, on=["movie", "scene"], how="left", validate="one_to_one"
)

label_encoder = LabelEncoder()
combined_df['encoded_label'] = label_encoder.fit_transform(combined_df['label'])

combined_df.head()


Unnamed: 0,movie,scene,text,label,encoded_label
0,the lost weekend,0,"You'd better take this along, Don. It's gonna ...",Opening Image,13
1,the lost weekend,1,Are you sure it's in the closet? I can't find ...,Opening Image,13
2,the lost weekend,2,"- Did you find it? - Oh, sure, sure. Here it i...",Set-Up,14
3,the lost weekend,3,The new Thurber book with comical jokes and pi...,Set-Up,14
4,the lost weekend,4,Nobody? What are they playing? Brahms' 2nd Sym...,Theme Stated,15


### Train test split

In [59]:
# Two-stage stratified split: hold out 30 % of the scenes, then cut the hold-out
# in half to obtain the validation and test sets.
split_seed = 23

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    combined_df['text'],
    combined_df['encoded_label'],
    test_size=0.3,
    stratify=combined_df['encoded_label'],
    random_state=split_seed,
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=split_seed,
)

### Tokenizer

In [61]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, labels):
    """Tokenize texts and return them together with their labels as a tensor.

    Args:
        texts: object with a ``to_list()`` method yielding strings (a pandas
            Series here).
        labels: integer class ids as a pandas Series, NumPy array, sequence,
            or an already-built ``torch.Tensor``.

    Returns:
        ``(encodings, labels)``: the tokenizer output (padded and truncated,
        PyTorch tensors) and the labels as a ``torch.Tensor``.
    """
    encodings = tokenizer(texts.to_list(), truncation=True, padding=True, return_tensors="pt")
    if isinstance(labels, torch.Tensor):
        # Labels were already converted by an earlier run of this cell.
        return encodings, labels
    if isinstance(labels, pd.Series):
        labels = labels.to_numpy()
    return encodings, torch.tensor(labels)

# The label containers are passed as they are (no `.values`): after the first
# run the names below hold tensors, and this keeps the cell re-runnable.
train_encodings, train_labels = encode_texts(train_texts, train_labels)
val_encodings, val_labels = encode_texts(val_texts, val_labels)
test_encodings, test_labels = encode_texts(test_texts, test_labels)


### DataLoader

In [69]:
def _as_tensor_dataset(encodings, labels):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)

train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

batch_size = 8
# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

### Training and Validation

In [75]:
from tqdm import tqdm

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
# Move the model to the selected device so the GPU is used when available.
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

def calculate_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose arg-max equals `labels`.

    Args:
        preds: 2-D NumPy array of per-class scores, one row per example.
        labels: NumPy array of integer class ids.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop with a validation pass after every epoch
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for b_input_ids, b_input_mask, b_labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
        # Batches come from CPU tensors; move them next to the model.
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.long().to(device)

        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs} - Loss: {avg_train_loss}')

    # Validation
    model.eval()
    correct_predictions, seen_examples = 0.0, 0

    for b_input_ids, b_input_mask, b_labels in val_loader:
        with torch.no_grad():
            outputs = model(b_input_ids.to(device), token_type_ids=None, attention_mask=b_input_mask.to(device))

        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        # Weight each batch by its size so a short final batch does not skew
        # the reported accuracy.
        correct_predictions += calculate_accuracy(logits, label_ids) * len(label_ids)
        seen_examples += len(label_ids)

    avg_val_accuracy = correct_predictions / seen_examples if seen_examples else 0.0
    print(f'Epoch {epoch+1}/{epochs} - Validation Accuracy: {avg_val_accuracy}')

print("Training complete")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prepare testing data

In [None]:
# Input directories of the test split (relative to the working directory).
TEST_TIMESTAMPS_PATH = 'test/test/scene_timestamps'  # per-movie scene timestamp CSVs
TEST_FEATURES_PATH = 'test/test/features'  # per-movie scene feature CSVs
TEST_SUBTITLES_PATH = 'test/test/subtitles'  # per-movie .srt subtitle files

In [None]:
def load_test_files(path):
    """List the entries of a directory in sorted order.

    Args:
        path: directory to list.

    Returns:
        Sorted list of entry names, or an empty list (after printing a
        warning) when ``path`` does not exist.
    """
    if not os.path.exists(path):
        print("PATH DOES NOT EXIST!")
        return []
    # os.listdir gives no ordering guarantee, and the notebook pairs names from
    # different directory listings by position, so the order must be stable.
    return sorted(os.listdir(path))

def extract_movie_info(files):
    """Split file names into parallel lists of movie ids and movie names.

    Args:
        files: iterable of file names understood by ``get_movie_id`` and
            ``get_movie_name``.

    Returns:
        ``(movie_ids, movies)``: two lists in the same order as ``files``.
    """
    info = [(get_movie_id(file), get_movie_name(file)) for file in files]
    movie_ids = [movie_id for movie_id, _ in info]
    movies = [movie_name for _, movie_name in info]
    return movie_ids, movies

def prepare_dataframes(files, path_func, data_func, movie_names=None):
    """Load one DataFrame per file and concatenate them.

    Args:
        files: file names to process.
        path_func: maps a file name to the path that should be read.
        data_func: ``data_func(path, movie_name)`` returning a DataFrame.
        movie_names: optional movie names, one per entry of ``files``. When
            omitted, each name is derived from its file with
            ``get_movie_name`` rather than looked up in the notebook-level
            ``movies`` list, which holds the training movies.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``movie_names`` is given with a different length than
            ``files``.
    """
    files = list(files)
    if movie_names is None:
        movie_names = [get_movie_name(file) for file in files]
    elif len(movie_names) != len(files):
        raise ValueError("movie_names must have one entry per file")

    dfs = [
        data_func(path_func(file), movie_name)
        for file, movie_name in zip(files, movie_names)
    ]
    return pd.concat(dfs, ignore_index=True)

def merge_dataframes(df1, df2):
    """Outer-join two scene frames and recompute the ``end`` column.

    The frames are joined on ``scene_id`` and ``movie``; ``end`` is then set
    to ``start + s_dur``.

    Args:
        df1: first frame (left side of the join).
        df2: second frame (right side of the join).

    Returns:
        The merged DataFrame.
    """
    merged = df1.merge(df2, on=["scene_id", "movie"], how="outer")
    return merged.assign(end=lambda frame: frame["start"] + frame["s_dur"])

In [None]:
def _test_timestamp_path(file):
    """Path of a timestamps file inside the test timestamps directory."""
    return os.path.join(TEST_TIMESTAMPS_PATH, file)

def _test_feature_path(file):
    """Path of the feature file that corresponds to a timestamps file."""
    return os.path.join(TEST_FEATURES_PATH, get_feature_file(file))

test_files = load_test_files(TEST_TIMESTAMPS_PATH)
test_features_files = load_test_files(TEST_FEATURES_PATH)
test_movies_ids, test_movies = extract_movie_info(test_files)

# Both frames are driven by the timestamps file list; only the path differs.
test_timestamps = prepare_dataframes(test_files, _test_timestamp_path, prepare_csv)
test_features = prepare_dataframes(test_files, _test_feature_path, prepare_csv)

test_full_data = merge_dataframes(test_timestamps, test_features)

print(test_full_data.shape)
test_full_data.tail()

In [None]:
def convert_files_to_subtitles(test_files):
    """Return the .srt counterpart of every CSV file name in ``test_files``."""
    subtitle_files = []
    for file in test_files:
        subtitle_files.append(file.replace('.csv', '.srt'))
    return subtitle_files

def load_subtitles(test_subtitle_paths, test_movies, encoding=None):
    """Parse one .srt file per movie.

    Args:
        test_subtitle_paths: subtitle file paths, index-aligned with
            ``test_movies``.
        test_movies: movie names used as keys of the result.
        encoding: text encoding applied to every file. When ``None`` (the
            default) the encoding is chosen per file: UTF-16 if the file
            starts with a UTF-16 byte-order mark or contains NUL bytes near
            its start, otherwise UTF-8 (a UTF-8 BOM is tolerated).

    Returns:
        Dict mapping each movie name to the list produced by ``srt.parse``.
    """
    import codecs  # local import: only the BOM constants are needed

    def _sniff_encoding(path):
        # Look at the first bytes only; a BOM or NUL bytes mark UTF-16 text.
        with open(path, 'rb') as raw_file:
            head = raw_file.read(64)
        if head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)) or b'\x00' in head:
            return 'utf-16'
        return 'utf-8-sig'

    test_movie_subtitles = {}
    for idx, movie_name in enumerate(test_movies):
        subtitle_path = test_subtitle_paths[idx]
        file_encoding = encoding or _sniff_encoding(subtitle_path)
        with open(subtitle_path, 'r', encoding=file_encoding) as subtitle_file:
            test_movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
    return test_movie_subtitles

def associate_scenes_with_subtitles(test_full_data, test_movie_subtitles):
    """Collect, for every scene, the subtitles that fall inside its time window.

    Args:
        test_full_data: DataFrame with ``start``, ``end``, ``scene_id`` and
            ``movie`` columns; rows are visited in order.
        test_movie_subtitles: dict mapping a movie name to its list of
            subtitle objects (each with ``start``, ``end`` and ``content``).

    Returns:
        Nested dict ``{movie_name: {scene_id: [subtitle text, ...]}}``. The
        inner values are lists of raw subtitle strings, not joined text.
    """
    test_movies_scenes_subtitles = {}
    # Cursor into the current movie's subtitle list, so each scene resumes where
    # the previous one stopped.
    last_processed_subtitle_idx = 0

    for scene_idx, scene_row in test_full_data.iterrows():
        scene_start, scene_end = scene_row['start'], scene_row['end']
        scene_id, movie_name = scene_row['scene_id'], scene_row['movie']
        
        # First scene seen for this movie: create its dict and rewind the cursor.
        if movie_name not in test_movies_scenes_subtitles:
            test_movies_scenes_subtitles[movie_name] = {}
            last_processed_subtitle_idx = 0

        test_movies_scenes_subtitles[movie_name][scene_id] = []

        current_movie_subtitles = test_movie_subtitles[movie_name]
        for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
            sub = current_movie_subtitles[idx]
            sub_start, sub_end = sub.start.total_seconds(), sub.end.total_seconds()

            # 0.05 s tolerance: a subtitle may start slightly before the scene.
            if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
                test_movies_scenes_subtitles[movie_name][scene_id].append(sub.content)
            elif scene_end < sub_end:
                # Subtitle ends after this scene: resume from it for the next scene.
                last_processed_subtitle_idx = idx
                break
            # Any other subtitle (ends within the scene but starts too early) is
            # skipped without being assigned to a scene.

    return test_movies_scenes_subtitles

def calculate_embeddings(test_movies_scenes_subtitles):
    """Compute one mean-pooled BERT embedding per scene.

    Args:
        test_movies_scenes_subtitles: nested dict
            ``{movie_name: {scene_id: subtitles}}`` where ``subtitles`` is
            either a single string or a list of subtitle strings. Lists are
            joined into one text (newlines replaced by spaces) so that every
            scene yields exactly one embedding vector.

    Returns:
        Nested dict ``{movie_name: {scene_id: embedding}}`` where each
        embedding is a list of floats (the mean of the last hidden state
        over the token dimension).
    """
    # Imported locally: the notebook's active import cell does not bring in
    # BertModel.
    from transformers import BertModel

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)
    model.eval()

    embeddings = {}
    for movie_name, scenes in test_movies_scenes_subtitles.items():
        for scene_id, subtitles in scenes.items():
            if isinstance(subtitles, str):
                combined_text = subtitles
            else:
                combined_text = " ".join(
                    subtitle.replace('\n', ' ').strip() for subtitle in subtitles
                )

            inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            # Inference only: skip autograd bookkeeping to keep memory flat.
            with torch.no_grad():
                outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().tolist()
            embeddings.setdefault(movie_name, {})[scene_id] = embedding

    return embeddings

In [None]:
# Derive the subtitle names from test_files, the same list test_movies was built
# from, so paths and movie names are guaranteed to line up by position.
test_subtitles_files = convert_files_to_subtitles([get_feature_file(file) for file in test_files])
test_subtitle_paths = [os.path.join(TEST_SUBTITLES_PATH, subtitle_file) for subtitle_file in test_subtitles_files]

test_movie_subtitles = load_subtitles(test_subtitle_paths, test_movies)
test_movies_scenes_subtitles = associate_scenes_with_subtitles(test_full_data, test_movie_subtitles)
movies_scenes_embeddings = calculate_embeddings(test_movies_scenes_subtitles)

# One embedding per row of test_full_data (None when a scene has no entry).
embeddings = [
    movies_scenes_embeddings.get(movie_name, {}).get(scene_id, None)
    for movie_name, scene_id in zip(test_full_data['movie'], test_full_data['scene_id'])
]

# Drop a column left over from an earlier run so the cell can be re-executed.
if 'embedding' in test_full_data.columns:
    test_full_data = test_full_data.drop(columns='embedding')

# The test frame is built from timestamps and features only, so it normally has
# no 'label' column: insert before 'label' when present, otherwise append.
if 'label' in test_full_data.columns:
    insert_at = test_full_data.columns.get_loc('label')
else:
    insert_at = len(test_full_data.columns)
test_full_data.insert(insert_at, 'embedding', embeddings)
test_full_data.tail()