## Imports and global variables

In [150]:
import os
import pandas as pd
import numpy as np

In [151]:
TIMESTAMPS_PATH = 'train/train/scene_timestamps'
FEATURES_PATH = 'train/train/features'
LABELS_PATH = 'train/train/labels'
SUBTITLES_PATH = 'train/train/subtitles'

## Load timestamps

In [152]:
if os.path.exists(TIMESTAMPS_PATH):
    files = os.listdir(TIMESTAMPS_PATH)

    # print(f"List of files in {TIMESTAMPS_PATH}:")
    # for file in files:
        # print(file)

In [153]:
sample_file = files[0]
print(f"file name: {sample_file}")

movie_name = sample_file[10:-15]
print(f"movie name: {movie_name}")

file name: tt0037884_the lost weekend_timestamps.csv
movie name: the lost weekend


In [154]:
sample_timestamp_path = os.path.join(TIMESTAMPS_PATH, sample_file)

timestamps = pd.read_csv(sample_timestamp_path)
timestamps.rename(columns={"Unnamed: 0": "scene_id"}, inplace=True)
timestamps["movie"] = movie_name
timestamps.head()

Unnamed: 0,scene_id,start,end,movie
0,0,0.0,213.171,the lost weekend
1,1,218.76,228.854,the lost weekend
2,2,240.782,240.782,the lost weekend
3,3,282.616,282.616,the lost weekend
4,4,313.563,355.855,the lost weekend


## Load features

In [155]:
sample_feature_file = sample_file.replace("_timestamps", "")
sample_feature_file

'tt0037884_the lost weekend.csv'

In [156]:
sample_feature_path = os.path.join(FEATURES_PATH, sample_feature_file)

features = pd.read_csv(sample_feature_path)
features.rename(columns={"Unnamed: 0": "scene_id"}, inplace=True)
features["movie"] = movie_name
features.head()

Unnamed: 0,scene_id,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,movie
0,0,218.718333,6,36.453056,0.0,0.0,1295.579078,1,the lost weekend
1,1,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,the lost weekend
2,2,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,the lost weekend
3,3,30.905333,1,30.905333,0.02521,0.046752,1256.344447,1,the lost weekend
4,4,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,the lost weekend


## Load labels

In [157]:
sample_labels_path = os.path.join(LABELS_PATH, sample_feature_file)

labels = pd.read_csv(sample_labels_path)
labels.rename(
    columns={
        "Unnamed: 0": "scene_id",
        "0": "label"
    }, inplace=True
)

labels["movie"] = movie_name
labels.head()

Unnamed: 0,scene_id,label,movie
0,0,Opening Image,the lost weekend
1,1,Opening Image,the lost weekend
2,2,Set-Up,the lost weekend
3,3,Set-Up,the lost weekend
4,4,Theme Stated,the lost weekend


## Merge dataframes

In [158]:
timestamps_features = pd.merge(timestamps, features, on=["scene_id", "movie"])
full_data = pd.merge(timestamps_features, labels, on=["scene_id", "movie"])
full_data.head()

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
0,0,0.0,213.171,the lost weekend,218.718333,6,36.453056,0.0,0.0,1295.579078,1,Opening Image
1,1,218.76,228.854,the lost weekend,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,Opening Image
2,2,240.782,240.782,the lost weekend,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,Set-Up
3,3,282.616,282.616,the lost weekend,30.905333,1,30.905333,0.02521,0.046752,1256.344447,1,Set-Up
4,4,313.563,355.855,the lost weekend,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,Theme Stated


## Correct end column

In [159]:
full_data["end"] = full_data["start"] + full_data["s_dur"]
full_data

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
0,0,0.000,218.718333,the lost weekend,218.718333,6,36.453056,0.000000,0.000000,1295.579078,1,Opening Image
1,1,218.760,240.740333,the lost weekend,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,Opening Image
2,2,240.782,282.574333,the lost weekend,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,Set-Up
3,3,282.616,313.521333,the lost weekend,30.905333,1,30.905333,0.025210,0.046752,1256.344447,1,Set-Up
4,4,313.563,367.033333,the lost weekend,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,Theme Stated
...,...,...,...,...,...,...,...,...,...,...,...,...
98,114,5796.582,5849.051333,the lost weekend,52.469333,7,7.495619,0.957983,0.958906,1525.009777,1,Final Image
99,115,5849.093,5889.341333,the lost weekend,40.248333,3,13.416111,0.966387,0.967592,1148.518746,1,Final Image
100,116,5889.383,5943.896333,the lost weekend,54.513333,1,54.513333,0.974790,0.974257,1525.009777,1,Final Image
101,117,5943.938,5972.800333,the lost weekend,28.862333,5,5.772467,0.983193,0.983282,1525.009777,1,Final Image


## Load subtitles

In [160]:
import srt

In [161]:
sample_subtitles_file = sample_feature_file.replace('.csv', '.srt')
sample_subtitles_file

'tt0037884_the lost weekend.srt'

In [162]:
sample_subtitles_path = os.path.join(SUBTITLES_PATH, sample_subtitles_file)
sample_subtitles_path

'train/train/subtitles\\tt0037884_the lost weekend.srt'

In [163]:
subtitles = []
with open(sample_subtitles_path, 'r') as file:
    subtitles = list(srt.parse(file.read())) 

In [164]:
def print_sub_info(sub):
    print(f"Subtitle {idx + 1}")
    print(f"Start Time: {sub.start.total_seconds()}")
    print(f"End Time: {sub.end.total_seconds()}")
    print(f"Text: {sub.content}")
    print()

In [165]:
for idx, sub in enumerate(subtitles):
    print_sub_info(sub)

    if idx == 3: break

Subtitle 1
Start Time: 150.32
End Time: 152.926
Text: You'd better take this along, Don.
It's gonna be cold on the farm.

Subtitle 2
Start Time: 153.04
End Time: 154.963
Text: - OK.
- How many shirts are you taking?

Subtitle 3
Start Time: 155.08
End Time: 156.969
Text: - Three.
- I'm taking five.

Subtitle 4
Start Time: 157.08
End Time: 158.969
Text: - Five?
- Yeah, I told them at the office



In [166]:
last_processed_subtitle_idx = 0
movies_subtitles = {}

for scene_idx, scene_row in full_data.iterrows():
    scene_start = scene_row['start']
    scene_end = scene_row['end']
    scene_id = scene_row['scene_id']
    movie_name = scene_row['movie']
    
    print(f"Movie: {movie_name}")
    print(f"Scene: {scene_id}, Start Time - {scene_start}, End Time - {scene_end}")
    print("==================================================================")

    # Create new dictionary for every movie
    if movie_name not in movies_subtitles:
        movies_subtitles[movie_name] = {}

    # Craete new dictionary for every scene within movie
    movies_subtitles[movie_name][scene_id] = []
    
    for idx in range(last_processed_subtitle_idx, len(subtitles)):
        sub = subtitles[idx]
        sub_start = sub.start.total_seconds()
        sub_end = sub.end.total_seconds()
        sub_content = sub.content
        
        if scene_start <= sub_start and scene_end >= sub_end:
            print(f"Subtitle {idx + 1}: Start Time - {sub_start}, End Time - {sub_end}")
            print(f"{sub_content}")
            print()

            # Add subtitle content to the dictionary for the current scene
            movies_subtitles[movie_name][scene_id].append(sub_content)
            
        elif scene_end < sub_end:
            last_processed_subtitle_idx = idx
            break
        else:
            print("SOME OTHER CONDITION!")

Movie: the lost weekend
Scene: 0, Start Time - 0.0, End Time - 218.71833333333333
Subtitle 1: Start Time - 150.32, End Time - 152.926
You'd better take this along, Don.
It's gonna be cold on the farm.

Subtitle 2: Start Time - 153.04, End Time - 154.963
- OK.
- How many shirts are you taking?

Subtitle 3: Start Time - 155.08, End Time - 156.969
- Three.
- I'm taking five.

Subtitle 4: Start Time - 157.08, End Time - 158.969
- Five?
- Yeah, I told them at the office

Subtitle 5: Start Time - 159.08, End Time - 160.969
I might not be back until Tuesday.

Subtitle 6: Start Time - 161.08, End Time - 164.402
We'll get there this afternoon.
That's Friday, Saturday, Sunday, Monday.

Subtitle 7: Start Time - 164.52, End Time - 167.126
We'll make it
a long, wonderful weekend.

Subtitle 8: Start Time - 167.24, End Time - 168.651
Sounds long, all right.

Subtitle 9: Start Time - 168.76, End Time - 171.286
It'll be good for you, Don,
after what you've been through.

Subtitle 10: Start Time - 171.4

In [167]:
movies_subtitles['the lost weekend']

{0: ["You'd better take this along, Don.\nIt's gonna be cold on the farm.",
  '- OK.\n- How many shirts are you taking?',
  "- Three.\n- I'm taking five.",
  '- Five?\n- Yeah, I told them at the office',
  'I might not be back until Tuesday.',
  "We'll get there this afternoon.\nThat's Friday, Saturday, Sunday, Monday.",
  "We'll make it\na long, wonderful weekend.",
  'Sounds long, all right.',
  "It'll be good for you, Don,\nafter what you've been through.",
  'Trees, grass,\nsweet cider and buttermilk',
  "and water from that well\nthat's colder than any other...",
  'Wick, please. Why this emphasis\non liquids? Very dull liquids.',
  'Sorry, Don.',
  "I think it'd be a good idea\nif we took along my typewriter.",
  '- What for?\n- To write.',
  "I'm gonna write there,\nget started on that novel.",
  '- You really feel up to writing?\n- Why not?',
  "After what you've been through.",
  "I haven't touched the stuff\nfor 10 days now.",
  "I know.\nI know you haven't, Don.",
  '- Where

## Subtitles cleaning and combining

In [168]:
for movie_name, scenes in movies_subtitles.items():
    for scene_id, subtitles in scenes.items():
        cleaned_subtitles = []
        for subtitle in subtitles:
            # Remove new line characters and dashes
            cleaned_subtitle = subtitle.replace('\n', ' ').strip()
            cleaned_subtitles.append(cleaned_subtitle)
        combined_text = " ".join(cleaned_subtitles)
        movies_subtitles[movie_name][scene_id] = combined_text

movies_subtitles['the lost weekend'][0]

"You'd better take this along, Don. It's gonna be cold on the farm. - OK. - How many shirts are you taking? - Three. - I'm taking five. - Five? - Yeah, I told them at the office I might not be back until Tuesday. We'll get there this afternoon. That's Friday, Saturday, Sunday, Monday. We'll make it a long, wonderful weekend. Sounds long, all right. It'll be good for you, Don, after what you've been through. Trees, grass, sweet cider and buttermilk and water from that well that's colder than any other... Wick, please. Why this emphasis on liquids? Very dull liquids. Sorry, Don. I think it'd be a good idea if we took along my typewriter. - What for? - To write. I'm gonna write there, get started on that novel. - You really feel up to writing? - Why not? After what you've been through. I haven't touched the stuff for 10 days now. I know. I know you haven't, Don. - Where is the typewriter? - In the living room. In the closet. Kind of towards the back."

## Tokenize and Embeddings

In [169]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

embeddings = []
for movie_name, scenes in movies_subtitles.items():
    for scene_id, combined_text in scenes.items():
        inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()
        movies_subtitles[movie_name][scene_id] = embedding

In [170]:
movies_subtitles['the lost weekend'][0]

[0.04265003651380539,
 0.09052932262420654,
 0.3750017285346985,
 -0.027731701731681824,
 0.32529956102371216,
 0.07981876283884048,
 0.04016394168138504,
 0.7127708792686462,
 -0.12875647842884064,
 -0.26669394969940186,
 0.12519410252571106,
 -0.3048456907272339,
 -0.24864239990711212,
 0.02924877032637596,
 -0.11349087208509445,
 0.5400148034095764,
 0.21039463579654694,
 -0.09099382162094116,
 -0.1361260861158371,
 0.34978634119033813,
 0.4703497290611267,
 -0.08998460322618484,
 0.0726136565208435,
 0.6392247080802917,
 0.433660626411438,
 0.15803928673267365,
 0.05033893138170242,
 -0.029054099693894386,
 -0.1713666468858719,
 -0.3491516709327698,
 0.39464908838272095,
 0.026236260309815407,
 -0.3017357289791107,
 -0.17341116070747375,
 0.22681188583374023,
 -0.1017628014087677,
 -0.06641979515552521,
 -0.1900654137134552,
 0.01860838383436203,
 0.12258177995681763,
 -0.583093523979187,
 -0.26915350556373596,
 -0.22954396903514862,
 -0.06598791480064392,
 -0.1136033833026886,
 0.

## Merge with Full Data

In [171]:
embeddings = []
for index, row in full_data.iterrows():
    movie_name = row['movie']
    scene_id = row['scene_id']
    
    embedding = movies_subtitles.get(movie_name, {}).get(scene_id, None)
    
    embeddings.append(embedding)

full_data.insert(full_data.columns.get_loc('label'), 'embedding', embeddings)
full_data.head()

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,embedding,label
0,0,0.0,218.718333,the lost weekend,218.718333,6,36.453056,0.0,0.0,1295.579078,1,"[0.04265003651380539, 0.09052932262420654, 0.3...",Opening Image
1,1,218.76,240.740333,the lost weekend,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,"[-0.0005472705815918744, -0.09439108520746231,...",Opening Image
2,2,240.782,282.574333,the lost weekend,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,"[0.021815838292241096, 0.10321275889873505, 0....",Set-Up
3,3,282.616,313.521333,the lost weekend,30.905333,1,30.905333,0.02521,0.046752,1256.344447,1,"[0.1034255176782608, -0.006280767731368542, 0....",Set-Up
4,4,313.563,367.033333,the lost weekend,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,"[0.07503195106983185, -0.03006332740187645, 0....",Theme Stated


## First Model

### Tuning hyperparameters

In [174]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
print(len(full_data))
full_data_clean = full_data.dropna()
print(len(full_data_clean))

embeddings = np.array(full_data_clean['embedding'].tolist())

X = full_data_clean[['s_dur', 'n_shots', 'ava_shot_dur', 'rel_id_loc', 'rel_t_loc', 'ava_char_score', 'is_prot_appear']]
X = np.hstack((X, embeddings))
y = full_data_clean['label']

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(RandomForestClassifier(random_state=23), param_distributions=param_dist, n_iter=10, cv=5, random_state=23, n_jobs=-1)
random_search.fit(X, y)

print("Best parameters:", random_search.best_params_)

103
97




Best parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 10}


### Cross-Validation

In [175]:
model = RandomForestClassifier(random_state=23, n_estimators=200, min_samples_split=10, min_samples_leaf=4, max_depth=10)
scores = cross_val_score(model, X, y, cv=5)
print(f"Mean Accuracy from cross-validation: {np.mean(scores)}")



Mean Accuracy from cross-validation: 0.3405263157894737
