## Imports and global variables

In [22]:
import os
import re
import pandas as pd
import numpy as np
import torch

In [23]:
TIMESTAMPS_PATH = 'train/train/scene_timestamps'
FEATURES_PATH = 'train/train/features'
LABELS_PATH = 'train/train/labels'
SUBTITLES_PATH = 'train/train/subtitles'

TEST_TIMESTAMPS_PATH = 'test/test/scene_timestamps'
TEST_FEATURES_PATH = 'test/test/features'
TEST_SUBTITLES_PATH = 'test/test/subtitles'

In [24]:
# Check GPU available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load data

### Helper functions

In [43]:
def load_files(path):
    if os.path.exists(path):
        return os.listdir(path)
    else:
        print("PATH DOES NOT EXIST!")
        return []

In [26]:
def get_movie_name(file):
    return file[10:-15]

def get_movie_id(file):
    return file[:9]

In [64]:
def get_feature_file(file):
    return file.replace("_timestamps", "")

In [28]:
def prepare_csv(path, movie_name):
    df = pd.read_csv(path)
    df.rename(columns={"Unnamed: 0": "scene_id"}, inplace=True)
    df["movie"] = movie_name
    return df

In [70]:
def extract_movie_info(files):
    movie_ids, movies = [], []
    for file in files:
        movie_name = get_movie_name(file)
        movie_id = get_movie_id(file)
        movie_ids.append(movie_id)
        movies.append(movie_name)
    return movie_ids, movies

def prepare_dataframes(files, movies, path_func, data_func):
    dfs = []
    for idx, file in enumerate(files):
        path = path_func(file)
        df = data_func(path, movies[idx])
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

def merge_dataframes(df1, df2):
    test_full_data = pd.merge(df1, df2, on=["scene_id", "movie"])
    test_full_data["end"] = test_full_data["start"] + test_full_data["s_dur"]
    return test_full_data

### Files

In [56]:
train_files = load_files(TIMESTAMPS_PATH)
# remove australia because australia subtitles are in diffrent format than everything else
train_files.remove("tt0455824_australia_timestamps.csv")

test_files = load_files(TEST_TIMESTAMPS_PATH)

### Movies

In [57]:
train_movies_ids, train_movies = extract_movie_info(train_files)
test_movies_ids, test_movies = extract_movie_info(test_files)

### Timestamps

In [61]:
train_timestamps = prepare_dataframes(
    train_files,
    train_movies,
    lambda file: os.path.join(TIMESTAMPS_PATH, file), 
    prepare_csv
)

test_timestamps = prepare_dataframes(
    test_files,
    test_movies,
    lambda file: os.path.join(TEST_TIMESTAMPS_PATH, file), 
    prepare_csv
)

### Features

In [63]:
train_features = prepare_dataframes(
    train_files,
    train_movies,
    lambda file: os.path.join(FEATURES_PATH, get_feature_file(file)), 
    prepare_csv
)

test_features = prepare_dataframes(
    test_files,
    test_movies,
    lambda file: os.path.join(TEST_FEATURES_PATH, get_feature_file(file)), 
    prepare_csv
)

### Labels

In [66]:
dfs = []
for idx, file in enumerate(train_files):
    # print(feature_files[idx])
    labels_path = os.path.join(LABELS_PATH, get_feature_file(file))
    
    df = pd.read_csv(labels_path, keep_default_na=False)
    df.rename(
        columns={
            "Unnamed: 0": "scene_id",
            "0": "label"
        }, inplace=True
    )
    df["movie"] = movies[idx]
    
    dfs.append(df)

train_labels = pd.concat(dfs, ignore_index=True)
# print(train_labels.shape)
# train_labels.tail()

### Merge dataframes

In [84]:
train_merged = merge_dataframes(train_timestamps, train_features)
# additionaly merge labels to train data
train_full_data = pd.merge(train_merged, train_labels, on=["scene_id", "movie"], how="outer")
print(train_full_data.shape)

test_full_data = merge_dataframes(test_timestamps, test_features)
print(test_full_data.shape)


(3729, 12)
(2470, 11)


In [12]:
# dfs = []
# for idx, file in enumerate(files):
#     # print(feature_files[idx])
#     labels_path = os.path.join(LABELS_PATH, feature_files[idx])
    
#     df = pd.read_csv(labels_path, keep_default_na=False)
#     df.rename(
#         columns={
#             "Unnamed: 0": "scene_id",
#             "0": "label"
#         }, inplace=True
#     )
#     df["movie"] = movies[idx]
    
#     dfs.append(df)

# labels = pd.concat(dfs, ignore_index=True)
# print(labels.shape)
# labels.tail()

(3729, 3)


Unnamed: 0,scene_id,label,movie
3724,220,Finale,dallas buyers club
3725,221,Finale,dallas buyers club
3726,224,Final Image,dallas buyers club
3727,226,Final Image,dallas buyers club
3728,227,Final Image,dallas buyers club


In [74]:
# timestamps_features = pd.merge(timestamps, features, on=["scene_id", "movie"], how="outer")
# full_data = pd.merge(timestamps_features, labels, on=["scene_id", "movie"], how="outer")

# print(full_data.shape)
# full_data.tail()

In [14]:
# full_data["end"] = full_data["start"] + full_data["s_dur"]
# full_data.tail()

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
3724,220,6499.655,6574.354333,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,Finale
3725,221,6574.396,6579.776333,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.1087,0,Finale
3726,224,6589.161,6623.987333,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,Final Image
3727,226,6629.701,6682.462333,dallas buyers club,52.761333,19,2.776912,0.9869,0.944811,1076.267498,1,Final Image
3728,227,6682.504,6688.176333,dallas buyers club,5.672333,4,1.418083,0.991266,0.952336,2085.459362,1,Final Image


## Ssubtitles

In [85]:
import srt

### Helper runctions

In [120]:
problematic_movies = ["the ugly truth", "the social network", "the girl with the dragon tattoo"]

In [131]:
def convert_files_to_subtitles(files):
    return [file.replace('.csv', '.srt') for file in files]

def load_subtitles(paths, movies):
    movie_subtitles = {}
    for idx, movie_name in enumerate(movies):
        # print(movie_name)
        
        if movie_name == "pretty woman":
            with open(paths[idx], 'r', encoding='utf-16') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
        elif movie_name in problematic_movies:
            with open(paths[idx], 'r', encoding='utf-8', errors='replace') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
        else:
            with open(paths[idx], 'r', encoding='utf-8') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
                
    return movie_subtitles

def associate_scenes_with_subtitles(full_data, movie_subtitles):
    movies_scenes_subtitles = {}
    last_processed_subtitle_idx = 0
    missed_subtitles = 0

    for scene_idx, scene_row in full_data.iterrows():
        scene_start, scene_end = scene_row['start'], scene_row['end']
        scene_id, movie_name = scene_row['scene_id'], scene_row['movie']
        
        # Create new dictionary for every movie
        if movie_name not in movies_scenes_subtitles:
            movies_scenes_subtitles[movie_name] = {}
            last_processed_subtitle_idx = 0

        # Craete new dictionary for every scene within movie
        movies_scenes_subtitles[movie_name][scene_id] = []

        current_movie_subtitles = movie_subtitles[movie_name]
        for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
            sub = current_movie_subtitles[idx]
            sub_start, sub_end = sub.start.total_seconds(), sub.end.total_seconds()

            # Some subtitles start just before scene_start
            if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
                # Add subtitle content to the dictionary for the current scene
                movies_scenes_subtitles[movie_name][scene_id].append(sub.content)
            elif scene_end < sub_end:
                last_processed_subtitle_idx = idx
                break
            else:
                missed_subtitles += 1

    print(f"Ignored subtitltes: {missed_subtitles}")
    return movies_scenes_subtitles

In [135]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [136]:
def calculate_embeddings(movies_scenes_subtitles):
    embeddings = {}
    for movie_name, scenes in movies_scenes_subtitles.items():
        for scene_id, combined_text in scenes.items():
            inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()
            embeddings.setdefault(movie_name, {})[scene_id] = embedding

    return embeddings

### Get subtitle files

In [129]:
train_features_files = load_files(FEATURES_PATH)
# remove australia because australia subtitles are in diffrent format than everything else
train_features_files.remove('tt0455824_australia.csv')
test_features_files = load_files(TEST_FEATURES_PATH)

train_subtitles_files = convert_files_to_subtitles(train_features_files)
train_subtitle_paths = [os.path.join(SUBTITLES_PATH, subtitle_file) for subtitle_file in train_subtitles_files]

test_subtitles_files = convert_files_to_subtitles(test_features_files)
test_subtitle_paths = [os.path.join(TEST_SUBTITLES_PATH, subtitle_file) for subtitle_file in test_subtitles_files]

### Load subtitles 

In [130]:
train_movie_subtitles = load_subtitles(train_subtitle_paths, train_movies)
test_movie_subtitles = load_subtitles(test_subtitle_paths, test_movies)

### Associate Scenes with subtitles

In [133]:
train_movies_scenes_subtitles = associate_scenes_with_subtitles(train_full_data, train_movie_subtitles)
test_movies_scenes_subtitles = associate_scenes_with_subtitles(test_full_data, test_movie_subtitles)

Ignored subtitltes: 3018
Ignored subtitltes: 3165


## Subtitles cleaning and combining

In [23]:
for movie_name, scenes in movies_scenes_subtitles.items():
    for scene_id, subtitles in scenes.items():
        cleaned_subtitles = []
        for subtitle in subtitles:
            # Remove new line characters
            cleaned_subtitle = subtitle.replace('\n', ' ').strip()
            cleaned_subtitle = re.sub(r'<.*?>', '', cleaned_subtitle)
            cleaned_subtitle = re.sub(r'♪', '', cleaned_subtitle)
            cleaned_subtitles.append(cleaned_subtitle)
        combined_text = " ".join(cleaned_subtitles)
        movies_scenes_subtitles[movie_name][scene_id] = combined_text

In [24]:
movies_scenes_subtitles['four weddings and a funeral'][3]

" Although I can't dismiss   The memory of her kiss   I guess he's not for me "

## Tokenize and Embeddings

In [None]:
# from transformers import BertTokenizer, BertModel

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased').to(device)

# embeddings = []
# for movie_name, scenes in movies_scenes_subtitles.items():
#     for scene_id, combined_text in scenes.items():
#         inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
#         inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to GPU
#         outputs = model(**inputs)
#         embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().tolist() # move outputs to CPU
#         movies_scenes_subtitles[movie_name][scene_id] = embedding

In [None]:
# movies_scenes_subtitles['pretty woman'][1]

## Merge with Full Data

In [25]:
# embeddings = []
# for index, row in full_data.iterrows():
#     movie_name = row['movie']
#     scene_id = row['scene_id']
    
#     embedding = movies_scenes_subtitles.get(movie_name, {}).get(scene_id, None)
    
#     embeddings.append(embedding)

# full_data.insert(full_data.columns.get_loc('label'), 'embedding', embeddings)
# full_data.tail()

from sklearn.preprocessing import LabelEncoder

data = []
for movie, scenes in movies_scenes_subtitles.items():
    for scene_number, text in scenes.items():
        data.append({"movie": movie, "scene": scene_number, "text": text})
scenes_df = pd.DataFrame(data)

full_data.insert(full_data.columns.get_loc('label'), 'text', scenes_df['text'])

label_encoder = LabelEncoder()
full_data['encoded_label'] = label_encoder.fit_transform(full_data['label'])

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,text,label,encoded_label
0,0,0.000,218.718333,the lost weekend,218.718333,6,36.453056,0.000000,0.000000,1295.579078,1,"You'd better take this along, Don. It's gonna ...",Opening Image,13
1,1,218.760,240.740333,the lost weekend,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,Are you sure it's in the closet? I can't find ...,Opening Image,13
2,2,240.782,282.574333,the lost weekend,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,"- Did you find it? - Oh, sure, sure. Here it i...",Set-Up,14
3,3,282.616,313.521333,the lost weekend,30.905333,1,30.905333,0.025210,0.046752,1256.344447,1,The new Thurber book with comical jokes and pi...,Set-Up,14
4,4,313.563,367.033333,the lost weekend,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,Nobody? What are they playing? Brahms' 2nd Sym...,Theme Stated,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3724,220,6499.655,6574.354333,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,to be mentally healthy or physically healthy. ...,Finale,9
3725,221,6574.396,6579.776333,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.108700,0,DAVID: We lost. (SIGHS),Finale,9
3726,224,6589.161,6623.987333,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,What?,Final Image,8
3727,226,6629.701,6682.462333,dallas buyers club,52.761333,19,2.776912,0.986900,0.944811,1076.267498,1,"MAN ON PA: Yes, indeed, the number one spectat...",Final Image,8


## Calculate embeddings

In [None]:
train_movies_scenes_embeddings = calculate_embeddings(train_movies_scenes_subtitles)

train_embeddings = []
for index, row in train_full_data.iterrows():
    movie_name, scene_id = row['movie'], row['scene_id']
    embedding = train_movies_scenes_embeddings.get(movie_name, {}).get(scene_id, None)
    train_embeddings.append(embedding)

test_full_data.insert(test_full_data.columns.get_loc('label'), 'embedding', train_embeddings)
print(test_full_data.tail())

In [None]:
test_movies_scenes_embeddings = calculate_embeddings(test_movies_scenes_subtitles)

test_embeddings = []
for index, row in test_full_data.iterrows():
    movie_name, scene_id = row['movie'], row['scene_id']
    embedding = test_movies_scenes_embeddings.get(movie_name, {}).get(scene_id, None)
    test_embeddings.append(embedding)

test_full_data.insert(test_full_data.columns.get_loc('label'), 'embedding', test_embeddings)
print(test_full_data.tail())

In [None]:
full_data.head()

## First Model

### Tuning hyperparameters

In [None]:
# from sklearn.model_selection import cross_val_score, RandomizedSearchCV
# from sklearn.ensemble import RandomForestClassifier

# full_data_clean = full_data.dropna()

# embeddings = np.array(full_data_clean['embedding'].tolist())

# X = full_data_clean[['s_dur', 'n_shots', 'ava_shot_dur', 'rel_id_loc', 'rel_t_loc', 'ava_char_score', 'is_prot_appear']]
# X = np.hstack((X, embeddings))
# y = full_data_clean['label']

# param_dist = {
#     'n_estimators': [210, 240, 200, 225, 250],
#     'min_samples_split': [15, 25, 35],
#     'min_samples_leaf': [10, 15, 20],
#     'max_depth': [10, 15, 20, 25, 30, 50]
# }

# random_search = RandomizedSearchCV(RandomForestClassifier(random_state=23), param_distributions=param_dist, n_iter=10, cv=5, random_state=23, n_jobs=-1)
# random_search.fit(X, y)

# print("Best parameters:", random_search.best_params_)

### Cross-Validation

In [None]:
# model = RandomForestClassifier(random_state=23, n_estimators=200, min_samples_split=15, min_samples_leaf=20, max_depth=15)
# model.fit(X, y)
# y_pred = model.predict(X)
# scores = cross_val_score(model, X, y, cv=5)
# print(f"Mean Accuracy from cross-validation: {np.mean(scores)}")

### Scores

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# from sklearn.model_selection import cross_val_predict

# _, levels = pd.factorize(full_data_clean['label'])
# print(classification_report(y,y_pred,target_names=levels))

# accuracy = accuracy_score(y, y_pred)
# print()
# print(f"Accuracy: {accuracy}")

In [None]:
# Assuming 'model', 'X', and 'y' are already defined
# predicted = cross_val_predict(model, X, y, cv=5)
# conf_matrix = confusion_matrix(y, y_pred)

# print("Confusion Matrix:")
# print(conf_matrix)

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(figsize=(12,10))
# sns.heatmap(conf_matrix, fmt='g', ax=ax, annot=True)

## Second Model

In [27]:
from transformers import BertModel, DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


### Train test split

In [28]:
X = full_data.drop(columns=['movie', 'encoded_label'])
y = full_data['encoded_label']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=23, test_size=0.3, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=23, test_size=0.5, stratify=y_temp)

train_texts = X_train['text']
val_texts = X_val['text']
test_texts = X_test['text']

train_labels = y_train
val_labels = y_val
test_labels = y_test


### Tokenizer

In [29]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def encode_texts(texts, labels):
    encodings = tokenizer(texts.tolist(), truncation=True, padding=True, return_tensors="pt")
    labels = torch.tensor(labels)
    return encodings, labels

train_encodings, train_labels = encode_texts(train_texts, train_labels.values)
val_encodings, val_labels = encode_texts(val_texts, val_labels.values)
test_encodings, test_labels = encode_texts(test_texts, test_labels.values)

### DataLoader

In [33]:
train_labels

tensor([12,  1,  9,  ...,  1,  9,  9], dtype=torch.int32)

In [34]:
train_dataset = TensorDataset(train_encodings['input_ids'], 
                              train_encodings['attention_mask'], 
                              train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], 
                            val_encodings['attention_mask'], 
                            val_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], 
                             test_encodings['attention_mask'], 
                             test_labels)

batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Trening and Validation

In [35]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_encoder.classes_))
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

def calculate_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Trening Model
epochs = 12
for epoch in range(epochs):
    model.train()
    total_loss, total_accuracy = 0, 0

    for step, batch in enumerate(train_loader):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        
        model.zero_grad()
        b_labels = b_labels.long()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs} - Loss: {avg_train_loss}')

    # Validation
    model.eval()
    total_eval_accuracy = 0

    for batch in val_loader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += calculate_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(val_loader)
    print(f'Epoch {epoch+1}/{epochs} - Validation Accuracy: {avg_val_accuracy}')

print("Training complete")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12 - Loss: 2.4764768867361235
Epoch 1/12 - Validation Accuracy: 0.1625
Epoch 2/12 - Loss: 2.446542373842704
Epoch 2/12 - Validation Accuracy: 0.1529761904761905


### Testing Model

In [None]:
model.eval()
total_test_accuracy = 0

for batch in test_loader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    total_test_accuracy += calculate_accuracy(logits, label_ids)

avg_test_accuracy = total_test_accuracy / len(test_loader)
print(f'Test Accuracy: {avg_test_accuracy}')

## Prepare testing data