## Imports and global variables

In [1]:
import os
import re
import pandas as pd
import numpy as np
import torch

In [2]:
TIMESTAMPS_PATH = 'train/train/scene_timestamps'
FEATURES_PATH = 'train/train/features'
LABELS_PATH = 'train/train/labels'
SUBTITLES_PATH = 'train/train/subtitles'

TEST_TIMESTAMPS_PATH = 'test/test/scene_timestamps'
TEST_FEATURES_PATH = 'test/test/features'
TEST_SUBTITLES_PATH = 'test/test/subtitles'

In [3]:
# Check GPU available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load data

### Helper functions

In [4]:
def load_files(path):
    if os.path.exists(path):
        return os.listdir(path)
    else:
        print("PATH DOES NOT EXIST!")
        return []

In [5]:
def get_movie_name(file):
    return file[10:-15]

def get_movie_id(file):
    return file[:9]

In [6]:
def get_feature_file(file):
    return file.replace("_timestamps", "")

In [7]:
def prepare_csv(path, movie_name):
    df = pd.read_csv(path)
    df.rename(columns={"Unnamed: 0": "scene_id"}, inplace=True)
    df["movie"] = movie_name
    return df

In [8]:
def extract_movie_info(files):
    movie_ids, movies = [], []
    for file in files:
        movie_name = get_movie_name(file)
        movie_id = get_movie_id(file)
        movie_ids.append(movie_id)
        movies.append(movie_name)
    return movie_ids, movies

def prepare_dataframes(files, movies, path_func, data_func):
    dfs = []
    for idx, file in enumerate(files):
        path = path_func(file)
        df = data_func(path, movies[idx])
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

def merge_dataframes(df1, df2):
    test_full_data = pd.merge(df1, df2, on=["scene_id", "movie"])
    test_full_data["end"] = test_full_data["start"] + test_full_data["s_dur"]
    return test_full_data

### Files

In [9]:
train_files = load_files(TIMESTAMPS_PATH)
# remove australia because australia subtitles are in diffrent format than everything else
train_files.remove("tt0455824_australia_timestamps.csv")

test_files = load_files(TEST_TIMESTAMPS_PATH)

### Movies

In [10]:
train_movies_ids, train_movies = extract_movie_info(train_files)
test_movies_ids, test_movies = extract_movie_info(test_files)

### Timestamps

In [11]:
train_timestamps = prepare_dataframes(
    train_files,
    train_movies,
    lambda file: os.path.join(TIMESTAMPS_PATH, file), 
    prepare_csv
)

test_timestamps = prepare_dataframes(
    test_files,
    test_movies,
    lambda file: os.path.join(TEST_TIMESTAMPS_PATH, file), 
    prepare_csv
)

### Features

In [12]:
train_features = prepare_dataframes(
    train_files,
    train_movies,
    lambda file: os.path.join(FEATURES_PATH, get_feature_file(file)), 
    prepare_csv
)

test_features = prepare_dataframes(
    test_files,
    test_movies,
    lambda file: os.path.join(TEST_FEATURES_PATH, get_feature_file(file)), 
    prepare_csv
)

### Labels

In [13]:
dfs = []
for idx, file in enumerate(train_files):
    # print(feature_files[idx])
    labels_path = os.path.join(LABELS_PATH, get_feature_file(file))
    
    df = pd.read_csv(labels_path, keep_default_na=False)
    df.rename(
        columns={
            "Unnamed: 0": "scene_id",
            "0": "label"
        }, inplace=True
    )
    df["movie"] = train_movies[idx]
    
    dfs.append(df)

train_labels = pd.concat(dfs, ignore_index=True)
# print(train_labels.shape)
# train_labels.tail()

### Merge dataframes

In [14]:
train_merged = merge_dataframes(train_timestamps, train_features)
# additionaly merge labels to train data
train_full_data = pd.merge(train_merged, train_labels, on=["scene_id", "movie"], how="outer")
print(train_full_data.shape)

test_full_data = merge_dataframes(test_timestamps, test_features)
print(test_full_data.shape)

(3729, 12)
(2470, 11)


## Join Scenes with subtitles

In [15]:
import srt

### Helper runctions

In [16]:
train_movie_scenes_count = train_full_data.groupby('movie').count()['scene_id']
# print(train_movie_scenes_count)
test_movie_scenes_count = test_full_data.groupby('movie').count()['scene_id']

In [17]:
train_full_data

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
0,0,0.000,218.718333,the lost weekend,218.718333,6,36.453056,0.000000,0.000000,1295.579078,1,Opening Image
1,1,218.760,240.740333,the lost weekend,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,Opening Image
2,2,240.782,282.574333,the lost weekend,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,Set-Up
3,3,282.616,313.521333,the lost weekend,30.905333,1,30.905333,0.025210,0.046752,1256.344447,1,Set-Up
4,4,313.563,367.033333,the lost weekend,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,Theme Stated
...,...,...,...,...,...,...,...,...,...,...,...,...
3724,220,6499.655,6574.354333,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,Finale
3725,221,6574.396,6579.776333,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.108700,0,Finale
3726,224,6589.161,6623.987333,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,Final Image
3727,226,6629.701,6682.462333,dallas buyers club,52.761333,19,2.776912,0.986900,0.944811,1076.267498,1,Final Image


In [18]:
problematic_movies = ["the ugly truth", "the social network", "the girl with the dragon tattoo"]

In [19]:
def convert_files_to_subtitles(files):
    return [file.replace('.csv', '.srt') for file in files]

def load_subtitles(paths, movies):
    movie_subtitles = {}
    for idx, movie_name in enumerate(movies):
        # print(movie_name)
        
        if movie_name == "pretty woman":
            with open(paths[idx], 'r', encoding='utf-16') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
        elif movie_name in problematic_movies:
            with open(paths[idx], 'r', encoding='utf-8', errors='replace') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
        else:
            with open(paths[idx], 'r', encoding='utf-8') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
                
    return movie_subtitles

def associate_scenes_with_subtitles(full_data, movie_subtitles):
    movies_scenes_subtitles = {}
    additional_data = []
    last_processed_subtitle_idx = 0
    missed_subtitles = 0

    for scene_idx, scene_row in full_data.iterrows():
        scene_start, scene_end = scene_row['start'], scene_row['end']
        scene_id, movie_name = scene_row['scene_id'], scene_row['movie']
        
        # Create new dictionary for every movie
        if movie_name not in movies_scenes_subtitles:
            movies_scenes_subtitles[movie_name] = {}
            last_processed_subtitle_idx = 0

        # Craete new dictionary for every scene within movie
        movies_scenes_subtitles[movie_name][scene_id] = []

        current_movie_subtitles = movie_subtitles[movie_name]
        sentence_count = 0
        for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
            sub = current_movie_subtitles[idx]
            sub_start, sub_end = sub.start.total_seconds(), sub.end.total_seconds()

            # Some subtitles start just before scene_start
            if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
                # Add subtitle content to the dictionary for the current scene
                movies_scenes_subtitles[movie_name][scene_id].append(sub.content)
                sentence_count += sub.content.count('.') + sub.content.count('!') + sub.content.count('?')
            elif scene_end < sub_end:
                last_processed_subtitle_idx = idx
                break
            else:
                missed_subtitles += 1
        
        additional_data.append({
            'movie': movie_name,
            'scene_id': scene_id,
            'sentence_count': sentence_count,
        })

    print(f"Ignored subtitltes: {missed_subtitles}")
    additional_df = pd.DataFrame(additional_data)
    return movies_scenes_subtitles, additional_df

In [20]:
def format_data(name, data):
    formatted_data = "{:.0f}".format(data)
    # print(f"{name}{formatted_data}")
    return name + formatted_data


def map_scene_location_to_category(x):
    scene_mapping = {
        (0, 1): "Opening",
        (2, 15): "Setup",
        (16, 19): "Debate",
        (20, 49): "Story",
        (50, 75): "BadGuys",
        (76, 89): "Ending",
        (90, 99): "Finale",
        (99, float('inf')): "FinalImage"
    }

    for percentage_range, category in scene_mapping.items():
        if percentage_range[0] <= x <= percentage_range[1]:
            return category

    return "InvalidValue"


def associate_scenes_with_subtitles_extra_info(full_data, movie_subtitles, movie_scenes_count):
    movies_scenes_subtitles = {}
    last_processed_subtitle_idx = 0
    missed_subtitles = 0

    for scene_idx, scene_row in full_data.iterrows():
        scene_start, scene_end = scene_row['start'], scene_row['end']
        scene_id, movie_name = scene_row['scene_id'], scene_row['movie']
        scene_location, time_location = scene_row['rel_id_loc'], scene_row['rel_t_loc'] 	
        prot_appear = scene_row['is_prot_appear']
        scene_count = movie_scenes_count[movie_name]

        movie_name_without_spaces = movie_name.replace(" ", "")
        category = map_scene_location_to_category(int(scene_location * 100))
        
        scene_location_result = format_data("SceneLocation", scene_location * 100)
        time_location_result = format_data("TimeLocation", time_location * 100)
        category_result = f"category{category}"
        movie_name_result = f"MovieName{movie_name_without_spaces}"
        scene_start_result = format_data("SceneStart", scene_start)
        scene_end_result = format_data("SceneEnd", scene_end)
        prot_appear_result = format_data("ProtAppear", prot_appear)
        scene_count_result = format_data("SceneCount", scene_count)
        
        # Create new dictionary for every movie
        if movie_name not in movies_scenes_subtitles:
            movies_scenes_subtitles[movie_name] = {}
            last_processed_subtitle_idx = 0

        # Craete new dictionary for every scene within movie
        movies_scenes_subtitles[movie_name][scene_id] = []
        movies_scenes_subtitles[movie_name][scene_id].append(scene_location_result)
        movies_scenes_subtitles[movie_name][scene_id].append(time_location_result)
        movies_scenes_subtitles[movie_name][scene_id].append(category_result)
        movies_scenes_subtitles[movie_name][scene_id].append(movie_name_result)
        movies_scenes_subtitles[movie_name][scene_id].append(scene_start_result)
        movies_scenes_subtitles[movie_name][scene_id].append(scene_end_result)
        movies_scenes_subtitles[movie_name][scene_id].append(prot_appear_result)
        movies_scenes_subtitles[movie_name][scene_id].append(scene_count_result)

        current_movie_subtitles = movie_subtitles[movie_name]
        for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
            sub = current_movie_subtitles[idx]
            sub_start, sub_end = sub.start.total_seconds(), sub.end.total_seconds()

            # Some subtitles start just before scene_start
            if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
                # Add subtitle content to the dictionary for the current scene
                movies_scenes_subtitles[movie_name][scene_id].append(sub.content)
            elif scene_end < sub_end:
                last_processed_subtitle_idx = idx
                break
            else:
                missed_subtitles += 1
        
    print(f"Ignored subtitltes: {missed_subtitles}")
    return movies_scenes_subtitles

### Get subtitle files

In [21]:
train_features_files = load_files(FEATURES_PATH)
# remove australia because australia subtitles are in diffrent format than everything else
train_features_files.remove('tt0455824_australia.csv')
test_features_files = load_files(TEST_FEATURES_PATH)

train_subtitles_files = convert_files_to_subtitles(train_features_files)
train_subtitle_paths = [os.path.join(SUBTITLES_PATH, subtitle_file) for subtitle_file in train_subtitles_files]

test_subtitles_files = convert_files_to_subtitles(test_features_files)
test_subtitle_paths = [os.path.join(TEST_SUBTITLES_PATH, subtitle_file) for subtitle_file in test_subtitles_files]

### Load subtitles 

In [22]:
train_movie_subtitles = load_subtitles(train_subtitle_paths, train_movies)
test_movie_subtitles = load_subtitles(test_subtitle_paths, test_movies)

### Associate Scenes with subtitles

In [23]:
train_movies_scenes_subtitles, additional_train_data = associate_scenes_with_subtitles(train_full_data, train_movie_subtitles)
test_movies_scenes_subtitles, additional_test_data = associate_scenes_with_subtitles(test_full_data, test_movie_subtitles)

Ignored subtitltes: 3018
Ignored subtitltes: 3165


## Subtitles cleaning and combining

### Helper functions

In [24]:
def clean_subtitles(movies_scenes_subtitles):
    for movie_name, scenes in movies_scenes_subtitles.items():
        for scene_id, subtitles in scenes.items():
            cleaned_subtitles = []
            for subtitle in subtitles:
                # Remove new line characters
                cleaned_subtitle = subtitle.replace('\n', ' ').strip()
                cleaned_subtitle = re.sub(r'<.*?>', '', cleaned_subtitle)
                cleaned_subtitle = re.sub(r'♪', '', cleaned_subtitle)
                cleaned_subtitles.append(cleaned_subtitle)
            
            combined_text = " ".join(cleaned_subtitles)
            movies_scenes_subtitles[movie_name][scene_id] = combined_text
            
    return movies_scenes_subtitles

def clean_subtitles_only_words(movies_scenes_subtitles):
    for movie_name, scenes in movies_scenes_subtitles.items():
        for scene_id, subtitles in scenes.items():
            cleaned_subtitles = []
            for subtitle in subtitles:
                cleaned_subtitle = subtitle.replace('\n', ' ').strip()
                cleaned_subtitle = re.sub(r'<.*?>', '', cleaned_subtitle)
                cleaned_subtitle = re.sub(r'[^a-zA-Z0-9\s\']', '', cleaned_subtitle)
                cleaned_subtitles.append(cleaned_subtitle)
            
            combined_text = " ".join(cleaned_subtitles)
            movies_scenes_subtitles[movie_name][scene_id] = combined_text
            
    return movies_scenes_subtitles

In [25]:
clean_train_movies_scenes_subtitles = clean_subtitles(train_movies_scenes_subtitles)
clean_train_movies_scenes_subtitles['four weddings and a funeral'][3]

" Although I can't dismiss   The memory of her kiss   I guess he's not for me "

In [26]:
clean_test_movies_scenes_subtitles = clean_subtitles(test_movies_scenes_subtitles)
clean_test_movies_scenes_subtitles['gone girl'][3]

"His majesty prefers not to be moistened. I got you a present. Oh. I hated this game. You loved it. You loved it. Thank you. I'll add it to the collection. Can you pour me a bourbon? (SIGHS) What's up, Jitters? (BREATHES DEEPLY) Well, if you're not going to talk... I'm gonna have to fill the silence... with another excruciating story by Margo Dunne. I could tell you about my recent customer service experience... changing Internet service providers. I like that one. Or how about the time... I saw that woman who looked exactly like my friend Monica? But it wasn't Monica. It was a total stranger. Who was also named Monica. Made it kind of interesting. It's great. I'm just having a bad day. Amy? It's our anniversary. Five years. Five? That came fast. And furious."

## Tokenize and Embeddings

### Helper functions

In [27]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
def calculate_embeddings(movies_scenes_subtitles):
    embeddings = {}
    for movie_name, scenes in movies_scenes_subtitles.items():
        for scene_id, combined_text in scenes.items():
            inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()
            embeddings.setdefault(movie_name, {})[scene_id] = embedding

    return embeddings

In [29]:
train_movies_scenes_embeddings = calculate_embeddings(clean_train_movies_scenes_subtitles)

train_embeddings = []
for index, row in train_full_data.iterrows():
    movie_name, scene_id = row['movie'], row['scene_id']
    embedding = train_movies_scenes_embeddings.get(movie_name, {}).get(scene_id, None)
    train_embeddings.append(embedding)

train_full_data.insert(train_full_data.columns.get_loc('label'), 'embedding', train_embeddings)
train_full_data.tail()

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,embedding,label
3724,220,6499.655,6574.354333,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,"[0.031814027577638626, 0.13735565543174744, 0....",Finale
3725,221,6574.396,6579.776333,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.1087,0,"[0.20158801972866058, 0.11704124510288239, 0.2...",Finale
3726,224,6589.161,6623.987333,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,"[0.31051066517829895, 0.04721043258905411, -0....",Final Image
3727,226,6629.701,6682.462333,dallas buyers club,52.761333,19,2.776912,0.9869,0.944811,1076.267498,1,"[0.051616471260786057, -0.009972779080271721, ...",Final Image
3728,227,6682.504,6688.176333,dallas buyers club,5.672333,4,1.418083,0.991266,0.952336,2085.459362,1,"[0.3686937689781189, 0.022891951724886894, 0.2...",Final Image


In [30]:
test_movies_scenes_embeddings = calculate_embeddings(test_movies_scenes_subtitles)

test_embeddings = []
for index, row in test_full_data.iterrows():
    movie_name, scene_id = row['movie'], row['scene_id']
    embedding = test_movies_scenes_embeddings.get(movie_name, {}).get(scene_id, None)
    test_embeddings.append(embedding)

test_full_data['embedding'] = test_embeddings
test_full_data.tail()

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,embedding
2465,216,8437.638,8460.952333,gone girl,23.314333,10,2.331433,0.972973,0.943599,1400.736946,1,"[-0.06389153748750687, 0.0513589084148407, 0.5..."
2466,217,8460.994,8543.284333,gone girl,82.290333,20,4.114517,0.977477,0.946211,1400.736946,1,"[0.2014482319355011, 0.047157373279333115, 0.2..."
2467,218,8543.326,8565.640333,gone girl,22.314333,8,2.789292,0.981982,0.955418,1400.736946,1,"[0.38489431142807007, 0.030371684581041336, 0...."
2468,219,8565.682,8607.307333,gone girl,41.625333,9,4.625037,0.986486,0.957918,1094.34439,1,"[0.07813607901334763, -0.12171444296836853, 0...."
2469,220,8607.349,8649.390333,gone girl,42.041333,6,7.006889,0.990991,0.962578,1029.305716,1,"[0.40521693229675293, 0.03550683706998825, 0.2..."


## Add additionaly data

In [31]:
train_full_data = pd.merge(train_full_data, additional_train_data, on=["movie", "scene_id"])
test_full_data = pd.merge(test_full_data, additional_test_data, on=["movie", "scene_id"])

## First Model

### Tuning hyperparameters

In [None]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

# embeddings = np.array(train_full_data['embedding'].tolist())
# pca = PCA(n_components=10)
# reduced_embeddings = pca.fit_transform(embeddings)

X = train_full_data[['scene_id', 'rel_id_loc', 'rel_t_loc', 'ava_char_score', 'ava_shot_dur', 'is_prot_appear']]
movie_encoder = LabelEncoder()
X['movie'] = movie_encoder.fit_transform(train_full_data['movie'])
y = train_full_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23, test_size=0.3, stratify=y)

param_dist = {
    'n_estimators': [210, 240, 200, 225, 250],
    'min_samples_split': [15, 25, 35],
    'min_samples_leaf': [10, 15, 20],
    'max_depth': [10, 15, 20, 25, 30, 50]
}

random_search = RandomizedSearchCV(RandomForestClassifier(random_state=23), param_distributions=param_dist, n_iter=10, cv=5, random_state=23, n_jobs=-1)
random_search.fit(X, y)
# random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)

### Cross-Validation

In [None]:
best_params = random_search.best_params_
model = RandomForestClassifier(**best_params, random_state=23)
model.fit(X, y)
# model.fit(X_train, y_train)

y_pred = model.predict(X)
# y_pred = model.predict(X_test)

scores = cross_val_score(model, X, y, cv=5)
# scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"Mean Accuracy from cross-validation on training set: {np.mean(scores)}")

### Scores

In [None]:
_, levels = pd.factorize(train_full_data['label'])
print(classification_report(y,y_pred,target_names=levels))
# print(classification_report(y_test,y_pred,target_names=levels))

accuracy = accuracy_score(y, y_pred)
# accuracy = accuracy_score(y_test, y_pred)
print()
print(f"Accuracy: {accuracy}")

In [None]:
# Assuming 'model', 'X', and 'y' are already defined
predicted = cross_val_predict(model, X, y, cv=5)
conf_matrix = confusion_matrix(y, predicted)

print("Confusion Matrix:")
print(conf_matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(conf_matrix, fmt='g', ax=ax, annot=True)

### Feature importances

In [None]:
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)

feature_names = [f"feature {i}" for i in range(X.shape[1])]
feature_names = X.columns
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

### Predict for real test data

In [None]:
# embeddings = np.array(test_full_data['embedding'].tolist())

X = test_full_data[['scene_id', 'rel_id_loc', 'rel_t_loc', 'ava_char_score', 'ava_shot_dur', 'is_prot_appear']]
movie_encoder = LabelEncoder()
X['movie'] = movie_encoder.fit_transform(test_full_data['movie'])
# X = np.hstack((X, embeddings))

y_pred = model.predict(X)

In [None]:
# print("Shape of X:", X.shape)
# print("Shape of embeddings:", embeddings.shape)
print(y_pred.shape)
print(test_full_data.shape)

## Second Model

In [None]:
from collections import Counter
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

X = train_full_data[['s_dur','n_shots','ava_shot_dur','rel_id_loc','rel_t_loc','ava_char_score','is_prot_appear','sentence_count']]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_full_data['label'])


X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=23, test_size=0.3, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=23, test_size=0.5, stratify=y_temp)

counts = Counter(y_train)
undersample_strategy = {label: min(200, count) for label, count in counts.items()}

oversample_strategy = {label: 200 for label in np.unique(y_train)}

under = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=23)
over = SMOTE(sampling_strategy=oversample_strategy, random_state=23)

X_train_under, y_train_under = under.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = over.fit_resample(X_train_under, y_train_under)

test_class_weights = compute_class_weight('balanced', classes=np.unique(y_train_balanced), y=y_train_balanced)
test_class_weights_dict = dict(zip(np.unique(y_train_balanced), test_class_weights))
test_sample_weight = np.array([test_class_weights_dict[class_label] for class_label in y_train_balanced])

param_dist = {
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
    'reg_lambda': [0.1, 0.5, 1.0, 1.5, 2.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [5, 6, 7, 8, 9, 10],
    'gamma': [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'early_stopping_rounds': [2, 4, 6, 8, 10, 12, 14]
}

xgb = XGBClassifier(random_state=23)

random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=10, cv=5, random_state=23, n_jobs=-1)
random_search.fit(X_train_balanced, y_train_balanced, eval_set=[(X_val, y_val)], verbose=True, sample_weight=test_sample_weight)

print("Best parameters:", random_search.best_params_)

In [None]:
best_params = random_search.best_params_
model = XGBClassifier(**best_params, random_state=23)
model.fit(X_train_balanced, y_train_balanced, eval_set=[(X_val, y_val)], verbose=True, sample_weight=test_sample_weight)

In [None]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

conf_matrix = confusion_matrix(y_test, y_pred)

print("\nConfusion Matrix:")
sns.heatmap(conf_matrix, annot=True, fmt='g')

In [None]:
importances = model.feature_importances_
feature_names = X.columns
# feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importances using XGBoost")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

plt.show()

In [None]:
X = test_full_data[['s_dur','n_shots','ava_shot_dur','rel_id_loc','rel_t_loc','ava_char_score','is_prot_appear','sentence_count']]

y_pred = model.predict(X)
y_pred_labels = label_encoder.inverse_transform(y_pred)
print(y_pred_labels.shape)
print(test_full_data.shape)

## Text classification using BERT (Third Model)

### Helper functions

In [32]:
unique_labels = train_full_data['label'].unique()
label_to_int_mapping = {label: i for i, label in enumerate(unique_labels)}
int_to_label_mapping = {i: label for label, i in label_to_int_mapping.items()}

def map_label_to_int(label):
    return label_to_int_mapping[label]


def map_label_from_int(label):
    return int_to_label_mapping[label] 

In [33]:
def transform_subtitles(subtitles, df):
    transformed_subtitles = []
    for movie_name, scenes in subtitles.items():
        for scene_id, text in scenes.items():
            # Get the label from train_full_data based on movie and scene_id
            label = df[(df['movie'] == movie_name) & (df['scene_id'] == scene_id)]['label'].values[0]
            label = map_label_to_int(label)
            
            result = {'text': text, 'label': label}
            transformed_subtitles.append(result)
    
    return transformed_subtitles


def transform_validation_subtitles(subtitles, df):
    transformed_subtitles = []
    for movie_name, scenes in subtitles.items():
        for scene_id, text in scenes.items():
            result = {'text': text}
            transformed_subtitles.append(result)
    
    return transformed_subtitles
    

def print_head_transformed_subtitles(subtiltes):
    for i in range(min(5, len(subtiltes))):
        print(subtiltes[i])

In [34]:
train_subtitles = associate_scenes_with_subtitles_extra_info(train_full_data, train_movie_subtitles, train_movie_scenes_count)
clean_train_subtitles = clean_subtitles_only_words(train_subtitles)
train_transformed_subtitles = transform_subtitles(clean_train_subtitles, train_full_data)

Ignored subtitltes: 3018


In [None]:
# train_transformed_subtitles

In [35]:
validation_subtitles = associate_scenes_with_subtitles_extra_info(test_full_data, test_movie_subtitles, test_movie_scenes_count)
clean_validation_subtitles = clean_subtitles_only_words(validation_subtitles)
validation_transformed_subtitles = transform_validation_subtitles(clean_validation_subtitles, test_full_data)

Ignored subtitltes: 3165


In [36]:
from sklearn.model_selection import train_test_split

# Extract 'text' and 'label' from each dictionary
texts = [item['text'] for item in train_transformed_subtitles]
labels = [item['label'] for item in train_transformed_subtitles]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)

# # Create training and testing datasets
train_data = [{'text': text, 'label': label} for text, label in zip(train_texts, train_labels)]
test_data = [{'text': text, 'label': label} for text, label in zip(test_texts, test_labels)]

In [37]:
from datasets import Dataset
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
validation_dataset = Dataset.from_list(validation_transformed_subtitles)

print(train_dataset)
print(test_dataset)
print(validation_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 2610
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1119
})
Dataset({
    features: ['text'],
    num_rows: 2470
})


In [47]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer

def choose_model(name):
    if name == "distilbert": 
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=16)
    elif name == "bert":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=16)
    elif name == "roberta":
        tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=16)
    elif name == "roberta-large":
        tokenizer = AutoTokenizer.from_pretrained("roberta-large")
        model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=16)
    else:
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=16)

    return model, tokenizer

model, tokenizer = choose_model("roberta-large")

config.json: 100%|██████████| 482/482 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 9.54MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.80MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.23MB/s]
model.safetensors: 100%|██████████| 1.42G/1.42G [01:03<00:00, 22.4MB/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
# print(stop_words)

In [48]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def preprocess_function(examples):
    examples["text"] = [remove_stopwords(text) for text in examples["text"]]
    tokenized_text = tokenizer(examples["text"], truncation=True)
    
    return tokenized_text

In [49]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True)
tokenized_validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Map:   0%|          | 0/2610 [00:00<?, ? examples/s]

Map: 100%|██████████| 2610/2610 [00:00<00:00, 7023.29 examples/s]
Map: 100%|██████████| 1119/1119 [00:00<00:00, 16222.04 examples/s]
Map: 100%|██████████| 2470/2470 [00:00<00:00, 15945.71 examples/s]


In [50]:
tokenized_test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1119
})

In [51]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [52]:
import evaluate
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return metric.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=18,
    weight_decay=0.01,
    logging_steps=100
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
trainer.evaluate()

In [None]:
predictions = trainer.predict(tokenized_validation_dataset)

In [None]:
predicted_probabilities = predictions.predictions
predicted_label_ids = np.argmax(predicted_probabilities, axis=-1)

label_predictions = [map_label_from_int(label) for label in predicted_label_ids]
print(f"first 10 predictions: {label_predictions[:10]} \nlast 10 predictions: {label_predictions[-10:]}")

## Result data

In [53]:
result_df = pd.DataFrame(columns=['Id', 'Label'])

def fill_result_df(full_data, movies, movies_ids, predictions):
    for idx, scene in full_data.iterrows():
        movie_name = scene['movie']
        movie_index = movies.index(movie_name)
        movie_id = movies_ids[movie_index]
        scene_id = scene['scene_id']
        movie_scene_id = f"{movie_id}_{scene_id}"

        pred_label = predictions[idx]
        result_df.loc[idx] = [movie_scene_id, pred_label]

In [None]:
# First model (RF)
# fill_result_df(test_full_data, test_movies, test_movies_ids, y_pred_labels)

# Third model (Bert)
fill_result_df(test_full_data, test_movies, test_movies_ids, label_predictions)


result_df.head(1)

In [None]:
result_df.to_csv('output_roberta_18.csv', index=False)

## Save model

In [None]:
from transformers import BertConfig

folder_name = "models"

def save_model(model, tokenizer, name):
    model_folder_path = f"./{folder_name}/{name}" 
    model.save_model(model_folder_path)
    tokenizer.save_pretrained(model_folder_path)
    config = BertConfig.from_pretrained(model_folder_path)
    config.save_pretrained(model_folder_path)
    
save_model(trainer, tokenizer, "model_reoberta_18")

## Clean memory

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
import gc
gc.collect()

## Save combined clean data in file

In [None]:
output_file_path = "output_combined_file.txt"  # Set your desired output file

with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for movie, scenes in clean_train_movies_scenes_subtitles.items():
        output_file.write(f"======= {movie} =========\n\n")
        output_file.write(str(scenes))

print("Combined text file saved successfully.")


In [None]:
output_file_path = "output_combined_file2.txt"  # Set your desired output file

with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for movie, scenes in clean_train_subtitles.items():
        output_file.write(f"======= {movie} =========\n\n")
        output_file.write(str(scenes))

print("Combined text file saved successfully.")


## Models Voting

In [76]:
from collections import Counter

def merge_csv_files(folder_path, output_file, file_accuracies):
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    id_labels = {}

    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)
        current_df = pd.read_csv(file_path, keep_default_na=False)

        weight = file_accuracies.get(csv_file, 1) if file_accuracies else 1

        for _, row in current_df.iterrows():
            current_id = row['Id']
            current_label = row['Label']

            if current_id not in id_labels:
                id_labels[current_id] = {'labels': Counter()}

            id_labels[current_id]['labels'][current_label] += 1 * weight

    merged_data = {'Id': [], 'Label': []}
    for current_id, labels in id_labels.items():
        if current_id == 'tt0822832_84':
            print(current_id, labels)
        most_common_label = get_most_common_label(labels)

        merged_data['Id'].append(current_id)
        merged_data['Label'].append(most_common_label)

    merged_df = pd.DataFrame(merged_data)
    merged_df.to_csv(output_file, index=False)

def get_most_common_label(data):
    label_counts = data['labels']
    most_common_label = max(label_counts, key=label_counts.get)
    return most_common_label

In [77]:
folder_path = 'models_voting/'
output_file = 'output_voting.csv'

file_accuracies = {'output_bert_5.csv': 0.47035, 'output_bert_10.csv': 0.47503, 'output_bert_15.csv': 0.47777, 'output_bert_30.csv': 0.44383,
                   'output_roberta_12.csv': 0.47035, 'output_roberta_15.csv': 0.48517, 'output_roberta_50.csv': 0.45555, 'output_random_forest.csv': 0.48439,
                   'output_roberta_large_12.csv': 0.43837}

merge_csv_files(folder_path, output_file, file_accuracies)

tt0822832_84 {'labels': Counter({'None': 1.9009, 'Dark Night of the Soul': 1.84436, 'All Is Lost': 0.45555})}
