## Imports and global variables

In [3]:
# !du -h --max-depth=1 /kaggle/working
# !rm -rf /kaggle/working
# !rm -rf /kaggle/working/results_fold*
# !ls  /kaggle/working/results

In [2]:
# !pip install srt --target=/kaggle/working/
# !pip install srt

# !pip install numpy==1.18
# !pip install scipy==1.1.0
# !pip install scikit-learn==0.21.3

# !pip install datasets --upgrade --target=/kaggle/working/
# !pip install datasets --upgrade

# !pip install evaluate --target=/kaggle/working/
# !pip install evaluate

In [2]:
import datasets
print(datasets.__version__)

2.16.1


In [3]:
import os
import re
import pandas as pd
import numpy as np
import torch

In [4]:
STORY_BEATS_PATH = '/kaggle/input/story-beats-2'

TIMESTAMPS_PATH = f"{STORY_BEATS_PATH}/train/train/scene_timestamps"
FEATURES_PATH = f"{STORY_BEATS_PATH}/train/train/features"
LABELS_PATH = f"{STORY_BEATS_PATH}/train/train/labels"
SUBTITLES_PATH = f"{STORY_BEATS_PATH}/train/train/subtitles"

TEST_TIMESTAMPS_PATH = f"{STORY_BEATS_PATH}/test/test/scene_timestamps"
TEST_FEATURES_PATH = f"{STORY_BEATS_PATH}/test/test/features"
TEST_SUBTITLES_PATH = f"{STORY_BEATS_PATH}/test/test/subtitles"

In [5]:
# Check GPU available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load data

### Helper functions

In [6]:
def load_files(path):
    if os.path.exists(path):
        files = os.listdir(path)
        return sorted(files)
    else:
        print("PATH DOES NOT EXIST!")
        return []

In [7]:
def get_movie_name(file):
    return file[10:-15]

def get_movie_id(file):
    return file[:9]

In [8]:
def get_feature_file(file):
    return file.replace("_timestamps", "")

In [9]:
def prepare_csv(path, movie_name):
    df = pd.read_csv(path)
    df.rename(columns={"Unnamed: 0": "scene_id"}, inplace=True)
    df["movie"] = movie_name
    return df

In [10]:
def extract_movie_info(files):
    movie_ids, movies = [], []
    for file in files:
        movie_name = get_movie_name(file)
        movie_id = get_movie_id(file)
        movie_ids.append(movie_id)
        movies.append(movie_name)
    return movie_ids, movies

def prepare_dataframes(files, movies, path_func, data_func):
    dfs = []
    for idx, file in enumerate(files):
        path = path_func(file)
        df = data_func(path, movies[idx])
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

def merge_dataframes(df1, df2):
    test_full_data = pd.merge(df1, df2, on=["scene_id", "movie"])
    test_full_data["end"] = test_full_data["start"] + test_full_data["s_dur"]
    return test_full_data

## Convert ass to srt

### Files

In [11]:
train_files = load_files(TIMESTAMPS_PATH)
# remove australia because australia subtitles are in diffrent format than everything else
# train_files.remove("tt0455824_australia_timestamps.csv")

test_files = load_files(TEST_TIMESTAMPS_PATH)

### Movies

In [12]:
train_movies_ids, train_movies = extract_movie_info(train_files)
test_movies_ids, test_movies = extract_movie_info(test_files)

### Timestamps

In [13]:
train_timestamps = prepare_dataframes(
    train_files,
    train_movies,
    lambda file: os.path.join(TIMESTAMPS_PATH, file), 
    prepare_csv
)

test_timestamps = prepare_dataframes(
    test_files,
    test_movies,
    lambda file: os.path.join(TEST_TIMESTAMPS_PATH, file), 
    prepare_csv
)

### Features

In [14]:
train_features = prepare_dataframes(
    train_files,
    train_movies,
    lambda file: os.path.join(FEATURES_PATH, get_feature_file(file)), 
    prepare_csv
)

test_features = prepare_dataframes(
    test_files,
    test_movies,
    lambda file: os.path.join(TEST_FEATURES_PATH, get_feature_file(file)), 
    prepare_csv
)

### Labels

In [15]:
dfs = []
for idx, file in enumerate(train_files):
    # print(feature_files[idx])
    labels_path = os.path.join(LABELS_PATH, get_feature_file(file))
    
    df = pd.read_csv(labels_path, keep_default_na=False)
    df.rename(
        columns={
            "Unnamed: 0": "scene_id",
            "0": "label"
        }, inplace=True
    )
    df["movie"] = train_movies[idx]
    
    dfs.append(df)

train_labels = pd.concat(dfs, ignore_index=True)
# print(train_labels.shape)
# train_labels.tail()

### Merge dataframes

In [16]:
train_merged = merge_dataframes(train_timestamps, train_features)
# additionaly merge labels to train data
train_full_data = pd.merge(train_merged, train_labels, on=["scene_id", "movie"], how="outer")
print(train_full_data.shape)

test_full_data = merge_dataframes(test_timestamps, test_features)
print(test_full_data.shape)

(3978, 12)
(2470, 11)


## Join Scenes with subtitles

In [17]:
import srt

### Helper runctions

In [18]:
train_movie_scenes_count = train_full_data.groupby('movie').count()['scene_id']
# print(train_movie_scenes_count)
test_movie_scenes_count = test_full_data.groupby('movie').count()['scene_id']

In [19]:
train_full_data

Unnamed: 0,scene_id,start,end,movie,s_dur,n_shots,ava_shot_dur,rel_id_loc,rel_t_loc,ava_char_score,is_prot_appear,label
0,0,0.000,218.718333,the lost weekend,218.718333,6,36.453056,0.000000,0.000000,1295.579078,1,Opening Image
1,1,218.760,240.740333,the lost weekend,21.980333,3,7.326778,0.008403,0.036189,1295.579078,1,Opening Image
2,2,240.782,282.574333,the lost weekend,41.792333,1,41.792333,0.016807,0.039832,1256.344447,1,Set-Up
3,3,282.616,313.521333,the lost weekend,30.905333,1,30.905333,0.025210,0.046752,1256.344447,1,Set-Up
4,4,313.563,367.033333,the lost weekend,53.470333,4,13.367583,0.033613,0.051871,1256.344447,1,Theme Stated
...,...,...,...,...,...,...,...,...,...,...,...,...
3973,220,6499.655,6574.354333,dallas buyers club,74.699333,16,4.668708,0.960699,0.926278,656.478716,1,Finale
3974,221,6574.396,6579.776333,dallas buyers club,5.380333,1,5.380333,0.965066,0.936929,372.108700,0,Finale
3975,224,6589.161,6623.987333,dallas buyers club,34.826333,6,5.804389,0.978166,0.939033,928.370357,1,Final Image
3976,226,6629.701,6682.462333,dallas buyers club,52.761333,19,2.776912,0.986900,0.944811,1076.267498,1,Final Image


In [20]:
problematic_movies = ["the ugly truth", "the social network", "the girl with the dragon tattoo"]

In [21]:
def convert_files_to_subtitles(files):
    return [file.replace('.csv', '.srt') for file in files]

def load_subtitles(paths, movies):
    movie_subtitles = {}
    for idx, movie_name in enumerate(movies):
#         print(movie_name)
        if movie_name == "pretty woman":
            with open(paths[idx], 'r', encoding='utf-16') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
        elif movie_name in problematic_movies:
            with open(paths[idx], 'r', encoding='utf-8', errors='replace') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
        else:
            with open(paths[idx], 'r', encoding='utf-8') as subtitle_file:
                movie_subtitles[movie_name] = list(srt.parse(subtitle_file.read()))
                
    return movie_subtitles

def associate_scenes_with_subtitles(full_data, movie_subtitles):
    movies_scenes_subtitles = {}
    additional_data = []
    last_processed_subtitle_idx = 0
    missed_subtitles = 0

    for scene_idx, scene_row in full_data.iterrows():
        scene_start, scene_end = scene_row['start'], scene_row['end']
        scene_id, movie_name = scene_row['scene_id'], scene_row['movie']
        
        # Create new dictionary for every movie
        if movie_name not in movies_scenes_subtitles:
            movies_scenes_subtitles[movie_name] = {}
            last_processed_subtitle_idx = 0

        # Craete new dictionary for every scene within movie
        movies_scenes_subtitles[movie_name][scene_id] = []

        current_movie_subtitles = movie_subtitles[movie_name]
        sentence_count = 0
        for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
            sub = current_movie_subtitles[idx]
            sub_start, sub_end = sub.start.total_seconds(), sub.end.total_seconds()

            # Some subtitles start just before scene_start
            if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
                # Add subtitle content to the dictionary for the current scene
                movies_scenes_subtitles[movie_name][scene_id].append(sub.content)
                sentence_count += sub.content.count('.') + sub.content.count('!') + sub.content.count('?')
            elif scene_end < sub_end:
                last_processed_subtitle_idx = idx
                break
            else:
                missed_subtitles += 1
        
        additional_data.append({
            'movie': movie_name,
            'scene_id': scene_id,
            'sentence_count': sentence_count,
        })

    print(f"Ignored subtitltes: {missed_subtitles}")
    additional_df = pd.DataFrame(additional_data)
    return movies_scenes_subtitles, additional_df

In [22]:
def format_data(name, data):
    formatted_data = "{:.0f}".format(data)
    # print(f"{name}{formatted_data}")
    return name + formatted_data


def map_scene_location_to_category(x):
    scene_mapping = {
        (0, 1): "Opening",
        (2, 15): "Setup",
        (16, 19): "Debate",
        (20, 49): "Story",
        (50, 75): "BadGuys",
        (76, 89): "Ending",
        (90, 99): "Finale",
        (99, float('inf')): "FinalImage"
    }

    for percentage_range, category in scene_mapping.items():
        if percentage_range[0] <= x <= percentage_range[1]:
            return category

    return "InvalidValue"


def associate_scenes_with_subtitles_extra_info(full_data, movie_subtitles, movie_scenes_count):
    movies_scenes_subtitles = {}
    last_processed_subtitle_idx = 0
    missed_subtitles = 0

    for scene_idx, scene_row in full_data.iterrows():
        scene_start, scene_end = scene_row['start'], scene_row['end']
        scene_id, movie_name = scene_row['scene_id'], scene_row['movie']
        scene_location, time_location = scene_row['rel_id_loc'], scene_row['rel_t_loc'] 	
        prot_appear = scene_row['is_prot_appear']
        scene_count = movie_scenes_count[movie_name]

        movie_name_without_spaces = movie_name.replace(" ", "")
        category = map_scene_location_to_category(int(scene_location * 100))
        
        scene_location_result = format_data("SceneLocation", scene_location * 100)
        time_location_result = format_data("TimeLocation", time_location * 100)
        category_result = f"category{category}"
        movie_name_result = f"MovieName{movie_name_without_spaces}"
        scene_start_result = format_data("SceneStart", scene_start)
        scene_end_result = format_data("SceneEnd", scene_end)
        prot_appear_result = format_data("ProtAppear", prot_appear)
        scene_count_result = format_data("SceneCount", scene_count)
        
        # Create new dictionary for every movie
        if movie_name not in movies_scenes_subtitles:
            movies_scenes_subtitles[movie_name] = {}
            last_processed_subtitle_idx = 0

        # Craete new dictionary for every scene within movie
        movies_scenes_subtitles[movie_name][scene_id] = []
        movies_scenes_subtitles[movie_name][scene_id].append(scene_location_result)
        movies_scenes_subtitles[movie_name][scene_id].append(time_location_result)
        movies_scenes_subtitles[movie_name][scene_id].append(category_result)
        movies_scenes_subtitles[movie_name][scene_id].append(movie_name_result)
        movies_scenes_subtitles[movie_name][scene_id].append(scene_start_result)
        movies_scenes_subtitles[movie_name][scene_id].append(scene_end_result)
        movies_scenes_subtitles[movie_name][scene_id].append(prot_appear_result)
        movies_scenes_subtitles[movie_name][scene_id].append(scene_count_result)

        current_movie_subtitles = movie_subtitles[movie_name]
        for idx in range(last_processed_subtitle_idx, len(current_movie_subtitles)):
            sub = current_movie_subtitles[idx]
            sub_start, sub_end = sub.start.total_seconds(), sub.end.total_seconds()

            # Some subtitles start just before scene_start
            if scene_start <= (sub_start + 0.05) and scene_end >= sub_end:
                # Add subtitle content to the dictionary for the current scene
                movies_scenes_subtitles[movie_name][scene_id].append(sub.content)
            elif scene_end < sub_end:
                last_processed_subtitle_idx = idx
                break
            else:
                missed_subtitles += 1
        
    print(f"Ignored subtitltes: {missed_subtitles}")
    return movies_scenes_subtitles

### Get subtitle files

In [23]:
train_features_files = load_files(FEATURES_PATH)
# remove australia because australia subtitles are in diffrent format than everything else
# train_features_files.remove('tt0455824_australia.csv')
test_features_files = load_files(TEST_FEATURES_PATH)

train_subtitles_files = convert_files_to_subtitles(train_features_files)
train_subtitle_paths = [os.path.join(SUBTITLES_PATH, subtitle_file) for subtitle_file in train_subtitles_files]

test_subtitles_files = convert_files_to_subtitles(test_features_files)
test_subtitle_paths = [os.path.join(TEST_SUBTITLES_PATH, subtitle_file) for subtitle_file in test_subtitles_files]

### Load subtitles 

In [24]:
train_movie_subtitles = load_subtitles(train_subtitle_paths, train_movies)
test_movie_subtitles = load_subtitles(test_subtitle_paths, test_movies)

### Associate Scenes with subtitles

In [25]:
# train_movies_scenes_subtitles, additional_train_data = associate_scenes_with_subtitles(train_full_data, train_movie_subtitles)
# test_movies_scenes_subtitles, additional_test_data = associate_scenes_with_subtitles(test_full_data, test_movie_subtitles)

## Subtitles cleaning and combining

### Helper functions

In [26]:
def clean_subtitles(movies_scenes_subtitles):
    for movie_name, scenes in movies_scenes_subtitles.items():
        for scene_id, subtitles in scenes.items():
            cleaned_subtitles = []
            for subtitle in subtitles:
                # Remove new line characters
                cleaned_subtitle = subtitle.replace('\n', ' ').strip()
                cleaned_subtitle = re.sub(r'<.*?>', '', cleaned_subtitle)
                cleaned_subtitle = re.sub(r'♪', '', cleaned_subtitle)
                cleaned_subtitles.append(cleaned_subtitle)
            
            combined_text = " ".join(cleaned_subtitles)
            movies_scenes_subtitles[movie_name][scene_id] = combined_text
            
    return movies_scenes_subtitles

def clean_subtitles_only_words(movies_scenes_subtitles):
    for movie_name, scenes in movies_scenes_subtitles.items():
        for scene_id, subtitles in scenes.items():
            cleaned_subtitles = []
            for subtitle in subtitles:
                cleaned_subtitle = subtitle.replace('\n', ' ').strip()
                cleaned_subtitle = re.sub(r'<.*?>', '', cleaned_subtitle)
                cleaned_subtitle = re.sub(r'[^a-zA-Z0-9\s\']', '', cleaned_subtitle)
                cleaned_subtitles.append(cleaned_subtitle)
            
            combined_text = " ".join(cleaned_subtitles)
            movies_scenes_subtitles[movie_name][scene_id] = combined_text
            
    return movies_scenes_subtitles

In [27]:
# clean_train_movies_scenes_subtitles = clean_subtitles(train_movies_scenes_subtitles)
# clean_train_movies_scenes_subtitles['four weddings and a funeral'][3]

In [28]:
# clean_test_movies_scenes_subtitles = clean_subtitles(test_movies_scenes_subtitles)
# clean_test_movies_scenes_subtitles['gone girl'][3]

## Text classification using BERT (Fourth Model)

### Helper functions

In [29]:
unique_labels = train_full_data['label'].unique()
label_to_int_mapping = {label: i for i, label in enumerate(unique_labels)}
int_to_label_mapping = {i: label for label, i in label_to_int_mapping.items()}

def map_label_to_int(label):
    return label_to_int_mapping[label]


def map_label_from_int(label):
    return int_to_label_mapping[label] 

In [30]:
def transform_subtitles(subtitles, df):
    transformed_subtitles = []
    for movie_name, scenes in subtitles.items():
        for scene_id, text in scenes.items():
            # Get the label from train_full_data based on movie and scene_id
            label = df[(df['movie'] == movie_name) & (df['scene_id'] == scene_id)]['label'].values[0]
            label = map_label_to_int(label)
            
            result = {'text': text, 'label': label}
            transformed_subtitles.append(result)
    
    return transformed_subtitles


def transform_validation_subtitles(subtitles, df):
    transformed_subtitles = []
    for movie_name, scenes in subtitles.items():
        for scene_id, text in scenes.items():
            result = {'text': text}
            transformed_subtitles.append(result)
    
    return transformed_subtitles
    

def print_head_transformed_subtitles(subtiltes):
    for i in range(min(5, len(subtiltes))):
        print(subtiltes[i])

In [31]:
train_subtitles = associate_scenes_with_subtitles_extra_info(train_full_data, train_movie_subtitles, train_movie_scenes_count)
# train_subtitles, _ = associate_scenes_with_subtitles(train_full_data, train_movie_subtitles)
clean_train_subtitles = clean_subtitles_only_words(train_subtitles)
train_transformed_subtitles = transform_subtitles(clean_train_subtitles, train_full_data)

Ignored subtitltes: 3204


In [32]:
# train_transformed_subtitles

In [33]:
validation_subtitles = associate_scenes_with_subtitles_extra_info(test_full_data, test_movie_subtitles, test_movie_scenes_count)
# validation_subtitles, _ = associate_scenes_with_subtitles(test_full_data, test_movie_subtitles)
clean_validation_subtitles = clean_subtitles_only_words(validation_subtitles)
validation_transformed_subtitles = transform_validation_subtitles(clean_validation_subtitles, test_full_data)

Ignored subtitltes: 3165


In [34]:
from sklearn.model_selection import train_test_split

# Extract 'text' and 'label' from each dictionary
texts = [item['text'] for item in train_transformed_subtitles]
labels = [item['label'] for item in train_transformed_subtitles]

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.3, random_state=42)

# # Create training and testing datasets
train_data = [{'text': text, 'label': label} for text, label in zip(train_texts, train_labels)]
test_data = [{'text': text, 'label': label} for text, label in zip(test_texts, test_labels)]
# data = [{'text': text, 'label': label} for text, label in zip(texts, labels)]



### For cross-validation

In [35]:
# texts = [item['text'] for item in train_transformed_subtitles]
# labels = [item['label'] for item in train_transformed_subtitles]

# data = [{'text': text, 'label': label} for text, label in zip(texts, labels)]

### No cross validation

In [36]:
from datasets import Dataset
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)
validation_dataset = Dataset.from_list(validation_transformed_subtitles)

print(train_dataset)
print(test_dataset)
print(validation_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 2784
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1194
})
Dataset({
    features: ['text'],
    num_rows: 2470
})


In [53]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer

def choose_model(name):
    if name == "distilbert": 
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=16)
    elif name == "bert":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=16)
    elif name == "bert-large":
        tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=16)
    elif name == "roberta":
        tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=16)
    elif name == "roberta-large":
        tokenizer = AutoTokenizer.from_pretrained("roberta-large")
        model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=16)
    else:
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=16)

    return model, tokenizer

model, tokenizer = choose_model("bert")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# import nltk
# import subprocess

# # Download and unzip wordnet
# try:
#     nltk.data.find('wordnet.zip')
# except:
#     nltk.download('wordnet', download_dir='/kaggle/working/')
#     command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
#     subprocess.run(command.split())
#     nltk.data.path.append('/kaggle/working/')

In [39]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# nltk.data.path.append("/root/nltk_data")

# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def remove_stopwords_and_lemmatize(text):
    words = text.split()
    filtered_words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


def remove_stopwords(text):
    words = text.split()
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


def preprocess_function(examples):
#     examples["text"] = [remove_stopwords_and_lemmatize(text) for text in examples["text"]]
    examples["text"] = [remove_stopwords(text) for text in examples["text"]]
    tokenized_text = tokenizer(examples["text"], truncation=True)
    
    return tokenized_text

In [40]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True)
tokenized_validation_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/1194 [00:00<?, ? examples/s]

Map:   0%|          | 0/2470 [00:00<?, ? examples/s]

In [41]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2784
})

In [42]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [43]:
import evaluate
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [44]:
%env JOBLIB_TEMP_FOLDER=/tmp

env: JOBLIB_TEMP_FOLDER=/tmp


## Cross Validation

In [45]:
# from sklearn.model_selection import KFold

# num_folds = 5
# kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [46]:
# all_predictions = []
# for fold, (train_index, val_index) in enumerate(kf.split(tokenized_train_dataset)):
#     print(f"Training Fold {fold + 1}/{num_folds}")

#     model, _ = choose_model("roberta")

#     train_dataset_fold = tokenized_train_dataset.select(train_index)
#     val_dataset_fold = tokenized_train_dataset.select(val_index)
    
#     training_args_fold = TrainingArguments(
#         output_dir=f"./results_fold_{fold}",
#         learning_rate=2e-5,
#         per_device_train_batch_size=12,
#         per_device_eval_batch_size=12,
#         gradient_accumulation_steps=1,
#         num_train_epochs=18,
#         weight_decay=0.01,
#         save_total_limit=2,
#         load_best_model_at_end=True,
#         save_strategy="epoch",
#         evaluation_strategy="epoch",
#         metric_for_best_model="eval_accuracy"
#     )

#     trainer_fold = Trainer(
#         model=model,
#         args=training_args_fold,
#         train_dataset=train_dataset_fold,
#         eval_dataset=val_dataset_fold,
#         tokenizer=tokenizer,
#         data_collator=data_collator,
#         compute_metrics=compute_metrics
#     )

#     trainer_fold.train()
    
#     predictions = trainer_fold.predict(val_dataset_fold)

#     all_predictions.append(predictions.predictions)

In [47]:
# # Combine predictions from all folds
# all_predictions_combined = np.array(all_predictions) 
# # Use np.argmax along axis 2 to get the index with the most votes for each instance
# majority_votes = np.argmax(np.sum(all_predictions_combined, axis=0), axis=1)
# final_predictions = majority_votes.tolist()

In [48]:
# final_predictions

## Split train and test data

In [54]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=15,
    weight_decay=0.01,
#     logging_steps=100,
    save_total_limit = 2,
#     save_strategy="no",
#     save_steps=99999999999999999,
    load_best_model_at_end = True,
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    metric_for_best_model = "eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.705404,0.445561
2,1.990400,1.531736,0.485762
3,1.564600,1.415444,0.497487
4,1.564600,1.321403,0.536013
5,1.367400,1.280802,0.553601
6,1.234900,1.20381,0.587102
7,1.234900,1.205725,0.597152
8,1.104900,1.161402,0.613903
9,0.999900,1.14284,0.626466
10,0.999900,1.14462,0.628141


TrainOutput(global_step=5220, training_loss=1.1267297591286145, metrics={'train_runtime': 933.0523, 'train_samples_per_second': 44.756, 'train_steps_per_second': 5.595, 'total_flos': 3476253174826752.0, 'train_loss': 1.1267297591286145, 'epoch': 15.0})

In [65]:
# folder_path = "/kaggle/working/results"
# def load_model(model_name):
#     model_path = f"{folder_path}/{model_name}"
#     tokenizer = AutoTokenizer.from_pretrained(model_path)
#     model = AutoModelForSequenceClassification.from_pretrained(model_path)
    
#     return tokenizer, model
    
# # loaded_model, loaded_tokenizer = load_model("checkpoint-3052")

In [55]:
trainer.evaluate()

{'eval_loss': 1.1428395509719849,
 'eval_accuracy': 0.626465661641541,
 'eval_runtime': 6.7185,
 'eval_samples_per_second': 177.72,
 'eval_steps_per_second': 22.327,
 'epoch': 15.0}

In [56]:
predictions = trainer.predict(tokenized_validation_dataset)

In [57]:
predicted_probabilities = predictions.predictions
predicted_label_ids = np.argmax(predicted_probabilities, axis=-1)

label_predictions = [map_label_from_int(label) for label in predicted_label_ids]
print(f"first 5 predictions: {label_predictions[:30]} \nlast 5 predictions: {label_predictions[-30:]}")

first 5 predictions: ['Opening Image', 'Opening Image', 'Opening Image', 'Set-Up', 'Set-Up', 'Set-Up', 'Set-Up', 'Set-Up', 'Set-Up', 'Set-Up', 'Set-Up', 'Debate', 'Set-Up', 'Debate', 'Debate', 'Debate', 'Debate', 'Debate', 'Debate', 'None', 'Debate', 'Debate', 'Debate', 'B Story', 'B Story', 'B Story', 'B Story', 'Fun and Games', 'Fun and Games', 'B Story'] 
last 5 predictions: ['Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Finale', 'Final Image', 'Final Image', 'Final Image', 'Final Image', 'Final Image', 'Final Image']


## Result data

In [58]:
result_df = pd.DataFrame(columns=['Id', 'Label'])

def fill_result_df(full_data, movies, movies_ids, predictions):
    for idx, scene in full_data.iterrows():
        movie_name = scene['movie']
        movie_index = movies.index(movie_name)
        movie_id = movies_ids[movie_index]
        scene_id = scene['scene_id']
        movie_scene_id = f"{movie_id}_{scene_id}"

        pred_label = predictions[idx]
        result_df.loc[idx] = [movie_scene_id, pred_label]

In [59]:
fill_result_df(test_full_data, test_movies, test_movies_ids, label_predictions)

result_df.head()

Unnamed: 0,Id,Label
0,tt0822832_1,Opening Image
1,tt0822832_2,Opening Image
2,tt0822832_4,Opening Image
3,tt0822832_6,Set-Up
4,tt0822832_7,Set-Up


In [101]:
result_df.to_csv('output_bert__eval_loss.csv', index=False)

## Save model

In [196]:
from transformers import BertConfig

folder_name = "results"

def save_model(model, tokenizer, name):
    model_folder_path = f"./{folder_name}/{name}" 
    model.save_model(model_folder_path)
    tokenizer.save_pretrained(model_folder_path)
    config = BertConfig.from_pretrained(model_folder_path)
    config.save_pretrained(model_folder_path)
    
# save_model(trainer, tokenizer, "model_large_roberta_5")

## Clean memory

In [138]:
# import torch
# torch.cuda.empty_cache()

In [139]:
# import gc
# gc.collect()

## Save combined clean data in file

In [140]:
# output_file_path = "output_combined_file.txt"  # Set your desired output file

# with open(output_file_path, 'w', encoding='utf-8') as output_file:
#     for movie, scenes in clean_train_movies_scenes_subtitles.items():
#         output_file.write(f"======= {movie} =========\n\n")
#         output_file.write(str(scenes))

# print("Combined text file saved successfully.")


In [141]:
# output_file_path = "output_combined_file2.txt"  # Set your desired output file

# with open(output_file_path, 'w', encoding='utf-8') as output_file:
#     for movie, scenes in clean_train_subtitles.items():
#         output_file.write(f"======= {movie} =========\n\n")
#         output_file.write(str(scenes))

# print("Combined text file saved successfully.")
