# Import, variables

In [117]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
import re

import random 
from collections import Counter

from sklearn.metrics.pairwise import cosine_similarity as skl_cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.preprocessing import MinMaxScaler

from tqdm.auto import tqdm, trange

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
import re

from transformers import BertTokenizer, BertModel

%matplotlib inline
%config InlineBackend.figure_format = "retina"

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import joblib

In [118]:
from IPython.display import clear_output

In [119]:
FILMS_DATA_PATH = "data/films.json"
nltk.download('stopwords')
STOP_WORDS = set(stopwords.words("english"))

PRETRAINED_SG_MODEL = True
PRETRAINED_LDA_MODEL = True
PRETRAINED_BLENDINGNN = True

PRETRAINED_SG_PATH = "models/sgmodel.pth"
PRETRAINED_LDA_PATH = "models/lda_model.pkl"
PRETRAINED_BLENDINGNN_PATH = "models/film_prepare_nn_weights_final.pth"

GET_SG_EMB = True
GET_BERT_EMB = True
GET_LDA_EMB = True
GET_BLENDNN_EMB = True

SG_EMB_PATH = "models/sg_embeddings.pt"
BERT_EMB_PATH = "models/bert_embeddings.pt"
LDA_EMB_PATH = "models/lda_embeddings.pt"
BLENDNN_EMB_PATH = "models/films_embeddings.pt"

SAVE_CHANGES = False

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [120]:
df_films = pd.read_json(FILMS_DATA_PATH)
df_films.shape

(35500, 31)

In [121]:
def convert_watched(value):
    if isinstance(value, str):
        if "K" in value:
            return float(value.replace("K", "")) * 1e3
        elif "M" in value:
            return float(value.replace("M", "")) * 1e6
        else:
            return float(value)
    return value

def explode_and_onehot(df, col, prefix):
    exploded = df[["film_id", col]].explode(col)
    dummies = pd.get_dummies(exploded[col], prefix=prefix)
    return exploded[["film_id"]].join(dummies).groupby("film_id").max().astype(int)

def films_main_prepare(df_films, log_watched_liked=True):
    df_films = df_films.copy()
    df_films = df_films.dropna(subset=["name", "description"])
    df_films["name"] = df_films["name"].apply(lambda x: x.replace("\xa0", " "))

    df_films["watched"] = df_films["watched"].map(convert_watched)
    df_films["liked"] = df_films["liked"].map(convert_watched)

    df_films["watched"] = df_films["watched"].fillna(df_films["watched"].mean())
    df_films["liked"] = df_films["liked"].fillna(df_films["liked"].mean())
    
    if log_watched_liked:
        df_films["log_liked"] = df_films["liked"].apply(np.log)
        df_films["log_watched"] = df_films["watched"].apply(np.log)
        df_films = df_films.drop(columns=["liked", "watched"])


    df_films = df_films.dropna(subset=["year", "director"])
    df_films["year"] = df_films["year"].astype(np.int32)
    df_films["decade"] = df_films["year"] // 10 * 10
    df_films["decade"] = df_films["decade"].astype(str)
    decade_dummies = pd.get_dummies(df_films["decade"], prefix="decade")
    df_films = df_films.drop(columns="decade")
    df_films = df_films.join(decade_dummies)


    df_films = df_films[(df_films["year"] >= 1920) & (df_films["year"] <= 2024)]
    df_films = df_films[(df_films["duration"] >= 60) & (df_films["duration"] <= 240)]

    minmax = MinMaxScaler()
    df_films["year"] = minmax.fit_transform(df_films[["year"]])

    #print(df_films[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']].dtypes)

    df_films[
        ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    ] = df_films[
        ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
    ].apply(pd.to_numeric, errors='coerce').fillna(0.0)

    df_films["cnt_ratings"] = df_films[['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']].sum(axis=1)
    df_films = df_films.dropna(subset=["cnt_ratings"])
    df_films['1'] = (df_films['1'] / df_films["cnt_ratings"]).round(4)
    df_films['2'] = (df_films['2'] / df_films["cnt_ratings"]).round(4)
    df_films['3'] = (df_films['3'] / df_films["cnt_ratings"]).round(4)
    df_films['4'] = (df_films['4'] / df_films["cnt_ratings"]).round(4)
    df_films['5'] = (df_films['5'] / df_films["cnt_ratings"]).round(4)
    df_films['6'] = (df_films['6'] / df_films["cnt_ratings"]).round(4)
    df_films['7'] = (df_films['7'] / df_films["cnt_ratings"]).round(4)
    df_films['8'] = (df_films['8'] / df_films["cnt_ratings"]).round(4)
    df_films['9'] = (df_films['9'] / df_films["cnt_ratings"]).round(4)
    df_films['10'] = (df_films['10'] / df_films["cnt_ratings"]).round(4)

    df_films["rating"] = df_films["rating"].fillna(df_films["rating"].mean())

    df_films["top"] = df_films["top"].notna().astype(int)

    df_films["cnt_genres"] = df_films["genres"].apply(len)
    df_films["cnt_themes"] = df_films["themes"].apply(len)
    df_films["cnt_countries"] = df_films["country"].apply(len)
    df_films["cnt_studios"] = df_films["studio"].apply(len)

    df_films["film_id"] = df_films.index

    df_films["genres"] = df_films["genres"].apply(lambda x: [g for g in x if g != "Documentary"])

    genres_ohe = explode_and_onehot(df_films, "genres", "genre")
    countries_ohe = explode_and_onehot(df_films, "country", "country")
    #studios_ohe = explode_and_onehot(df_films, "studio", "studio")
    themes_ohe = explode_and_onehot(df_films, "themes", "theme")

    df_final = df_films.set_index("film_id")
    df_final = df_final.join([genres_ohe, countries_ohe, themes_ohe]) 

    return df_final.reset_index(drop=True)

In [122]:
prepared_df = films_main_prepare(df_films)
prepared_df.head()

Unnamed: 0,name,year,description,1,2,3,4,5,6,7,...,theme_Tragic sadness and captivating beauty,theme_Twisted dark psychological thriller,theme_Underdog fighting and boxing stories,theme_Underdogs and coming of age,"theme_Violent action, guns, and crime",theme_Violent crime and drugs,theme_War and historical adventure,theme_Western frontier dramas with a touch of humor,theme_Westerns,theme_Wild west outlaws and gunfights
0,Barbie,0.990385,Barbie and Ken are having the time of their li...,0.0048,0.0113,0.0072,0.0384,0.0369,0.154,0.1361,...,0,0,0,0,0,0,0,0,0,0
1,Parasite,0.951923,"All unemployed, Ki-taek’s family takes peculia...",0.0009,0.0021,0.0009,0.006,0.005,0.0352,0.0409,...,0,1,0,0,0,0,0,0,0,0
2,Interstellar,0.903846,The adventures of a group of explorers who mak...,0.0015,0.0038,0.0019,0.0125,0.01,0.0553,0.055,...,0,0,0,0,0,0,0,0,0,0
3,Fight Club,0.759615,A ticking-time-bomb insomniac and a slippery s...,0.0014,0.0037,0.0019,0.0123,0.0105,0.0682,0.0735,...,0,1,0,0,0,0,0,0,0,0
4,La La Land,0.923077,"Mia, an aspiring actress, serves lattes to mov...",0.0045,0.0137,0.0047,0.0331,0.0192,0.0988,0.0768,...,0,0,0,0,0,0,0,0,0,0


In [123]:
def preprocess_sg(text):

    # Replace punctuation with tokens so we can use them in our model
    text = text.lower()
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    # text = text.replace('\n', ' <NEW_LINE> ')
    text = text.replace(':', ' <COLON> ')
    words = text.split()

    # Remove all words with  5 or fewer occurences
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > 5]

    return trimmed_words

def create_lookup_tables(words):
    """
    Create lookup tables for vocabulary
    param 
        words: Input list of words
    return: 
        Two dictionaries, vocab_to_int, int_to_vocab
    """
    word_counts = Counter(words)
    # sorting the words from most to least frequent in text occurrence
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    # create int_to_vocab dictionaries
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab

def prob_of_word_to_save(freq, t):
    if freq == 0:
        return 0
    prob = np.sqrt(t/(freq))
    rand_element = np.random.rand()
    return prob > rand_element

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[.,!?;:()"]', '', text)
    text = re.sub(r"’s", "", text)

    tokens = text.split()
    tokens = [token for token in tokens if token not in STOP_WORDS]
    text = " ".join(tokens)
    return text

In [124]:
description_corpus = prepared_df["description"]
description_corpus = description_corpus.apply(lambda x: x.replace("\xa0", " "))

In [125]:
def find_topk_similar_films(embeddings_matrix, film_idx, criterion=skl_cosine_similarity, k=5):
    """
    From matrix with embeddings of films (with the same indexing as original films dataset)
    find k most similar by criterion embeddings and return dict with their indecies and value of similarness
    """
    film_emb = embeddings_matrix[film_idx]
    similarities_test = criterion(film_emb.unsqueeze(0), embeddings_matrix)
    if k != -1:
        topk = similarities_test[0].argsort()[::-1][1:k+1]
    else:
        topk = similarities_test[0].argsort()[::-1]
    return {int(idx): round(float(criterion(embeddings_matrix[film_idx].unsqueeze(0), embeddings_matrix[idx].unsqueeze(0))[0][0]), 5) for idx in topk}

def print_similar_films(films_ds, embeddings_matrix, film_idx, criterion=skl_cosine_similarity, k=5):
    print(films_ds.iloc[film_idx, 0])
    print(
        "\n".join(
            [
                f"{films_ds.iloc[idx, 0]} -  {value}" 
                for idx, value in find_topk_similar_films(embeddings_matrix, film_idx, criterion=criterion, k=k).items()
            ] #sorted()?
        )
    )

def pair_film_similarity(film_idx1, film_idx2, films_matrix, embedding_matrix, criterion=skl_cosine_similarity, print_=True):
    film_emb1 = embedding_matrix[film_idx1].unsqueeze(0)
    film_emb2 = embedding_matrix[film_idx2].unsqueeze(0)

    similarity = criterion(film_emb1, film_emb2)[0][0]
    if print_:
        print(
            f"{films_matrix.iloc[film_idx1, 0]} - {films_matrix.iloc[film_idx2, 0]} - {similarity:.4f}"
        )

## Skiprgamms

In [126]:
def get_film_embedding_sg(description, model, vocab_to_int):
    indicies = [vocab_to_int.get(word, vocab_to_int["<PERIOD>"]) for word in description.split(" ")]
    tensor = torch.LongTensor(indicies)
    with torch.no_grad():
        embeddings = model.embed(tensor)
        desc_embedding = embeddings.mean(dim=0)
    return desc_embedding

In [127]:
def get_target(words, idx, window_size=5):
    """
    Get a list of words in a window of index
    """

    right_side = words[idx+1:min(len(words),idx+window_size+1)]
    left_side = words[max(0, idx-window_size):idx]
    return left_side + right_side

def get_batches(words, batch_size, window_size=5):
    """Create a generator of batches as a tuple (inputs, targets)"""
    n_batches = len(words) // batch_size

    words = words[: n_batches * batch_size]

    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx : idx + batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x] * len(batch_y))
        yield x, y

def my_cosine_similarity(embedding, valid_size=16, valid_window=100):
    embed_vectors = embedding.weight
    vocab_size = embed_vectors.shape[0]

    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)

    max_index = min(vocab_size, 1000 + valid_window)
    valid_pool = list(range(min(valid_window, vocab_size))) + list(range(1000, max_index))

    valid_examples = np.random.choice(valid_pool, size=valid_size, replace=False)
    valid_examples = torch.LongTensor(valid_examples)

    valid_vectors = embedding(valid_examples)
    similarities = torch.mm(valid_vectors, embed_vectors.t()) / magnitudes

    return valid_examples, similarities



In [128]:
class MySkipgram(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.linear = nn.Linear(embed_size, vocab_size)
    
    def forward(self, x):
        return self.linear(self.embed(x))

def train_skip_grams(model, train_words, int_to_vocab, criterion, optimizer, n_epochs=15, batch_size=1024, print_every=300):
    mean_loss = []
    steps = 0
    for epoch in trange(n_epochs, leave=True, desc="Epoch number: "):
        pbar = tqdm(
            get_batches(train_words, batch_size),
            leave=False,
            desc="Batch number",
            total=len(train_words) // batch_size
        )

        epoch_loss = []

        for inputs, targets in pbar:
            steps += 1
            inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)

            log_ps = model(inputs)
            loss = criterion(log_ps, targets)
            epoch_loss.append(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if steps % print_every == 0:
                valid_examples, valid_similarities = my_cosine_similarity(model.embed)
                _, closest_idxs = valid_similarities.topk(5)

                lines_to_write = []
                for ii, valid_idx in enumerate(valid_examples):
                    closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
                    lines_to_write.append(int_to_vocab[valid_idx.item()] + " | " + ", ".join(closest_words))
                
                # clear_output(wait=True)

                for line in lines_to_write:
                    pbar.write(line)
                pbar.write("...")

def get_film_embedding_sg(description, model, vocab_to_int):
    indicies = [vocab_to_int.get(word, vocab_to_int["<PERIOD>"]) for word in description.split(" ")]
    tensor = torch.LongTensor(indicies)
    with torch.no_grad():
        embeddings = model.embed(tensor)
        desc_embedding = embeddings.mean(dim=0)
    return desc_embedding

def get_avg_skip_gramms(descriptions, embedding_dim, n_epochs=5, pretrained_model=None):
    words = preprocess_sg(" ".join(descriptions))
    vocab_to_int, int_to_vocab = create_lookup_tables(words)
    int_words = [vocab_to_int[word] for word in words]

    threshold = 1e-5
    word_counts = Counter(int_words)

    train_words = [word for word in int_words if prob_of_word_to_save(word_counts[word]/len(int_words), threshold)]

    if PRETRAINED_SG_MODEL:
        model = pretrained_model
    else:
        model = MySkipgram(len(vocab_to_int), embedding_dim)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
        train_skip_grams(model, train_words, int_to_vocab, criterion, optimizer, n_epochs=n_epochs)
    
    skip_gramms_embeddings_matrix = torch.stack([
        get_film_embedding_sg(desc, model, vocab_to_int) for desc in description_corpus
    ])
    return model, skip_gramms_embeddings_matrix    

In [129]:
if PRETRAINED_SG_MODEL:
    sg_model = torch.load(PRETRAINED_SG_PATH, weights_only=False)

if GET_SG_EMB:
    sgramms = torch.load(SG_EMB_PATH)

if PRETRAINED_SG_MODEL and not GET_SG_EMB:
    sg_model, sgramms = get_avg_skip_gramms(description_corpus, 128, 25, pretrained_model=sg_model)

if not PRETRAINED_SG_MODEL and not GET_SG_EMB:
    sg_model, sgramms = get_avg_skip_gramms(description_corpus, 128, 25)

In [130]:
sgramms

tensor([[-0.3191, -0.1515,  0.0500,  ...,  0.0027,  0.1108,  0.0089],
        [-0.1453, -0.1262,  0.0968,  ...,  0.1479, -0.0615, -0.0273],
        [ 0.0602,  0.0041,  0.0097,  ..., -0.2327,  0.0156, -0.0532],
        ...,
        [-0.2435, -0.1436,  0.0558,  ..., -0.0124,  0.0718, -0.1726],
        [-0.0501, -0.0521, -0.3055,  ...,  0.0663,  0.2914, -0.0880],
        [-0.1449, -0.0901,  0.0508,  ...,  0.1371, -0.0475, -0.1074]])

In [131]:
find_topk_similar_films(sgramms, 1)

{21839: 0.69166, 11539: 0.6887, 3290: 0.68494, 16210: 0.68291, 7087: 0.68041}

In [132]:
print_similar_films(prepared_df, sgramms, 3)

Fight Club
The Connection -  0.70258
The Orphanage -  0.67978
A Hard Day’s Night -  0.67954
Next Door -  0.67658
The Sound of Music -  0.67407


In [133]:
pair_film_similarity(71, 236, prepared_df, sgramms)

The Lord of the Rings: The Fellowship of the Ring - Avengers: Age of Ultron - 0.6555


In [134]:
#df_films[(df_films["name"].notna()) & (df_films["name"].str.contains("Casino"))].head(10)

## Bert

In [135]:
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.squeeze(0)

def get_bert_embeddings(descriptions, embedding_dim=128):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert = BertModel.from_pretrained('bert-base-uncased')
    films_embeddings_matrix = torch.stack([
        get_bert_embedding(desc, tokenizer, bert) for desc in tqdm(descriptions)
    ])
    return bert, films_embeddings_matrix

In [136]:
if GET_BERT_EMB:
    bert_emb = torch.load(BERT_EMB_PATH)
else:
    bert, bert_emb = get_bert_embeddings(description_corpus)

In [137]:
#bert, bert_emb = get_bert_embeddings(description_corpus)

In [138]:
find_topk_similar_films(bert_emb, 1)

{29034: 0.91709, 212: 0.91533, 19342: 0.91372, 2660: 0.9134, 30063: 0.91274}

In [139]:
print_similar_films(prepared_df, bert_emb, 71)

The Lord of the Rings: The Fellowship of the Ring
The Hobbit: An Unexpected Journey -  0.92037
Jason and the Argonauts -  0.91415
Happily Ever After -  0.91365
The Hobbit: The Battle of the Five Armies -  0.90962
Princess Mononoke -  0.90904


In [140]:
pair_film_similarity(71, 235, prepared_df, bert_emb)

The Lord of the Rings: The Fellowship of the Ring - Fargo - 0.7138


## LDA

In [141]:
def print_topics(model, vectorizer, top_n=10, k_topics=5):
    words = vectorizer.get_feature_names_out()
    counter = 0
    for idx, topic in enumerate(model.components_):
        print("_____________________")
        print(f"Theme: {idx+1}")
        top_words = topic.argsort()[:-top_n - 1:-1]
        for i in top_words:
            print(f"{words[i]} ({topic[i]:.2f})")
        counter += 1
        if counter == k_topics:
            break

In [142]:
clean_corpus = description_corpus.apply(preprocess)

In [143]:
def print_topics(model, vectorizer, top_n=10, k_topics=5):
    words = vectorizer.get_feature_names_out()
    counter = 0
    for idx, topic in enumerate(model.components_):
        print("_____________________")
        print(f"Theme: {idx+1}")
        top_words = topic.argsort()[:-top_n - 1:-1]
        for i in top_words:
            print(f"{words[i]} ({topic[i]:.2f})")
        counter += 1
        if counter == k_topics:
            break

def get_top_k_topics(topic_distribution, k=3):
    top_indices = np.argsort(topic_distribution)[::-1][:k]
    top_probs = topic_distribution[top_indices]
    return list(zip(top_indices, top_probs))

def print_topic_by_id(model, vectorizer, idx, top_n=10):
    words = vectorizer.get_feature_names_out()
    topic = model.components_[idx]
    print(f"Theme: {idx}")
    top_words = topic.argsort()[:-top_n - 1:-1]
    for i in top_words:
        print(f"{words[i]} ({topic[i]:.2f})")
    print()

def get_lda_embedding(desc, model, vectorizer):
    clean_desc = preprocess(desc)
    new_vec = vectorizer.transform([clean_desc])
    emb = model.transform(new_vec)[0]
    return emb

def get_lda_matrix(clean_corpus, fitted_vectorizer, pretrained_model=None):
    x = fitted_vectorizer.transform(clean_corpus)

    if PRETRAINED_LDA_MODEL:
        lda = pretrained_model
    else:
        lda = LatentDirichletAllocation(
            n_components=256,
            random_state=42,
            learning_method='batch',
            max_iter=20
        )
        lda.fit(x)
    films_embeddings_matrix_lda = torch.stack([
        torch.tensor(get_lda_embedding(desc, lda, fitted_vectorizer), dtype=torch.float32) for desc in tqdm(description_corpus)
    ])
    return lda, films_embeddings_matrix_lda

In [144]:
vectorizer = CountVectorizer(max_df=0.9, min_df=5)
vectorizer.fit(clean_corpus)

In [145]:
if PRETRAINED_LDA_MODEL:
    lda = joblib.load(PRETRAINED_LDA_PATH)

if GET_LDA_EMB:
    lda_matrix = torch.load(LDA_EMB_PATH)

if PRETRAINED_LDA_MODEL and not GET_LDA_EMB:
    lda, lda_matrix = get_lda_matrix(clean_corpus, vectorizer, pretrained_model=lda)

if not PRETRAINED_LDA_MODEL and not GET_LDA_EMB:
    lda, lda_matrix = get_lda_matrix(clean_corpus, vectorizer)

In [146]:
#lda, lda_matrix = get_lda_matrix(clean_corpus, vectorizer)

In [147]:
pair_film_similarity(136, 150, prepared_df, lda_matrix)

The Lord of the Rings: The Return of the King - The Lord of the Rings: The Two Towers - 0.2015


In [148]:
lda_matrix[71]

tensor([1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 3.9081e-01,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 3.4189e-02,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04, 1.3021e-04,
        1.3021e-04, 1.3021e-04, 1.3021e-

In [149]:
get_top_k_topics(
    lda.transform(
        vectorizer.transform(
            [clean_corpus[71]]
            )
        )[0]
    )

[(np.int64(11), np.float64(0.3908102939073579)),
 (np.int64(105), np.float64(0.19650576111446902)),
 (np.int64(224), np.float64(0.09937300143840277))]

In [150]:
for topic in [11, 105, 224]:
    print_topic_by_id(lda, vectorizer, topic, top_n=2)

Theme: 11
king (484.60)
evil (328.48)

Theme: 105
terrorist (127.92)
pursuit (85.21)

Theme: 224
charles (35.56)
young (29.76)



In [151]:
print_topics(lda, vectorizer, top_n=5)

_____________________
Theme: 1
convicted (74.48)
framed (69.03)
property (53.48)
reclusive (36.76)
family (36.53)
_____________________
Theme: 2
park (204.00)
weekend (182.00)
reunite (133.91)
mountain (129.34)
group (81.90)
_____________________
Theme: 3
los (335.00)
angeles (327.00)
rises (73.63)
crazed (27.34)
streets (23.09)
_____________________
Theme: 4
believe (131.47)
ten (82.07)
string (75.62)
years (56.89)
gone (49.88)
_____________________
Theme: 5
town (230.72)
small (144.02)
teens (122.41)
sheriff (105.27)
aliens (105.00)


In [152]:
find_topk_similar_films(lda_matrix, 71)

{19591: 0.83123, 5650: 0.8228, 25988: 0.82112, 28836: 0.82011, 8965: 0.81774}

In [153]:
print_similar_films(prepared_df, lda_matrix, 71)

The Lord of the Rings: The Fellowship of the Ring
Rampant -  0.83123
Shadow -  0.8228
Tower of London -  0.82112
A Ravaging Wind -  0.82011
Big Fish & Begonia -  0.81774


In [154]:
pair_film_similarity(71, 150, prepared_df, lda_matrix)

The Lord of the Rings: The Fellowship of the Ring - The Lord of the Rings: The Two Towers - 0.2209


## Blending NN

In [155]:
sgramms.size(), bert_emb.size(), lda_matrix.size()

(torch.Size([30910, 128]), torch.Size([30910, 768]), torch.Size([30910, 256]))

In [156]:
prepared_df.shape

(30910, 348)

In [157]:
films_embeddings = torch.cat(
    [sgramms, bert_emb, lda_matrix], dim=1
)
films_embeddings.size()

torch.Size([30910, 1152])

In [158]:
y_year = prepared_df["year"]
y_rating = prepared_df["rating"]
genre_columns = prepared_df.columns[prepared_df.columns.str.startswith("genre_")]
themes_columns = prepared_df.columns[prepared_df.columns.str.startswith("theme_")]
y_genres = prepared_df[genre_columns]
y_themes = prepared_df[themes_columns]
y_year.shape, y_rating.shape, y_genres.shape, y_themes.shape

((30910,), (30910,), (30910, 18), (30910, 109))

In [159]:
class DatasetFilm(torch.utils.data.Dataset):
    def __init__(self, films_embs, genres, themes, year, rating):
        self.X = films_embs
        self.y_genres = torch.tensor(genres.values) if isinstance(genres, pd.DataFrame) else genres
        self.y_themes = torch.tensor(themes.values) if isinstance(themes, pd.DataFrame) else themes
        self.y_year = torch.tensor(year.values) if isinstance(year, pd.Series) else year
        self.y_rating = torch.tensor(rating.values) if isinstance(rating, pd.Series) else rating

    def __len__(self):
        return len(self.y_year)
    
    def __getitem__(self, index):
        return (
            self.X[index],
            self.y_genres[index].float(),
            self.y_themes[index].float(),
            self.y_year[index].float(),
            self.y_rating[index].float(),
        )

In [160]:
dataset = DatasetFilm(films_embeddings, y_genres, y_themes, y_year, y_rating)

In [161]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0
)

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0
)

In [168]:
class MultiTaskFilm(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_genres, num_themes):
        super().__init__()

        self.prepare_emb = nn.Sequential(
            nn.Linear(input_dim, 2048),
            nn.ReLU(),
            nn.Dropout(0.25),

            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.25),

            # nn.Linear(1024, 1024),
            # nn.ReLU(),
            # nn.Dropout(0.25),

            nn.Linear(1024,  hidden_dim)
        )

        self.genre_head = nn.Linear(hidden_dim, num_genres)
        self.theme_head = nn.Linear(hidden_dim, num_themes)
        self.year_head = nn.Linear(hidden_dim, 1)
        self.rating_head = nn.Linear(hidden_dim, 1)
    
    def forward(self, x):
        x = self.prepare_emb(x)
        return (
            self.genre_head(x),
            self.theme_head(x),
            self.year_head(x),
            self.rating_head(x)
        )

    def get_embedding(self, x):
        return self.prepare_emb(x)

In [169]:
CNT_GENRES = 18
CNT_THEMES = 109

In [170]:
blender = MultiTaskFilm(1152, 512, CNT_GENRES, CNT_THEMES)
optimizer = torch.optim.SGD(blender.parameters(), lr=0.001, momentum=0.9)

In [171]:
def plot_history(train_history, val_history, title="loss"):
    plt.figure()
    plt.title("{}".format(title))
    plt.plot(train_history, label="train", zorder=1)
    plt.legend(loc="best")
    plt.grid()

    plt.show()

def train(model, optimizer, train_dataloader, test_dataloader, n_epochs=5):
    loss_genere_log, loss_theme_log, loss_year_log, loss_rating_log = [], [], [], []
    mean_loss_log = []
    val_loss_log = []

    for epoch in range(n_epochs):
        model.train()
        train_epoch_loss = []

        for batch in tqdm(train_dataloader, desc=f"Training, epoch {epoch}", leave=False):
            
            optimizer.zero_grad()
            
            film, y_genres, y_themes, y_year, y_rating = batch
            pred_genres, pred_themes, pred_year, pred_rating = model(film)

            loss_genre = nn.BCEWithLogitsLoss()(pred_genres, y_genres.float())
            loss_theme = nn.BCEWithLogitsLoss()(pred_themes, y_themes.float())
            loss_year = nn.MSELoss()(pred_year.squeeze(), y_year)
            loss_rating = nn.MSELoss()(pred_rating.squeeze(), y_rating)

            loss = (loss_genre + loss_theme + loss_year + loss_rating)/4

            loss.backward()
            optimizer.step()

            loss_genere_log.append(loss_genre)
            loss_theme_log.append(loss_theme)
            loss_year_log.append(loss_year)
            loss_rating_log.append(loss_rating)
            #mean_loss_log.append(loss)
            train_epoch_loss.append(loss.item())

            # pbar.set_postfix({
            #     "genre": f"{loss_genre.item():.4f}",
            #     "theme": f"{loss_theme.item():.4f}",
            #     "year": f"{loss_year.item():.4f}",
            #     "rating": f"{loss_rating.item():.4f}",
            #     "mean": f"{loss.item():.4f}",
            # })

        val_loss_genere_log, val_loss_theme_log, val_loss_year_log, val_loss_rating_log = [], [], [], []
        val_loss_log = []
        val_epoch_loss = []
        model.eval()
        with torch.no_grad():
            for batch in tqdm(test_dataloader, desc=f"Training, epoch {epoch}", leave=False):
            
                film, y_genres, y_themes, y_year, y_rating = batch
                pred_genres, pred_themes, pred_year, pred_rating = model(film)

                loss_genre = nn.BCEWithLogitsLoss()(pred_genres, y_genres.float())
                loss_theme = nn.BCEWithLogitsLoss()(pred_themes, y_themes.float())
                loss_year = nn.MSELoss()(pred_year.squeeze(), y_year)
                loss_rating = nn.MSELoss()(pred_rating.squeeze(), y_rating)

                loss = (loss_genre + loss_theme + loss_year + loss_rating)/4

                val_loss_genere_log.append(loss_genre)
                val_loss_theme_log.append(loss_theme)
                val_loss_year_log.append(loss_year)
                val_loss_rating_log.append(loss_rating)
                val_loss_log.append(loss)
            val_epoch_loss.append(sum(val_loss_log)/len(val_loss_log))


            train_mean = np.mean(train_epoch_loss)
            mean_loss_log.append(train_mean)
            val_mean = np.mean(val_loss_log)
            val_loss_log.append(val_mean)

            clear_output()

            plot_history(mean_loss_log, val_loss_log, "loss")

        # clear_output()

        # print(f"Train loss (epoch {epoch}):", train_mean)
        # print(f"Val loss (epoch {epoch}):", val_mean)
        
        

        print(f"Train loss (epoch {epoch}):")
        print(f"  genre:  {np.mean([loss.detach().item() for loss in loss_genere_log[-len(train_epoch_loss):]]):.4f}")
        print(f"  theme:  {np.mean([loss.detach().item() for loss in loss_theme_log[-len(train_epoch_loss):]]):.4f}")
        print(f"  year:   {np.mean([loss.detach().item() for loss in loss_year_log[-len(train_epoch_loss):]]):.4f}")
        print(f"  rating: {np.mean([loss.detach().item() for loss in loss_rating_log[-len(train_epoch_loss):]]):.4f}")
        print(f"  mean:   {train_mean:.4f}")

        print(f"\nVal loss (epoch {epoch}):")
        print(f"  genre:  {np.mean([loss.detach().item() for loss in val_loss_genere_log]):.4f}")
        print(f"  theme:  {np.mean([loss.detach().item() for loss in val_loss_theme_log]):.4f}")
        print(f"  year:   {np.mean([loss.detach().item() for loss in val_loss_year_log]):.4f}")
        print(f"  rating: {np.mean([loss.detach().item() for loss in val_loss_rating_log]):.4f}")
        print(f"  mean:   {val_mean:.4f}")


In [172]:
if PRETRAINED_BLENDINGNN:
    blender.load_state_dict(torch.load(PRETRAINED_BLENDINGNN_PATH))
else:
    train(blender, optimizer, train_dataloader, test_dataloader, n_epochs=100)

In [173]:
if GET_BLENDNN_EMB:
    nn_embeddings = torch.load(BLENDNN_EMB_PATH)
else:
    nn_embeddings = torch.stack([
        blender.get_embedding(emb).detach() for emb in films_embeddings
    ])

In [174]:
if SAVE_CHANGES:
    torch.save(blender.state_dict(), "film_prepare_nn_weights_final.pth")

    torch.save(films_embeddings, "films_embeddings.pt")
    torch.save(lda_matrix, "lda_embeddings.pt")
    torch.save(bert_emb, "bert_embeddings.pt")
    torch.save(sgramms, "sg_embeddings.pt")

    torch.save(sg_model, "sgmodel.pth")
    joblib.dump(lda, "models/lda_model.pkl")

### Inference

In [175]:
test_idx = 71
test_emb = films_embeddings[test_idx]
blender.get_embedding(test_emb).data


tensor([ 0.2008,  0.6989,  0.2834,  0.3809,  0.4038,  0.0073,  0.3233,  0.1286,
         0.2826,  0.3149,  0.4274,  0.7273,  0.2387, -0.1917,  0.1168,  0.2919,
        -0.1565, -0.3877,  0.6940, -0.4775,  0.0930, -0.1887, -0.4946,  0.5394,
         0.1135, -0.1649,  0.1573, -0.0738, -0.7115, -0.1881, -0.6489, -0.2542,
         0.5220, -0.4802,  0.4658, -0.1466,  0.1152, -0.0405,  0.1043,  0.2168,
         0.5043, -0.5603,  0.3438,  0.9217,  0.5905, -0.2420,  0.3028, -0.0027,
        -0.4311,  0.0638, -0.0950,  0.3762,  0.9601, -0.4661,  0.4256,  0.3660,
         0.2240, -0.0157,  0.4371,  0.1747,  0.5872, -0.1158,  0.4128,  0.2197,
         0.0770,  0.3567, -0.1960,  0.7006,  0.2464, -0.0313, -0.4827, -0.5695,
        -0.0864,  0.1100,  0.1809, -0.4178, -0.8826, -0.1242, -0.2817, -0.1705,
         0.7734, -0.0419,  0.1353, -0.1761, -0.0544, -0.1061,  0.0548, -0.3134,
        -0.3163, -0.5995,  0.2614, -0.2326, -0.7287, -0.2513, -0.6858,  0.4414,
         0.5033,  0.2642, -0.9918, -0.18

In [176]:
find_topk_similar_films(nn_embeddings, 71)

{539: 0.91705, 29859: 0.91194, 5404: 0.91159, 183: 0.90762, 711: 0.907}

In [178]:
print_similar_films(prepared_df, nn_embeddings, 71, k=10)

The Lord of the Rings: The Fellowship of the Ring
The Hobbit: An Unexpected Journey -  0.91705
Happily Ever After -  0.91194
Jason and the Argonauts -  0.91159
Princess Mononoke -  0.90762
The Hobbit: The Battle of the Five Armies -  0.907
Pan -  0.90643
The Chronicles of Narnia: The Lion, the Witch and the Wardrobe -  0.90554
Tinker Bell and the Legend of the NeverBeast -  0.9049
Ninja -  0.90457
Beyond Skyline -  0.90411


In [179]:
pair_film_similarity(71, 150, prepared_df, nn_embeddings)

The Lord of the Rings: The Fellowship of the Ring - The Lord of the Rings: The Two Towers - 0.7014


In [180]:
print_similar_films(prepared_df, nn_embeddings, 24, k=10)

Spider-Man: Across the Spider-Verse
Superman II -  0.88189
Dark Phoenix -  0.87512
Spider-Man 3 -  0.87484
Hellboy II: The Golden Army -  0.87121
Superman: Brainiac Attacks -  0.87116
The Dark Knight -  0.87114
Legion of Super-Heroes -  0.86859
The Dark Knight Rises -  0.8655
Logan -  0.86448
Iron Man: Rise of Technovore -  0.86417


In [181]:
print_similar_films(prepared_df, bert_emb, 24, k=10)

Spider-Man: Across the Spider-Verse
Superman II -  0.88347
Spider-Man 3 -  0.87807
Dark Phoenix -  0.87725
Superman: Brainiac Attacks -  0.87447
The Dark Knight -  0.87415
Hellboy II: The Golden Army -  0.87366
Legion of Super-Heroes -  0.87037
The Dark Knight Rises -  0.8677
Iron Man: Rise of Technovore -  0.8673
Logan -  0.86717


Result of blending NN is similar to BERT embeddings. Despite BERT is already powerfull pretrained model, I'm not fully satisfied with results. Maybe it is becouse of small amount of data and we could parse TFDB or IMDB with full synopsises, which are double size larger than regular descriptions from LB