In [1]:
import torch
import pickle
import pandas as pd
import numpy as np
from src.train import *
from src.processing import *
from src.models import *
from src.inference import *
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
import os
import tiktoken
from openai import OpenAI
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer shared by the training cell; logs go under the local 'runs' directory.
writer = SummaryWriter(log_dir='runs')

# SECURITY: never hardcode the OpenAI key in the notebook - it leaks through version
# control and shared copies. Export OPENAI_API_KEY in the environment that launches the
# kernel; the OpenAI client picks it up from there.
if not os.environ.get("OPENAI_API_KEY"):
    print("OPENAI_API_KEY is not set; the OpenAI embedding option will be unavailable.")

# Seed the torch and legacy numpy global RNGs with the same value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Release cached GPU blocks left over from earlier runs in this kernel.
torch.cuda.empty_cache()

Using device: cuda


In [5]:
# Item vocabulary (project class pulled in by the star imports); populated by
# vocab.build_vocab(...) later in this cell.
vocab = Vocabulary()

# Text backbone: pretrained BERT-base (uncased) with its matching tokenizer.
BERT_CHECKPOINT = 'google-bert/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)
model = model.to(device)

# Alternative backbones kept for reference (not used in this run):

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
# model = DistilBertModel.from_pretrained('distilbert/distilbert-base-uncased')

# tokenizer = BertTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
# model = BertModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# openai_model = "text-embedding-3-large"
# tokenizer = tiktoken.encoding_for_model("text-embedding-3-large")

# create_ratings_df (project helper) returns the ratings frame, the movie descriptions
# and the metadata frame that the rest of the notebook works with.
data_paths = {
    'links_path': 'archive/links.csv',
    'movies_metadata_path': 'archive/movies_metadata.csv',
    'ratings_path': 'archive/ratings.csv',
}
ratings_df, movie_descriptions, movies_metadata = create_ratings_df(
    number_of_movies=7500, **data_paths
)

# The vocabulary is built over every sequence, before the train/validation split.
sequences = get_sequences(ratings_df)
vocab.build_vocab(sequences)

# 80/20 split with a fixed random_state so the split is repeatable.
train_sentences, val_sentences = train_test_split(
    sequences, test_size=0.2, random_state=42
)

# With encode_descriptions=True the helper also returns the encoded descriptions,
# which both datasets below share.
# NOTE(review): max_len=100 is passed only for the training split; the validation call
# relies on prepare_dataset's default - confirm the two agree.
train_data, film_descriptions_encoded = prepare_dataset(
    train_sentences, movie_descriptions, tokenizer, vocab,
    encode_descriptions=True, max_len=100,
)
val_data = prepare_dataset(val_sentences, movie_descriptions, tokenizer, vocab)

train_dataset = FilmRecommendationDataset(train_data, film_descriptions_encoded)
val_dataset = FilmRecommendationDataset(val_data, film_descriptions_encoded)

# Training hyperparameters (consumed by the train_clip cell).
epochs, batch_size, lr = 6, 64, 0.0001

# Only the training loader reshuffles; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

  movies_metadata = pd.read_csv(movies_metadata_path)
100%|██████████| 865083/865083 [00:03<00:00, 234166.15it/s]
100%|██████████| 7315/7315 [00:08<00:00, 835.28it/s]
100%|██████████| 216271/216271 [00:00<00:00, 307205.94it/s]


In [6]:
# Number of mini-batches the training loader yields per epoch.
len(train_loader)

13517

In [7]:
# Shared embedding width for both encoders.
emb_dim = 768

# NOTE(review): `seq_len` is never assigned in this notebook - presumably it arrives via
# one of the `from src.* import *` lines; confirm, otherwise this fails on a fresh kernel.
num_items = len(vocab.word_to_index)
film_encoder = SASFilmEncoder(
    item_num=num_items,
    seq_len=seq_len,
    embed_dim=emb_dim,
    device=device,
)
text_encoder = TextEncoder(model, output_dim=emb_dim, add_fc_layer=True)

# Re-seed torch's global RNG right before training starts.
torch.manual_seed(42)

<torch._C.Generator at 0x7f97880422d0>

In [11]:
# Train both encoders; metrics are reported through the TensorBoard writer and the
# 'artifacts' folder is handed to train_clip as its output folder.
train_clip(
    film_encoder,
    text_encoder,
    train_loader,
    val_loader,
    epochs=epochs,
    lr=lr,
    device=device,
    iter_verbose=1500,
    folder='artifacts',
    writer=writer,
)

# Push any buffered TensorBoard events to disk.
writer.flush()

In [None]:
# Close the TensorBoard writer once training/logging is finished.
writer.close()

In [13]:
def _dump_pickle(obj, path):
    """Serialize `obj` to `path` with pickle, closing the file even on error."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


# Make sure the output folder exists so this cell also works when run on its own.
os.makedirs('artifacts', exist_ok=True)

# torch.save(film_encoder.state_dict(), 'artifacts/film_encoder_weights_final.pth')
# torch.save(text_encoder.state_dict(), 'artifacts/text_encoder_weights_final.pth')

torch.save(train_dataset, 'artifacts/train_dataset.pt')
torch.save(val_dataset, 'artifacts/val_dataset.pt')

# One entry per pickled artifact; the inference cells read some of these back by path.
_pickle_artifacts = {
    'artifacts/ratings_df.pickle': ratings_df,
    'artifacts/movie_descriptions.pickle': movie_descriptions,
    'artifacts/sequences.pickle': sequences,
    'artifacts/vocab.pickle': vocab,
    'artifacts/film_descriptions_encoded.pickle': film_descriptions_encoded,
    'artifacts/movies_metadata.pickle': movies_metadata,
}
for _path, _obj in _pickle_artifacts.items():
    _dump_pickle(_obj, _path)

In [8]:
# Alternative watch history (romance-leaning), kept for quick swapping:
# list_movies = ["Only Lovers Left Alive",
#                "The Twilight Saga: Eclipse",
#                "Me Before You",
#                "(500) Days of Summer"]

# Watch history passed to the sequence-based inference calls below.
list_movies = [
    "Minions",
    "Zootopia",
    "Shrek",
    "Kung Fu Panda",
]

In [3]:
# Pretrained BERT backbone and tokenizer handed to the Inference wrapper.
BERT_CHECKPOINT = 'google-bert/bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
bert_model = bert_model.to(device)

# Reload the artifacts written by the save cell.
# NOTE: unpickling executes arbitrary code - only load files produced by this project.
vocab = pd.read_pickle('artifacts/vocab.pickle')
movies_metadata = pd.read_pickle('artifacts/movies_metadata.pickle')
film_descriptions_encoded = pd.read_pickle('artifacts/film_descriptions_encoded.pickle')

In [5]:
# Settings shared by the Inference wrapper and the Annoy search engine.
dim = 768                    # embedding width
num_trees = 12               # tree count passed to the Annoy engine
search_type = 'euclidean'    # distance setting passed to the Annoy engine

In [5]:
# build and save
#
# Creates the inference wrapper from the epoch-4 checkpoints, embeds every film and
# description, builds the Annoy indexes, writes them to disk and then loads them back
# into the wrapper.
# NOTE(review): `seq_len` is not assigned anywhere in this notebook - presumably it
# comes from a `from src.* import *` line; confirm.

inference = Inference(
    film_encoder_path='artifacts/film_encoder_weights_4.pth',
    text_encoder_path='artifacts/text_encoder_weights_4.pth',
    vocab=vocab,
    dim=dim,
    movies_metadata=movies_metadata,
    seq_len=seq_len,
    device=device,
    bert_model=bert_model,
    bert_tokenizer=bert_tokenizer,
)

film_embeddings, text_embeddings = inference.get_embeddings(film_descriptions_encoded, batch_size=32)

annoy_model = AnnoySearchEngine(
    dim=dim,
    num_trees=num_trees,
    search_type=search_type,
)
# NOTE(review): build_trees receives (film, text) embeddings while the path arguments
# below are ordered (text, film) - confirm this matches the signatures in src.inference.
annoy_model.build_trees(film_embeddings, text_embeddings)
annoy_model.save_indexes('artifacts/text_index.ann', 'artifacts/film_index.ann', 'artifacts/idx_to_movieId.pickle')

# Use the shared `num_trees` setting instead of a second hardcoded 12, so the value used
# to build the index and the one passed at load time cannot drift apart.
inference.init_annoy_model(
    'artifacts/text_index.ann',
    'artifacts/film_index.ann',
    'artifacts/idx_to_movieId.pickle',
    num_trees=num_trees,
)

100%|██████████| 229/229 [00:12<00:00, 19.01it/s]
100%|██████████| 7314/7314 [00:01<00:00, 4462.45it/s]


In [15]:
# Take the overview text of the first metadata row with this title and use it as the
# query; here the search runs with in_films=False.
matching_rows = movies_metadata.query('title=="The Devil Wears Prada"')
overview = matching_rows['overview'].values[0]
inference.search_text(overview, in_films=False)

['The Devil Wears Prada',
 'Slumdog Millionaire',
 'Limitless',
 'Scott Pilgrim vs. the World',
 'Interstellar',
 'Bloodsucking Bastards',
 'The Judge',
 'Boyhood',
 'Harry Potter and the Goblet of Fire',
 "Everyone's Hero"]

In [6]:
# load and init
#
# Rebuilds the inference wrapper from the epoch-4 checkpoints and attaches the Annoy
# indexes that were previously written to disk (no re-embedding).
# NOTE(review): `seq_len` is not assigned anywhere in this notebook - presumably it
# comes from a `from src.* import *` line; confirm.

inference = Inference(
    film_encoder_path='artifacts/film_encoder_weights_4.pth',
    text_encoder_path='artifacts/text_encoder_weights_4.pth',
    vocab=vocab,
    dim=dim,
    movies_metadata=movies_metadata,
    seq_len=seq_len,
    device=device,
    bert_model=bert_model,
    bert_tokenizer=bert_tokenizer,
)

# Use the shared `num_trees` setting rather than repeating the literal 12, keeping this
# call consistent with the configuration cell.
inference.init_annoy_model(
    'artifacts/text_index.ann',
    'artifacts/film_index.ann',
    'artifacts/idx_to_movieId.pickle',
    num_trees=num_trees,
)

In [13]:
# Same text query as before, but for another title and with in_films=True.
matching_rows = movies_metadata.query('title=="Megamind"')
overview = matching_rows['overview'].values[0]
inference.search_text(overview, in_films=True)

['Megamind',
 'The Animal',
 'Highlander: The Search for Vengeance',
 'Justice League: War',
 'Batman Unlimited: Animal Instincts',
 'All Star Superman',
 'Superman/Batman: Public Enemies',
 'Justice League: Crisis on Two Earths',
 'Batman: Mystery of the Batwoman',
 'Fantastic Four']

In [18]:
# Recommend follow-up titles from the watch history in `list_movies`.
inference.predict_next_movie(list_movies)

['Ice Age',
 'Shrek 2',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Monsters, Inc.',
 'Frozen',
 'How to Train Your Dragon 2',
 'Cars',
 'Despicable Me 2',
 'Kung Fu Panda 2',
 'Despicable Me']

In [10]:
# Combined query: the watch history plus a free-text description.
# NOTE(review): `ration` is the keyword exactly as this call passes it; presumably it
# balances the sequence signal against the text signal - confirm in src.inference.
query_text = 'The film is about the princess in the tower, who was saved by the prince'
inference.search_film_by_sequence_and_text(list_movies, query_text, ration=0.1)

["Dante's Inferno: An Animated Epic",
 'A Glimpse Inside the Mind of Charles Swan III',
 '61*',
 'The Ring Two',
 'In Fear',
 'Empire of Dreams: The Story of the Star Wars Trilogy',
 'Source Code',
 'I Am Bruce Lee',
 'Roll Bounce',
 'Fame']