In [2]:
import torch
import pickle
import pandas as pd
import numpy as np
from src.train import *
from src.processing import *
from src.models import *
from src.inference import *
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
import os
import tiktoken
from openai import OpenAI

import warnings  # stdlib; only needed for the credential check below

# Never commit credentials to a notebook: the OpenAI key must already be present
# in the environment (e.g. exported in the shell that launched Jupyter). The
# previous version assigned a literal here, which also overwrote any real key.
if not os.environ.get("OPENAI_API_KEY"):
    warnings.warn(
        "OPENAI_API_KEY is not set; set it in the environment before calling "
        "the OpenAI API from this notebook."
    )

# One seed for every RNG seeded in this cell.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Release cached CUDA memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

Using device: cuda


In [3]:
# Vocabulary object; populated by build_vocab() once the sequences exist.
vocab = Vocabulary()
# Earlier BERT-based setup, kept for reference:
# tokenizer = BertTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
# model = BertModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2').to(device)

# Single source of truth for the OpenAI embedding model: the tiktoken encoding
# is looked up from the same name instead of repeating the string literal, so
# the tokenizer and the embedding model cannot drift apart.
openai_model = "text-embedding-3-large"
tokenizer = tiktoken.encoding_for_model(openai_model)

# Load ratings, movie descriptions and metadata from the CSV archive.
ratings_df, movie_descriptions, movies_metadata = create_ratings_df(
    number_of_movies=7500,
    links_path='CLIP4Rec/archive/links.csv',
    movies_metadata_path='CLIP4Rec/archive/movies_metadata.csv',
    ratings_path='CLIP4Rec/archive/ratings.csv'
    )
sequences = get_sequences(ratings_df)
vocab.build_vocab(sequences)

# 80/20 train/validation split of the sequences with a fixed random_state.
train_sentences, val_sentences = train_test_split(sequences, test_size=0.2, random_state=42)

# With encode_descriptions=True the result is unpacked into two values: the
# training data and the encoded film descriptions.
train_data, film_descriptions_encoded = prepare_dataset(
    train_sentences, movie_descriptions, tokenizer, vocab,
    encode_descriptions=True, encode_openai=True, max_len=250
)
# Validation call relies on prepare_dataset's defaults and is bound to a single
# name. NOTE(review): presumably the defaults skip description encoding - confirm.
val_data = prepare_dataset(
    val_sentences, movie_descriptions, tokenizer, vocab
)

# Both datasets share the description encodings produced by the training call.
train_dataset = FilmRecommendationDataset(train_data, film_descriptions_encoded)
val_dataset = FilmRecommendationDataset(val_data, film_descriptions_encoded)

# Training hyperparameters.
epochs = 6
batch_size = 64
lr = 0.0001

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  movies_metadata = pd.read_csv(movies_metadata_path)
100%|██████████| 865083/865083 [00:02<00:00, 351527.02it/s]
100%|██████████| 7315/7315 [00:00<00:00, 10499.78it/s]
100%|██████████| 216271/216271 [00:00<00:00, 243560.71it/s]


In [4]:
# Number of batches the training DataLoader yields in one pass.
len(train_loader)

13517

In [7]:
# Film-sequence encoder sized to the vocabulary.
# NOTE(review): `seq_len` is not assigned in any visible cell of this notebook;
# confirm where it comes from (e.g. one of the `from src... import *` lines).
film_encoder = SASFilmEncoder(
    item_num=len(vocab.word_to_index),
    seq_len=seq_len,
    embed_dim=512,
    device=device,
)

# Previous BERT-based text encoder, kept for reference:
# text_encoder = TextEncoder(model, output_dim=384, add_fc_layer=True)

# Text encoder backed by the OpenAI embedding model named in `openai_model`;
# its output_dim matches the film encoder's embed_dim (both 512).
text_encoder = TextEncoderOpenAI(
    model=openai_model,
    output_dim=512,
    add_fc_layer=True,
    device=device,
)

In [8]:
# Re-seed torch's RNG immediately before training so re-running this cell
# starts from the same torch RNG state.
torch.manual_seed(42)

# Train both encoders with the loaders and hyperparameters defined earlier.
train_clip(
    film_encoder,
    text_encoder,
    train_loader,
    val_loader,
    epochs=epochs,
    lr=lr,
    device=device,
    iter_verbose=3000,
    folder='CLIP4Rec/artifacts',
)

  0%|          | 0/13517 [00:00<?, ?it/s]

  0%|          | 1/13517 [00:23<86:55:30, 23.15s/it]


KeyboardInterrupt: 

In [None]:
# Encoder weights are not saved by this cell; the calls are kept for reference:
# torch.save(film_encoder.state_dict(), 'artifacts/film_encoder_weights_final.pth')
# torch.save(text_encoder.state_dict(), 'artifacts/text_encoder_weights_final.pth')

# Datasets are serialized with torch.save.
for dataset_path, dataset_obj in (
    ('CLIP4Rec/artifacts/train_dataset.pt', train_dataset),
    ('CLIP4Rec/artifacts/val_dataset.pt', val_dataset),
):
    torch.save(dataset_obj, dataset_path)

# Everything else is pickled, one file per object, in this order.
pickled_artifacts = [
    ('CLIP4Rec/artifacts/ratings_df.pickle', ratings_df),
    ('CLIP4Rec/artifacts/movie_descriptions.pickle', movie_descriptions),
    ('CLIP4Rec/artifacts/sequences.pickle', sequences),
    ('CLIP4Rec/artifacts/vocab.pickle', vocab),
    ('CLIP4Rec/artifacts/film_descriptions_encoded.pickle', film_descriptions_encoded),
    ('CLIP4Rec/artifacts/movies_metadata.pickle', movies_metadata),
]
for pickle_path, artifact in pickled_artifacts:
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

In [2]:
# Example titles (animated films). An alternative romance-themed set:
#   "Only Lovers Left Alive", "The Twilight Saga: Eclipse",
#   "Me Before You", "(500) Days of Summer"
list_movies = [
    "Minions",
    "Zootopia",
    "Shrek",
    "Kung Fu Panda",
]

In [3]:
# Settings for the embedding dimension and the Annoy index.
dim = 384
num_trees = 10
search_type = 'euclidean'

# Reload the pickled artifacts.
# NOTE(review): these paths are under 'artifacts/', while the saving cell writes
# under 'CLIP4Rec/artifacts/' - confirm the working directory before running.
vocab = pd.read_pickle('artifacts/vocab.pickle')
movies_metadata = pd.read_pickle('artifacts/movies_metadata.pickle')
film_descriptions_encoded = pd.read_pickle('artifacts/film_descriptions_encoded.pickle')

# Pretrained DistilBERT model and its matching tokenizer.
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [4]:
# Build the embeddings and Annoy indexes, save them to disk, then load them
# into the inference object.
# NOTE(review): `seq_len` and `device` are not assigned in this part of the
# notebook - confirm they are defined in the running kernel.

inference = Inference(
    film_encoder_path='artifacts/film_encoder_weights_final_4.pth',
    text_encoder_path='artifacts/text_encoder_weights_final_4.pth',
    vocab=vocab,
    dim=dim,
    movies_metadata=movies_metadata,
    seq_len=seq_len,
    device=device,
    bert_model=bert_model,
    bert_tokenizer=bert_tokenizer,
)

film_embeddings, text_embeddings = inference.get_embeddings(film_descriptions_encoded, batch_size=32)

annoy_model = AnnoySearchEngine(
    dim=dim,
    num_trees=num_trees,
    search_type=search_type,
)
annoy_model.build_trees(film_embeddings, text_embeddings)

# The same three files are written and immediately read back, so name them once
# (text index, film index, index-to-movieId mapping - same positional order as before).
index_files = (
    'artifacts/text_index.ann',
    'artifacts/film_index.ann',
    'artifacts/idx_to_movieId.pickle',
)
annoy_model.save_indexes(*index_files)
# Reuse `num_trees` rather than a second hard-coded 10, so loading always
# matches the value the index was built with.
inference.init_annoy_model(*index_files, num_trees=num_trees)

100%|██████████| 229/229 [00:08<00:00, 28.22it/s]
100%|██████████| 7314/7314 [00:01<00:00, 7024.77it/s]


In [5]:
# Use the film's own overview text as the query.
panda_rows = movies_metadata.query('title=="Kung Fu Panda"')
overview = panda_rows['overview'].values[0]  # first match; IndexError if the title is absent
inference.search_text(overview, in_films=True)

['Kung Fu Panda',
 'Kung Fu Panda 2',
 'The Mermaid',
 'Kung Fu Dunk',
 'The Man with the Iron Fists 2',
 'Girls Against Boys',
 'Shanghai Knights',
 'Clean',
 'Rise: Blood Hunter',
 'Saving Mr. Wu']

In [11]:
# Load previously saved Annoy indexes into a fresh Inference object (no rebuild).
# `dim` and `num_trees` come from the configuration cell instead of repeating
# the literals 384 and 10, so this cell cannot silently disagree with the
# settings the index files were built with.
# NOTE(review): `seq_len` and `device` are not assigned in this part of the
# notebook - confirm they are defined in the running kernel.

inference = Inference(
    film_encoder_path='artifacts/film_encoder_weights_final_4.pth',
    text_encoder_path='artifacts/text_encoder_weights_final_4.pth',
    vocab=vocab,
    dim=dim,
    movies_metadata=movies_metadata,
    seq_len=seq_len,
    device=device,
    bert_model=bert_model,
    bert_tokenizer=bert_tokenizer,
)

inference.init_annoy_model(
    'artifacts/text_index.ann',
    'artifacts/film_index.ann',
    'artifacts/idx_to_movieId.pickle',
    num_trees=num_trees,
)

In [8]:
# Use the film's own overview text as the query.
megamind_rows = movies_metadata.query('title=="Megamind"')
overview = megamind_rows['overview'].values[0]  # first match; IndexError if the title is absent
inference.search_text(overview, in_films=True)

['Megamind',
 'Despicable Me 2',
 'Teenage Mutant Ninja Turtles: Out of the Shadows',
 'The Lego Movie',
 'Teen Titans: Trouble in Tokyo',
 'LEGO DC Comics Super Heroes: Justice League: Attack of the Legion of Doom!',
 'Superman/Shazam!: The Return of Black Adam',
 'The SpongeBob SquarePants Movie',
 'In the Name of the King 2: Two Worlds',
 'Fantastic 4: Rise of the Silver Surfer']