In [None]:
from src import pipeline, model, training, data_containers
import torch
import time
import numpy as np

# Number of rows sliced from the train/test frames per optimisation/evaluation step.
BATCH_SIZE = 8192

class EarlyStopper:
    """Signal when a monitored loss has stopped improving.

    The stopper remembers the lowest loss seen so far and counts how many
    consecutive calls failed to beat it. Once that count reaches ``patience``
    the caller is told to stop.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, patience: int):
        """
        Args:
            patience: number of consecutive non-improving calls to
                ``should_stop`` that are tolerated before it returns True.
        """
        # Remaining non-improving calls before stopping; reset on improvement.
        self.patience_left = patience
        # Configured budget, kept so the counter can be reset.
        self.patience = patience
        # Lowest loss observed so far; inf so the first real loss always improves.
        self.best_loss = float("inf")

    def should_stop(self, loss: float) -> bool:
        """Record ``loss`` and report whether training should stop.

        Args:
            loss: the loss value for the epoch that just finished.

        Returns:
            True once ``patience`` consecutive calls have failed to improve on
            ``best_loss``; False otherwise.
        """
        # Check for improvement first: an epoch that sets a new best must never
        # trigger a stop, even if it arrives on the last remaining patience tick.
        if loss < self.best_loss:
            self.best_loss = loss
            self.patience_left = self.patience
            return False

        self.patience_left -= 1
        # `<=` rather than `==` so a patience of 0 stops instead of counting
        # down past zero forever.
        return self.patience_left <= 0

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the cleaned user data and the movie table from the project pipeline,
# convert the user data into the training format, then split it.
user_data, movies = pipeline.clean_data()
formatted_user_data = training.format_user_data(user_data)
# NOTE(review): 0.1 is presumably the held-out (test) fraction — confirm against training.train_test_split.
train, test = training.train_test_split(formatted_user_data, 0.1)

def calculate_loss(model_out: torch.Tensor, batch: data_containers.UserDataBatch, device: torch.device) -> torch.Tensor:
    """Return the summed, per-row normalised squared error for one batch.

    The model output is multiplied element-wise by the batch's relevancy
    vector, compared against the target rating vector, and the squared
    differences are summed along dim 1. Each row sum is divided by the
    batch's ``n_masked_ratings`` entry and the results are added up.

    Args:
        model_out: model predictions for the batch.
        batch: container providing ``relevancy_vector``,
            ``target_user_rating_vector`` and ``n_masked_ratings``.
        device: device the batch tensors are moved to before the arithmetic.

    Returns:
        A scalar tensor (sum over rows; the caller divides by the dataset size).
    """
    relevancy = batch.relevancy_vector.to(device)
    targets = batch.target_user_rating_vector.to(device)
    masked_counts = batch.n_masked_ratings.to(device)

    squared_error = (model_out * relevancy - targets) ** 2
    per_row_loss = squared_error.sum(dim=1) / masked_counts
    return per_row_loss.sum()

# Train one Recommender per hidden size, evaluate on the held-out split after
# every epoch, keep the best checkpoint and persist the loss histories.
for n_neurons in [2048]:
    clf = model.Recommender(len(movies), n_neurons).to(device)
    optim = torch.optim.Adam(clf.parameters(), lr=0.0001)
    stopper = EarlyStopper(patience=5)

    train_losses = []
    test_losses = []

    print(f"Starting Training with {n_neurons} neurons...")

    for epoch in range(1, 100):

        train = train.sample(frac=1) # shuffle

        total_train_loss = 0
        total_test_loss = 0
        current_index = 0
        epoch_start = time.time()

        # ---- training pass ----
        while current_index < len(train):
            batch_data = train.iloc[current_index:(current_index+BATCH_SIZE)]
            current_index += BATCH_SIZE
            # Build the batch from *this* training slice. Without these two
            # lines `batch` is undefined on the first epoch and, on later
            # epochs, would silently be the last batch of the test pass.
            masked = batch_data.apply(lambda x: x.mask_values())
            batch = training.masked_user_data_to_batch(masked, len(movies))

            model_out = clf(batch.input_user_rating_vector.to(device))
            loss = calculate_loss(model_out, batch, device)
            total_train_loss += loss.item()
            loss.backward()
            optim.step()
            optim.zero_grad()

        # ---- evaluation pass (no gradient tracking, eval-mode layers) ----
        current_index = 0
        clf.eval()
        while current_index < len(test):
            batch_data = test.iloc[current_index:(current_index+BATCH_SIZE)]
            current_index += BATCH_SIZE
            masked = batch_data.apply(lambda x: x.mask_values())
            batch = training.masked_user_data_to_batch(masked, len(movies))
            with torch.no_grad():
                model_out = clf(batch.input_user_rating_vector.to(device))
                loss = calculate_loss(model_out, batch, device)
            total_test_loss += loss.item()
        clf.train()

        # calculate_loss returns a sum over rows, so divide by the row count
        # to get a per-row average for the epoch.
        total_train_loss = round(total_train_loss / len(train), 6)
        total_test_loss = round(total_test_loss / len(test), 6)

        train_losses.append(total_train_loss)
        test_losses.append(total_test_loss)

        # Checkpoint before should_stop() updates best_loss, so this compares
        # against the best loss of the *previous* epochs.
        if total_test_loss < stopper.best_loss:
             torch.save(clf, f"model_{n_neurons}.pt")

        if stopper.should_stop(total_test_loss):
            print("Early Stopping")
            break

        epoch_time = int(time.time() - epoch_start)
        print(f"[EPOCH {epoch}] Total Training Loss: {total_train_loss} | Total Test Loss: {total_test_loss} | Time: {epoch_time}s")

    np.save(f"train_losses_{n_neurons}.npy", np.array(train_losses))
    np.save(f"test_losses_{n_neurons}.npy", np.array(test_losses))

In [1]:
from src import pipeline, model, training, data_containers
import torch
import time
import numpy as np

# Number of rows sliced from the train/test frames per optimisation/evaluation step.
BATCH_SIZE = 8192

class EarlyStopper:
    """Signal when a monitored loss has stopped improving.

    The stopper remembers the lowest loss seen so far and counts how many
    consecutive calls failed to beat it. Once that count reaches ``patience``
    the caller is told to stop.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, patience: int):
        """
        Args:
            patience: number of consecutive non-improving calls to
                ``should_stop`` that are tolerated before it returns True.
        """
        # Remaining non-improving calls before stopping; reset on improvement.
        self.patience_left = patience
        # Configured budget, kept so the counter can be reset.
        self.patience = patience
        # Lowest loss observed so far; inf so the first real loss always improves.
        self.best_loss = float("inf")

    def should_stop(self, loss: float) -> bool:
        """Record ``loss`` and report whether training should stop.

        Args:
            loss: the loss value for the epoch that just finished.

        Returns:
            True once ``patience`` consecutive calls have failed to improve on
            ``best_loss``; False otherwise.
        """
        # Check for improvement first: an epoch that sets a new best must never
        # trigger a stop, even if it arrives on the last remaining patience tick.
        if loss < self.best_loss:
            self.best_loss = loss
            self.patience_left = self.patience
            return False

        self.patience_left -= 1
        # `<=` rather than `==` so a patience of 0 stops instead of counting
        # down past zero forever.
        return self.patience_left <= 0

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the cleaned user data and the movie table from the project pipeline,
# convert the user data into the training format, then split it.
user_data, movies = pipeline.clean_data()
formatted_user_data = training.format_user_data(user_data)
# NOTE(review): 0.1 is presumably the held-out (test) fraction — confirm against training.train_test_split.
train, test = training.train_test_split(formatted_user_data, 0.1)

Cleaning data...
Data cleaned!


In [2]:
# Display the movie table returned by pipeline.clean_data().
movies

Unnamed: 0,film,year,title,new_id
0,8,2004,What the #$*! Do We Know!?,0
1,18,1994,Immortal Beloved,1
2,28,2002,Lilo and Stitch,2
3,30,2003,Something's Gotta Give,3
4,58,1996,Dragonheart,4
...,...,...,...,...
2037,17697,2004,New York Minute,2037
2038,17703,2003,Hulk,2038
2039,17709,1992,A River Runs Through It,2039
2040,17762,1997,Gattaca,2040


In [None]:
# Export the movie table to a pipe-delimited file, without the index column.
movies.to_csv("movies.csv", index=False, sep="|")

In [None]:
# Build a (1, n_movies) input vector from the first user of the test split.
sample_user = test.iloc[0]

# Pair every film id with its own rating up front. The rows selected below come
# back in `movies` order, which is not guaranteed to match the order of
# `film_ids`, so attaching `ratings` positionally could mislabel films.
# NOTE(review): assumes film_ids holds plain/NumPy integers that match
# movies["new_id"] — confirm against training.format_user_data.
rating_by_film = dict(zip(sample_user.film_ids, sample_user.ratings))

# .copy() so the new column is written to an independent frame rather than a
# slice of `movies` (avoids SettingWithCopyWarning).
a = movies[movies["new_id"].isin(list(rating_by_film))].copy()
a["ratings"] = a["new_id"].map(rating_by_film)

vector = torch.zeros(1, len(movies))

for new_id, rating in zip(a["new_id"], a["ratings"]):
    vector[0, int(new_id)] = float(rating)

In [None]:
# Display the current value of `a`.
a

In [None]:
# Load the saved model and score the sample user's vector.
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models like this one; pass weights_only=False there.
# map_location keeps the load working on machines without the device the model
# was saved from, and places the model on the same device as the input.
clf = torch.load("runs/model_4096.pt", map_location=device)

clf.eval()

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    a = clf(vector.to(device)).cpu().numpy()

In [None]:
# Display the sample user record taken from the test split.
sample_user

In [None]:
# Probe the model with a single-film input: only "Lilo and Stitch" is set.
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models like this one; pass weights_only=False there.
clf = torch.load("runs/model_4096.pt", map_location=device)

clf.eval()

# Look the index up by title instead of hard-coding it: the previous literal
# (1264) does not match the new_id shown for this title in the movies table.
lilo_id = int(movies.loc[movies["title"] == "Lilo and Stitch", "new_id"].iloc[0])

lilo_and_stitch = torch.zeros(1, len(movies))
lilo_and_stitch[0, lilo_id] = 1

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    a = clf(lilo_and_stitch.to(device)).cpu().numpy()

In [None]:
import pandas as pd
import numpy as np

# Hidden sizes whose saved test-loss histories are compared.
# The 4096 run is included: it was previously loaded but left out of the
# concatenation. Remove it here if that omission was deliberate.
LOSS_CURVE_NEURONS = [512, 1024, 2048, 4096, 8192]

# One (epoch, loss, neurons) record per saved epoch; epochs are 1-based.
loss_records = []
for width in LOSS_CURVE_NEURONS:
    loss_history = np.load(f"runs/test_losses_{width}.npy")
    loss_records.extend(
        (epoch, epoch_loss, width)
        for epoch, epoch_loss in enumerate(loss_history, start=1)
    )

data = pd.DataFrame(loss_records, columns=["epoch", "loss", "neurons"])

In [None]:
import torch

# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models; pass weights_only=False there.
# map_location lets the checkpoint load on a machine without CUDA.
clf = torch.load(
    "runs/model_8192.pt",
    map_location="cuda" if torch.cuda.is_available() else "cpu",
)


In [None]:
import plotly.express as px
import numpy as np

# One line per `neurons` value, loss against epoch. The title and axis labels
# let the figure stand on its own when the notebook is skimmed.
fig = px.line(
    data,
    x="epoch",
    y="loss",
    color="neurons",
    title="Loss per epoch by number of neurons",
    labels={"epoch": "Epoch", "loss": "Loss", "neurons": "Neurons"},
)
fig

In [None]:
import numpy as np

# Indices of the 10 largest values in the first row of `a`, ordered from
# smallest to largest value (the highest-scoring index is last).
np.argsort(a[0])[-10:]

In [None]:
# Show the movie row whose new_id is 781.
# NOTE(review): 781 is presumably an index picked from a model output above — confirm.
movies.query("new_id == 781")

In [None]:
# List every movie whose title contains "Star Wars".
# NOTE(review): the numbers below were pasted directly after the expression,
# which made the cell a syntax error; kept here as a note. Presumably they are
# the new_id values of the matches — confirm.
# 628, 945, 994, 1097, 1120, 1860
movies[movies["title"].str.contains("Star Wars")]

In [None]:
# Show one randomly chosen row of the movie table.
movies.sample()

In [None]:
import os

import pandas as pd

# The default is a machine-specific absolute path. Set the MOVIE_DATA_PATH
# environment variable to point at the file on any other machine.
MOVIE_DATA_PATH = os.environ.get(
    "MOVIE_DATA_PATH",
    r"C:\Users\johnn\Desktop\netflix_recommendation\Source\data\movie_data.csv",
)

# Pipe-delimited movie metadata.
data = pd.read_csv(MOVIE_DATA_PATH, sep="|")

In [None]:
# Display the current `data` DataFrame.
data

In [None]:
import requests
image_url = "https://www.imdb.com/title/tt0399877/mediaviewer/rm1940363520/?ref_=tt_ov_i"

# Download the image from the url and save it to disk.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(image_url, timeout=30)
img_data = response.content

# Only write the file when the server really returned an image; otherwise an
# error page or HTML document would be saved under a .jpg name.
content_type = response.headers.get("Content-Type", "")
if response.ok and content_type.startswith("image/"):
    with open('image_name.jpg', 'wb') as handler:
        handler.write(img_data)
else:
    print(
        f"Image not saved: {image_url} returned status {response.status_code} "
        f"with content type {content_type!r}"
    )
    

In [None]:
# Fetch the raw HTML of the title page. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
requests.get("https://www.imdb.com/title/tt0121765/", timeout=30).text

In [4]:



import torch

# Example request payload: a list of {movieId, rating} entries.
request_json = {"ratings": [
    {"movieId": 1, "rating": 5},
    {"movieId": 2, "rating": 3},
    {"movieId": 3, "rating": 4}
]}

# NOTE(review): this rebinds the name `model`, which the first cell imports as
# a module (`from src import ... model ...`). Re-running the training cell after
# this one will fail on `model.Recommender` until the import is re-executed.
# NOTE(review): torch.load unpickles arbitrary objects — only load checkpoints
# you trust. Recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models; pass weights_only=False there.
# map_location lets the checkpoint load on a machine without CUDA.
model = torch.load(
    "model_8192.pt",
    map_location="cuda" if torch.cuda.is_available() else "cpu",
)
model.eval()

Recommender(
  (inp): Linear(in_features=2042, out_features=8192, bias=True)
  (inp_bn): BatchNorm1d(8192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (inp_relu): LeakyReLU(negative_slope=0.01)
  (hidden1): Linear(in_features=8192, out_features=8192, bias=True)
  (bn1): BatchNorm1d(8192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): LeakyReLU(negative_slope=0.01)
  (hidden2): Linear(in_features=8192, out_features=8192, bias=True)
  (bn2): BatchNorm1d(8192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): LeakyReLU(negative_slope=0.01)
  (hidden3): Linear(in_features=8192, out_features=8192, bias=True)
  (bn3): BatchNorm1d(8192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (outp): Linear(in_features=8192, out_features=2042, bias=True)
)

In [10]:
# Build the model input from the request: one row, one column per movie,
# holding each rated movie's deviation from the user's mean rating.
# Width and device are taken from the loaded model rather than hard-coded, so
# the cell also works on CPU-only machines and with a different movie count.
model_device = next(model.parameters()).device
n_model_inputs = model.inp.in_features

vec = torch.zeros(1, n_model_inputs, device=model_device)
avg_rating = sum([r["rating"] for r in request_json["ratings"]]) / len(request_json["ratings"])
# NOTE(review): movieId is used directly as the column index, so it is
# presumably the same id space as movies["new_id"] — confirm. A rating equal to
# the mean becomes 0, the same value as an unrated movie.
for rating in request_json["ratings"]:
    vec[0, rating["movieId"]] = (rating["rating"] - avg_rating)

In [12]:
# Inference only: no_grad avoids building an autograd graph for the output.
with torch.no_grad():
    preds = model(vec)

In [28]:
import numpy as np

# How many recommendations to return.
N_RECOMMENDATIONS = 9

# Ids the user already rated; built once instead of once per candidate.
rated_movie_ids = {r["movieId"] for r in request_json["ratings"]}

# argsort is ascending, so the highest-scoring indices are at the end.
predictions_ordered = np.argsort(preds[0].detach().cpu().numpy())
# Keep enough of the tail that N_RECOMMENDATIONS remain even if every rated
# movie is among the top scores.
predictions_ordered = predictions_ordered[-(N_RECOMMENDATIONS+len(request_json["ratings"])):]
# Drop the already-rated movies, then keep the best N (best one last).
without_input_movies = [p for p in predictions_ordered if p not in rated_movie_ids][-N_RECOMMENDATIONS:]

In [29]:
# Display the recommended movie indices (rated movies already excluded).
without_input_movies

[888, 939, 1072, 877, 1028, 422, 1783, 930, 408]