In [1]:
from src import pipeline, model, training, data_containers
import torch
import time
import numpy as np

BATCH_SIZE = 8192

class EarlyStopper:

    def __init__(self, patience: int):
        self.patience_left = patience
        self.patience = patience
        self.best_loss = float("inf")

    def should_stop(self, loss: float) -> bool:
        self.patience_left -= 1
        if self.patience_left == 0:
            return True
        if loss < self.best_loss:
            self.best_loss = loss
            self.patience_left = self.patience
        return False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

user_data, movies = pipeline.clean_data()
formatted_user_data = training.format_user_data(user_data)
train, test = training.train_test_split(formatted_user_data, 0.1)

def calculate_loss(model_out: torch.Tensor, batch: data_containers.UserDataBatch, device: torch.device) -> torch.Tensor:
    model_out = model_out*batch.relevancy_vector.to(device)
    euclidian_distance = (model_out - batch.target_user_rating_vector.to(device))**2
    loss = euclidian_distance.sum(dim=1) / batch.n_masked_ratings.to(device)
    return loss.sum()

for n_neurons in [4096]:
    clf = model.Recommender(len(movies), n_neurons).to(device)
    optim = torch.optim.Adam(clf.parameters(), lr=0.0001)
    stopper = EarlyStopper(patience=5)

    atrain_losses = []
    test_losses = []
    train_losses = []

    print(f"Starting Training with {n_neurons} neurons...")

    for epoch in range(1, 100):
        
        train = train.sample(frac=1) # shuffle

        total_train_loss = 0
        total_test_loss = 0
        current_index = 0
        epoch_start = time.time()

        while current_index < len(train):
            batch_data = train.iloc[current_index:(current_index+BATCH_SIZE)]
            current_index += BATCH_SIZE
            masked = batch_data.apply(lambda x: x.mask_values())
            batch = training.masked_user_data_to_batch(masked, len(movies))

            model_out = clf(batch.input_user_rating_vector.to(device))
            loss = calculate_loss(model_out, batch, device)
            total_train_loss += loss.item()
            loss.backward()
            optim.step()
            optim.zero_grad()


        current_index = 0
        clf.eval()
        while current_index < len(test):
            batch_data = test.iloc[current_index:(current_index+BATCH_SIZE)]
            current_index += BATCH_SIZE
            masked = batch_data.apply(lambda x: x.mask_values())
            batch = training.masked_user_data_to_batch(masked, len(movies))
            with torch.no_grad():
                model_out = clf(batch.input_user_rating_vector.to(device))
                loss = calculate_loss(model_out, batch, device)
            total_test_loss += loss.item()
        clf.train()

        total_train_loss = round(total_train_loss / len(train), 6)
        total_test_loss = round(total_test_loss / len(test), 6)

        train_losses.append(total_train_loss)
        test_losses.append(total_test_loss)

        if total_test_loss < stopper.best_loss:
             torch.save(clf, f"model_{n_neurons}.pt")

        if stopper.should_stop(total_test_loss):
            print("Early Stopping")
            break

        epoch_time = int(time.time() - epoch_start)
        print(f"[EPOCH {epoch}] Total Training Loss: {total_train_loss} | Total Test Loss: {total_test_loss} | Time: {epoch_time}s")
    
    np.save(f"train_losses_{n_neurons}.npy", np.array(train_losses))
    np.save(f"test_losses_{n_neurons}.npy", np.array(test_losses))

Cleaning data...
Data cleaned!
Starting Training with 4096 neurons...
[EPOCH 1] Total Training Loss: 15.721486 | Total Test Loss: 11.016048 | Time: 132s
[EPOCH 2] Total Training Loss: 8.586202 | Total Test Loss: 7.830708 | Time: 127s
[EPOCH 3] Total Training Loss: 6.718593 | Total Test Loss: 6.544724 | Time: 125s
[EPOCH 4] Total Training Loss: 5.756086 | Total Test Loss: 5.829314 | Time: 125s


In [8]:
from src import pipeline, model, training, data_containers
import torch
import time
import numpy as np

BATCH_SIZE = 8192

class EarlyStopper:

    def __init__(self, patience: int):
        self.patience_left = patience
        self.patience = patience
        self.best_loss = float("inf")

    def should_stop(self, loss: float) -> bool:
        self.patience_left -= 1
        if self.patience_left == 0:
            return True
        if loss < self.best_loss:
            self.best_loss = loss
            self.patience_left = self.patience
        return False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

user_data, movies = pipeline.clean_data()
formatted_user_data = training.format_user_data(user_data)
train, test = training.train_test_split(formatted_user_data, 0.1)

Cleaning data...
Data cleaned!


In [84]:
sample_user = test.iloc[0]

a = movies[movies["new_id"].isin(sample_user.film_ids)]
a["ratings"] = sample_user.ratings

vector = torch.zeros(1, len(movies))

for _, row in a.iterrows():
    vector[0, row["new_id"]] = row["ratings"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [85]:
a

Unnamed: 0,film,year,title,new_id,ratings
82,758,2004,Mean Girls,82,0.580645
114,1046,2003,Uptown Girls,114,0.580645
136,1307,2003,S.W.A.T.,136,0.580645
156,1561,2003,American Wedding,156,-1.419355
206,1962,2004,50 First Dates,206,-0.419355
276,2580,2003,Freaky Friday,276,0.580645
285,2699,2003,The Missing,285,1.580645
346,3256,2003,Swimming Pool,346,-0.419355
606,5401,2004,Dodgeball: A True Underdog Story,606,-1.419355
643,5692,2000,"Dude, Where's My Car?",643,-2.419355


In [75]:
clf = torch.load("runs/model_4096.pt")

clf.eval()

a = clf(vector.to(device)).detach().cpu().numpy()

In [66]:
sample_user

UnmaskedUserData(user=73431, film_ids=array([  82,  114,  136,  156,  206,  276,  285,  346,  606,  643,  689,
        702,  781,  790,  836,  891, 1198, 1250, 1270, 1351, 1368, 1391,
       1399, 1431, 1662, 1719, 1729, 1732, 1739, 1958, 1994], dtype=int64), ratings=array([ 0.58064516,  0.58064516,  0.58064516, -1.41935484, -0.41935484,
        0.58064516,  1.58064516, -0.41935484, -1.41935484, -2.41935484,
        1.58064516,  0.58064516,  0.58064516,  0.58064516, -0.41935484,
       -0.41935484,  0.58064516, -0.41935484,  0.58064516, -0.41935484,
       -0.41935484, -1.41935484, -0.41935484,  1.58064516,  0.58064516,
        0.58064516, -0.41935484,  0.58064516, -0.41935484, -0.41935484,
       -0.41935484]))

In [56]:
clf = torch.load("runs/model_4096.pt")

clf.eval()

lilo_and_stitch = torch.zeros(1, len(movies))
lilo_and_stitch[0, 1264] = 1

a = clf(lilo_and_stitch.to(device)).detach().cpu().numpy()

In [2]:
import pandas as pd
import numpy as np

data1 = np.load("runs/test_losses_512.npy")
data2 = np.load("runs/test_losses_1024.npy")
data3 = np.load("runs/test_losses_2048.npy")
data4 = np.load("runs/test_losses_4096.npy")
data5 = np.load("runs/test_losses_8192.npy")


data_1 = [(i, l, 512) for i, l in enumerate(data1, start=1)]
data_2 = [(i, l, 1024) for i, l in enumerate(data2, start=1)]
data_3 = [(i, l, 2048) for i, l in enumerate(data3, start=1)]
data_4 = [(i, l, 4096) for i, l in enumerate(data4, start=1)]
data_5 = [(i, l, 8192) for i, l in enumerate(data5, start=1)]


data = data_1 + data_2 + data_3 + data_5
data = pd.DataFrame(data, columns=["epoch", "loss", "neurons"])

In [5]:
import torch

clf = torch.load("runs/model_8192.pt")


In [3]:
import plotly.express as px
import numpy as np



fig = px.line(
    data,
    x="epoch",
    y="loss",
    color="neurons",
)
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [76]:
import numpy as np

np.argsort(a[0])[-10:]

array([1732,  790, 1719,  136,  781,  702,  114,  285,  689, 1431],
      dtype=int64)

In [82]:
movies.query("new_id == 781")

Unnamed: 0,film,year,title,new_id
781,6859,2003,Cheaper by the Dozen,781


In [16]:
movies[movies["title"].str.contains("Star Wars")]628,945,994,1097,1120,1860

Unnamed: 0,film,year,title,new_id
628,5582,1980,Star Wars: Episode V: The Empire Strikes Back,628
945,8292,2004,Star Wars: Clone Wars: Vol. 1,945
994,8687,2002,Star Wars: Episode II: Attack of the Clones,994
1097,9628,1983,Star Wars: Episode VI: Return of the Jedi,1097
1120,9886,1999,Star Wars: Episode I: The Phantom Menace,1120
1860,16265,1977,Star Wars: Episode IV: A New Hope,1860


In [55]:
movies.sample()

Unnamed: 0,film,year,title,new_id
1264,11149,2002,Maid in Manhattan,1264
