In [2]:
import pandas as pd
import numpy as np
import torch
import os
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn as nn

from typing import Tuple

# Preprocess whole dataset code:

In [None]:
movie_ratings = os.listdir("/home/leibniz/Desktop/DL_final_prooject/nf_prize_dataset/download/training_set")
column_names = ["user_id", "rating", "date"]
df = pd.DataFrame()

# This code takes 13 hours to complete!!!
for movie in movie_ratings:
    movie_id = int(movie[3:-4])
    print(movie_id)

    data = pd.read_csv("training_set/" + movie, skiprows=[0], names=column_names)

    data["movie_id"] = movie_id

    df = pd.concat([df, data])

df

# Sample to get a subset of the dataset:

In [71]:
# I need to get a sample of user ids but the dataframe is to big to get in memory:
user_ids = []
for df in pd.read_csv("movie_ratings.csv", chunksize=100000):
    user_ids = user_ids + list(df["user_id"].unique())
    # eliminate duplicates:
    user_ids = list(set(user_ids))

In [72]:
len(user_ids)

480189

In [73]:
# Once we have the ids we should get a random sample of 10000
import random
user_ids_sampled = random.sample(user_ids, 50000)

df_out = pd.DataFrame()
for df in pd.read_csv("movie_ratings.csv", chunksize=100000):
    aux = df.loc[df["user_id"].isin(user_ids_sampled)]
    df_out = pd.concat([df_out, aux])

In [36]:
number_of_reviews = df_out.groupby("movie_id")["user_id"].count()

import plotly.graph_objects as go

# Create the histogram trace
histogram_trace = go.Histogram(x=number_of_reviews, nbinsx=1000)

# Create the layout
layout = go.Layout(
    title='Histogram',
    xaxis=dict(title='Values'),
    yaxis=dict(title='Frequency')
)

# Create the figure
fig = go.Figure(data=[histogram_trace], layout=layout)

# Display the figure
fig.show()

In [74]:
df_out.to_csv("movies_sampled.csv")

# Recommender first tests:

In [14]:
data = pd.read_csv("movies_sampled.csv")

In [15]:
# Drop usless columns:
data.drop(["Unnamed: 0.1", "Unnamed: 0"], axis=1, inplace=True)

# Eliminate the users with just one review:
# First count the number of reviews:
users_review_count = data.groupby("user_id").count()
# Eliminate users with less than 5 reviews:
users_review_count = users_review_count.loc[users_review_count["movie_id"] >= 5]
# Get the id of this users that have less than 5 reviews and eliminate them from the dataset:
data = data.loc[data["user_id"].isin(users_review_count.index)]

In [16]:
# We want to map user_ids and movie_ids with a new reindexation so we can do the embedings. For example, if the minimum id that we have for
# a movie is 345 we want to reindex this id to be 1. We will do the same with users.

reindexation_df_users = pd.DataFrame({"user_id": np.sort(data["user_id"].unique())})
reindexation_df_users.index = reindexation_df_users.index

reindexation_df_movies = pd.DataFrame({"movie_id": np.sort(data["movie_id"].unique())})
reindexation_df_movies.index= reindexation_df_movies.index

user_id_mapping = {key: value for key, value in zip(reindexation_df_users["user_id"], reindexation_df_users.index)}
movie_id_mapping = {key: value for key, value in zip(reindexation_df_movies["movie_id"], reindexation_df_movies.index)}

def return_new_id(mappings, id):
    return mappings[id]

data["new_user_id"] = data["user_id"].apply(lambda x: return_new_id(user_id_mapping, x))
data["new_movie_id"] = data["movie_id"].apply(lambda x: return_new_id(movie_id_mapping, x))

# Now we need each node to have a unique id as it is mentioned un the Recommender systems practice by Paula:
# So get the maximum new_id of users and add it to all movie ids:
data["new_movie_id"] = data["new_movie_id"] + data["new_user_id"].max() + 1 # Plus one so the maximum user id and the minimum movie id do not have the same id

In [17]:
# Substitute the idf columns with the new id columns:
data.drop(["user_id", "movie_id"], axis=1, inplace=True)
data = data.rename(columns={"new_user_id": "user_id", "new_movie_id": "movie_id"})

In [87]:
# This code tales arround 1 hour to run!!!!!

# Create artificial negative samples:
non_existing_combinations_list = []
unique_movie_ids = data["movie_id"].unique()
# For each user id we will create negative samples:
for us_id, us_df in data.groupby("user_id"):

    all_combinations = set([(u_id, m_id) for u_id in us_df["user_id"].unique() for m_id in unique_movie_ids])
    existing_combinations = set(zip(us_df["user_id"], us_df["movie_id"]))
    non_existing_combinations = all_combinations - existing_combinations
    # Get a sample of the non existing combinations for each one of the users:
    non_existing_combinations = pd.DataFrame(non_existing_combinations, columns=["user_id", "movie_id"])

    # Aparently there are some users that have been waching a lot of movies and the line "non_existing_combinations.sample(n=4*len(us_df))"
    # returns an error of "ValueError: Cannot take a larger sample than population when 'replace=False'". Let's fix this with an if statement:
    if 4*len(us_df) <= len(non_existing_combinations):
        non_existing_combinations = non_existing_combinations.sample(n=4*len(us_df))
    else:
        print(f"User {us_id} has seen to many movies!")
        non_existing_combinations = non_existing_combinations.sample(n=len(us_df))

    # Store the non_existing_combinations:
    non_existing_combinations_list.append(non_existing_combinations)

neagtive_samples = pd.concat(non_existing_combinations_list)

# Add the column "seen" and specify that the user has not seen the movie:
neagtive_samples["seen"] = 0
# We will store this samples for now as we will concat them with the train, validation and test sets later




# neagtive_samples.to_csv("neagtive_samples.csv")

User 5833 has seen to many movies!
User 7336 has seen to many movies!
User 14609 has seen to many movies!
User 18574 has seen to many movies!
User 22763 has seen to many movies!
User 26615 has seen to many movies!
User 29092 has seen to many movies!
User 34933 has seen to many movies!
User 35893 has seen to many movies!
User 35963 has seen to many movies!
User 39876 has seen to many movies!
User 48100 has seen to many movies!


In [18]:
neagtive_samples = pd.read_csv("neagtive_samples.csv")

In [19]:
# For test we will keep the last movie a user has seen and for validation the penultimate.
# If he has seen more than one movie in the last day we will get a random one
validation_rows = []
test_rows = []
# All the other entries will be stored to train dataset:
train_dataframes = []

for us_id, us_df in data.groupby("user_id"):
    # Get the data from the user and sort it by date:
    us_df = us_df.sort_values(by="date")

    # Get the last two entries and add them to the test and validation dataframes:
    validation_rows.append(us_df.iloc[-2])
    test_rows.append(us_df.iloc[-1])

    # Store the rest of the data for the train:
    train_dataframes.append(us_df.iloc[:-2])

# Create the dataframes with the rows we have stored:
validation = pd.DataFrame(validation_rows, columns=data.columns)
test = pd.DataFrame(test_rows, columns=data.columns)
train = pd.concat(train_dataframes)
print("Train")
print(len(train))
print("Validation:")
print(len(validation))
print("Test:")
print(len(test))

Train
10372063
Validation:
49247
Test:
49247


In [20]:
# Add the variable "seen" with indicates if the user has seen the film:
train["seen"] = 1
validation["seen"] = 1
test["seen"] = 1

# drop the date and ratings columns:
train.drop(["rating", "date"], axis=1, inplace=True)
validation.drop(["rating", "date"], axis=1, inplace=True)
test.drop(["rating", "date"], axis=1, inplace=True)

In [21]:
# Add negative samples for each train.  Validation and test sets:
# First shuffle the dataset just in case:
neagtive_samples = neagtive_samples.sample(frac=1).reset_index(drop=True)
# Get the first n=2*len(test) for test, the following m=2*len(validation) for validation and the rest for train:
test = pd.concat(
    [
        test,
        neagtive_samples[:2*len(test)]
    ]
).sample(frac=1).reset_index(drop=True)

print(f"Test with negative samples: {len(test)}")

# Make the dataset smaller so the kernel does not crash:
neagtive_samples = neagtive_samples[2*len(test):]

validation = pd.concat(
    [
        validation,
        neagtive_samples[:2*len(validation)]
    ]
).sample(frac=1).reset_index(drop=True)

print(f"Validation with negative samples: {len(validation)}")

# Make the dataset smaller so the kernel does not crash:
neagtive_samples = neagtive_samples[2*len(validation):]

# So the kernel crashes anyway so my plan is to just drop half of the negative_samples:
neagtive_samples = neagtive_samples.sample(frac=0.5)

# Try if the kernel crashes:
# Put in train the rest of the data
train = pd.concat(
    [
        train,
        neagtive_samples
    ]
).sample(frac=1).reset_index(drop=True)

print(f"Train with negative samples: {len(train)}")

Test with negative samples: 147741
Validation with negative samples: 147741
Train with negative samples: 30928629


In [None]:
# This code takes arround 4 hours to run:
# Make sure that we have created non-seen interactions indeed:
for u_id, m_id in zip(neagtive_samples["user_id"], neagtive_samples["movie_id"]):
    if len(train.loc[(train["user_id"] == u_id) & (train["movie_id"] == m_id)]) != 0:
        print("Intersection found! There are previously seen interactions in the code")

In [22]:
train.to_csv("train.csv")
validation.to_csv("validation.csv")
test.to_csv("test.csv")

### Recommender training

In [3]:
train = pd.read_csv("train.csv").drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
validation = pd.read_csv("validation.csv").drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
test = pd.read_csv("test.csv").drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)

In [4]:
# We need to get the number of ids for movies and users:
aux = pd.concat([train, validation, test])
max_user_id = aux["user_id"].max() + 1
max_movie_id = aux["movie_id"].max() + 1

# Free some memory:
del aux

In [5]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
seed = 123
np.random.seed(seed)
_ = torch.manual_seed(seed)
_ = torch.cuda.manual_seed(seed)

In [6]:
print(max_user_id)
print(max_movie_id)

49247
67015


In [7]:
features_train = torch.tensor(train.drop("seen", axis=1).values)
target_train = torch.tensor(train["seen"].values)
train_dataset = TensorDataset(features_train, target_train)

features_validation = torch.tensor(validation.drop("seen", axis=1).values)
target_validation = torch.tensor(validation["seen"].values)
validation_dataset = TensorDataset(features_validation, target_validation)

features_test = torch.tensor(test.drop("seen", axis=1).values)
target_test = torch.tensor(test["seen"].values)
test_dataset = TensorDataset(features_test, target_test)

In [8]:
hparams = {
    'batch_size': 500000,
    'num_epochs': 10,
    'lr': 0.1,
    'test_batch_size': 64,
    'embeding_size': 10
}

In [9]:
class Recommender(nn.Module):
    def __init__(self, user_count, movie_count, embeding_size):

        super().__init__()

        self.user_embeding = nn.Embedding(user_count, embeding_size)
        self.movie_embeding = nn.Embedding(movie_count, embeding_size)

        self.fc_u = nn.Linear(embeding_size, embeding_size)
        self.fc_m = nn.Linear(embeding_size, embeding_size)

        self.fc1 = nn.Linear(embeding_size * 2, 30)
        self.fc2 = nn.Linear(30, 10)
        self.fc3 = nn.Linear(10, 1)

        self.dropout = nn.Dropout(0.2)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, user_id, movie_id):
        
        user_id_emb = self.relu(self.fc_m(self.user_embeding(user_id)))
        movie_id_emb = self.relu(self.fc_u(self.movie_embeding(movie_id)))

        combined_embedings = torch.cat((user_id_emb, movie_id_emb), dim=1)

        x = self.relu(self.fc1(combined_embedings))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.sigmoid(self.fc3(x))

        return x

In [10]:
rec = Recommender(max_user_id, max_movie_id, hparams["embeding_size"])
rec = rec.to(device)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=hparams["batch_size"], shuffle=True)
# validation_loader = DataLoader(validation_dataset, batch_size=hparams["batch_size"], shuffle=True)

def train_epoch(
        train_loader: torch.utils.data.DataLoader,
        network: torch.nn.Module,
        optimizer: torch.optim,
        criterion: torch.nn.functional
        ) -> Tuple[float, float]:
    
        network.train()

        train_loss = []

        for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)

                optimizer.zero_grad()
                
                user_ids = data[:, 0]
                movie_ids = data[:, 1]

                output = network(user_ids, movie_ids)

                loss = criterion(output.squeeze(dim=1).float(), target.float())

                loss.backward()

                optimizer.step()

                train_loss.append(loss.item())

                if batch_idx % 100 != 0:
                        print("Batch " + str(batch_idx) + " loss")
                        print(loss.item())

        return np.mean(train_loss)

In [12]:
optimizer = torch.optim.Adam(rec.parameters(), lr=hparams["lr"])
criterion = nn.BCELoss()

for epoch in range(hparams["num_epochs"]):

    train_loss = train_epoch(train_loader, rec, optimizer, criterion)
    print("Epoch mean loss:" + str(train_loss))

Batch 1 loss
0.6588695645332336
Batch 2 loss
0.6666874885559082
Batch 3 loss
0.6411874890327454
Batch 4 loss
0.6098321676254272
Batch 5 loss
0.6048372387886047
Batch 6 loss
0.5625730752944946
Batch 7 loss
0.5356735587120056
Batch 8 loss
0.5106550455093384
Batch 9 loss
0.48722800612449646
Batch 10 loss
0.4740479290485382
Batch 11 loss
0.45099493861198425
Batch 12 loss
0.45001766085624695
Batch 13 loss
0.4346177279949188
Batch 14 loss
0.4321960210800171
Batch 15 loss
0.41984811425209045
Batch 16 loss
0.41895636916160583
Batch 17 loss
0.41194579005241394
Batch 18 loss
0.4086032509803772
Batch 19 loss
0.3984423875808716
Batch 20 loss
0.399302214384079
Batch 21 loss
0.3923700749874115
Batch 22 loss
0.3930634558200836
Batch 23 loss
0.3869635760784149
Batch 24 loss
0.38624343276023865
Batch 25 loss
0.38219335675239563
Batch 26 loss
0.3770248293876648
Batch 27 loss
0.37960579991340637
Batch 28 loss
0.37579604983329773
Batch 29 loss
0.3735434114933014
Batch 30 loss
0.37172985076904297
Batch 31 

KeyboardInterrupt: 

In [53]:
# torch.save(rec.state_dict(), 'first_recommender.pth')
rec = Recommender(max_user_id, max_movie_id, hparams["embeding_size"])
rec = rec.to(device)

rec.load_state_dict(torch.load('first_recommender.pth'))

<All keys matched successfully>

In [120]:
def test_epoch(loader: torch.utils.data.DataLoader, network: torch.nn.Module, prob_threshold: float):

    network.eval()
    acc = torch.tensor(0)

    for data, target in loader:
        user_ids = data[:, 0]
        movie_ids = data[:, 1]

        output = network(user_ids, movie_ids)

        predictions =  (output > prob_threshold).int()
        acc = acc + (predictions.squeeze() == target).sum() / (target.size(0))

    return acc

In [55]:
# Let's adjust the probability threshold using the validation dataset:

validation_loader = DataLoader(validation_dataset, batch_size=len(validation_dataset), shuffle=True)

for prob_threshold in np.arange(0.45, 0.55, 0.01):

    validation_acc = test_epoch(validation_loader, rec, 0.5)
    print(f"Threshold {prob_threshold} get's an accuracy of {validation_acc}")


Threshold 0.45 get's an accuracy of 0.8515036702156067
Threshold 0.46 get's an accuracy of 0.8515036702156067
Threshold 0.47000000000000003 get's an accuracy of 0.8515036702156067
Threshold 0.48000000000000004 get's an accuracy of 0.8515036702156067
Threshold 0.49000000000000005 get's an accuracy of 0.8515036702156067
Threshold 0.5 get's an accuracy of 0.8515036702156067
Threshold 0.51 get's an accuracy of 0.8515036702156067
Threshold 0.52 get's an accuracy of 0.8515036702156067
Threshold 0.53 get's an accuracy of 0.8515036702156067
Threshold 0.54 get's an accuracy of 0.8515036702156067
Threshold 0.55 get's an accuracy of 0.8515036702156067


### Evaluation for the test dataset:

In [40]:
train = pd.read_csv("train.csv").drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
validation = pd.read_csv("validation.csv").drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
test = pd.read_csv("test.csv").drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)

In [41]:
aux = pd.concat([train, validation, test])

# We need to store a list of all the movie unique ids for doing predictions later:
unique_movie_ids = set(aux["movie_id"].unique())

del aux
# Remake the aux dataframe to extract later the already seen movies by the user:
historical_data = pd.concat([train.loc[train["seen"] == 1], validation.loc[validation["seen"] == 1]])

In [42]:
# We need a function thet returns the movies that a user has already seen in the past (outsude of the test sample):
def already_seen_movies(hist_data: pd.DataFrame, user_id) -> set:
    """
    Returns a set of seen movies by the user
    """
    return set(hist_data.loc[hist_data["user_id"] == user_id, "movie_id"].tolist())

In [167]:
rec = Recommender(max_user_id, max_movie_id, hparams["embeding_size"])
rec = rec.to(device)

rec.load_state_dict(torch.load('first_recommender.pth'))

<All keys matched successfully>

In [None]:
# Eliminate the not seen movies in the test dataset. I don't know why I created negative
# samples for test dataset but for now we will not need them:
test = test.loc[test["seen"] == 1]

features_test = torch.tensor(test.drop("seen", axis=1).values)
target_test = torch.tensor(test["seen"].values)
test_dataset = TensorDataset(features_test, target_test)

test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True)

In [169]:
test_acc = 0
idx = 0

for data, target in test_loader:
    user_ids = data[:, 0]
    movie_ids = data[:, 1]

    # Create a TensorDataset in which one column contains the fixes id of the user we want to evaluate and 
    # the other one contains all the unique movies to calculate the probability for each one.
    for us_id, mo_id in zip(user_ids, movie_ids):
        
        # To control the state of the execution:
        idx += 0
        if idx % 10000 != 0:
            print(f"{idx} done!")

        seen_movies_by_user = already_seen_movies(historical_data, int(us_id))

        not_seen_movies = list(unique_movie_ids - seen_movies_by_user)

        # list of the repeated same user id and get the not_seen_movies list to a tensor:
        list_user_id = torch.tensor(len(not_seen_movies) * [int(us_id)])
        not_seen_movies = torch.tensor(not_seen_movies)

        # Pass the movie trough the network:
        output = rec(list_user_id, not_seen_movies).squeeze()

        # Get the movies with the highest probability:
        _ , recommendations_idx = torch.topk(output, k=20)

        # get the movie_id from the movies with the highest probability:
        recommended_movies_id = not_seen_movies[recommendations_idx]

        # Check if the last movie that the user saw is in the top 10 of recommended movies
        # by the network:
        if torch.any(recommended_movies_id == mo_id):
            test_acc += 1

test_acc = test_acc / len(test)
display(test_acc)

tensor([[44083, 56218],
        [17371, 51588],
        [48029, 52617],
        ...,
        [ 6797, 66871],
        [19209, 55533],
        [46219, 61010]])

0.03577882916725892

### Factorization machine

In [27]:
hparams_fm = {
    'batch_size': 500000,
    'num_epochs': 2,
    'lr': 0.1,
    'test_batch_size': 64,
    'embeding_size': 10
}

In [28]:
# Linear part of the equation
class FeaturesLinear(torch.nn.Module):

    def __init__(self, field_dims, output_dim=1):
        super().__init__()

        self.emb = torch.nn.Embedding(field_dims, output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        # self.fc(x).shape --> [batch_size, num_fields, 1]
        # torch.sum(self.fc(x), dim=1).shape --> ([batch_size, 1])
        return torch.sum(self.emb(x), dim=1) + self.bias

# FM part of the equation
class FM_operation(torch.nn.Module):

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        square_of_sum = torch.sum(x, dim=1) ** 2
        sum_of_square = torch.sum(x ** 2, dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix

class FactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of Factorization Machine.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        #############################
        # Exercice 1: Write code of the model with a usual embedding layer in Pytorch documentation. Then, check that in the 
        # forward pass, we are building the expression of factorization machines.           
        #############################

        self.linear = FeaturesLinear(field_dims)
        self.embedding = torch.nn.Embedding(field_dims, embed_dim)
        self.fm = FM_operation(reduce_sum=True)

        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs):
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        """
        out = self.linear(interaction_pairs) + self.fm(self.embedding(interaction_pairs))
        
        return out.squeeze(1)
        
    def predict(self, interactions, device):
        # return the score, inputs are numpy arrays, outputs are tensors
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        output_scores = self.forward(test_interactions)
        return output_scores

In [31]:
fm = FactorizationMachineModel(max_movie_id, hparams_fm["embeding_size"])
criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=fm.parameters(), lr=hparams_fm["lr"])

In [32]:
def train_epoch_fm(model, optimizer, data_loader, criterion, device):

    fm.train()
    total_loss =[]

    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        
        predictions = model(data)

        loss = criterion(predictions, target.float())
        
        loss.backward()

        optimizer.step()

        total_loss.append(loss.item())

        if batch_idx % 100 != 0:
            print("Batch " + str(batch_idx) + " loss:")
            print(loss.item())

    return np.mean(total_loss)

for epoch in range(hparams_fm["num_epochs"]):

    train_loss = train_epoch_fm(fm, optimizer, train_loader, criterion, device)
    print("Epoch mean loss:" + str(train_loss))

Batch 1 loss:
0.8380887508392334
Batch 2 loss:
0.780125617980957
Batch 3 loss:
0.7286313772201538
Batch 4 loss:
0.6780445575714111
Batch 5 loss:
0.6236633062362671
Batch 6 loss:
0.5658349394798279
Batch 7 loss:
0.4993305504322052
Batch 8 loss:
0.4377967119216919
Batch 9 loss:
0.38524094223976135
Batch 10 loss:
0.3538700342178345
Batch 11 loss:
0.34539833664894104
Batch 12 loss:
0.3427564203739166
Batch 13 loss:
0.3466753661632538
Batch 14 loss:
0.34502673149108887
Batch 15 loss:
0.3405351936817169
Batch 16 loss:
0.34026840329170227
Batch 17 loss:
0.33585408329963684
Batch 18 loss:
0.3327391743659973
Batch 19 loss:
0.3297637104988098
Batch 20 loss:
0.32661864161491394
Batch 21 loss:
0.321259468793869
Batch 22 loss:
0.3185984492301941
Batch 23 loss:
0.3152467608451843
Batch 24 loss:
0.31187722086906433
Batch 25 loss:
0.3091270625591278
Batch 26 loss:
0.30516594648361206
Batch 27 loss:
0.3023201823234558
Batch 28 loss:
0.3016773760318756
Batch 29 loss:
0.2987406551837921
Batch 30 loss:
0.

In [35]:
torch.save(fm.state_dict(), 'factorization_machine.pth')

In [36]:
# Eliminate the not seen movies in the test dataset. I don't know why I created negative
# samples for test dataset but for now we will not need them:
test = test.loc[test["seen"] == 1]

features_test = torch.tensor(test.drop("seen", axis=1).values)
target_test = torch.tensor(test["seen"].values)
test_dataset = TensorDataset(features_test, target_test)

test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=True)

In [47]:
test_acc = 0
idx = 0

for data, target in test_loader:

    user_ids = data[:, 0]
    movie_ids = data[:, 1]

    # Create a TensorDataset in which one column contains the fixes id of the user we want to evaluate and 
    # the other one contains all the unique movies to calculate the probability for each one.
    for us_id, mo_id in zip(user_ids, movie_ids):
        
        # To control the state of the execution:
        idx += 0
        if idx % 10000 != 0:
            print(f"{idx} done!")

        seen_movies_by_user = already_seen_movies(historical_data, int(us_id))

        not_seen_movies = list(unique_movie_ids - seen_movies_by_user)

        # list of the repeated same user id and get the not_seen_movies list to a tensor:
        list_user_id = torch.tensor(len(not_seen_movies) * [int(us_id)])
        not_seen_movies = torch.tensor(not_seen_movies)

        # Pass the movie trough the network:
        input = torch.cat((list_user_id.unsqueeze(1), not_seen_movies.unsqueeze(1)), dim=1)

        output = fm(input)

        # Get the movies with the highest probability:
        _ , recommendations_idx = torch.topk(output, k=20)

        # get the movie_id from the movies with the highest probability:
        recommended_movies_id = not_seen_movies[recommendations_idx]

        # Check if the last movie that the user saw is in the top 10 of recommended movies
        # by the network:
        if torch.any(recommended_movies_id == mo_id):
            test_acc += 1

test_acc = test_acc / len(test)
display(test_acc)

0.02153769095917856

Recommend top 10 most popular movies to everyone: