In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.nn import functional as F
import os
from pathlib import Path
from collections import defaultdict
import os
from argparse import Namespace
from joblib import dump, load
base_dir = Path('/Users/vinay/Projects/Recsys')
if not base_dir:
    base_dir = Path(os.getcwd())
from tqdm import tqdm

data_dir = base_dir/'data'/'archive'
store_dir = base_dir/'artifacts'
import wandb
from torch.utils.data import DataLoader as dl

- We have 20M `user-rating` data from 1995 to 2015.
- To avoid `cold-start` problem I choose to split each user data




# Step 1: I have taken a smaller size versions of the whole data  `partial`(with 1 lakh rows) and `sample`(few thousand rows). 


In [2]:
# This is how a full data-set looks
df = pd.read_csv(data_dir/'full_rating.csv')
df.head(4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07


In [3]:
dummy_data = {'userId':[2,3,1],'movieId':[0,1,2],'rating':[1.5,5.0,3.4]}
pd.DataFrame(dummy_data)

Unnamed: 0,userId,movieId,rating
0,2,0,1.5
1,3,1,5.0
2,1,2,3.4


## Code to split the data-sets.

Need not run as most likely you already have these

In [77]:

def reduce_sz(df,sz=10000):
    all_users = sorted(list(set(df['userId'])))
    data = []
    start = 0
    for user in tqdm(all_users):
        temp = df[df['userId'] == user]
        end = start+len(temp)
        temp = copy.deepcopy(temp.reindex(range(start,end)))
        start = end
        data.append(temp)
        if end > sz:
            break
    return pd.concat(data)
        

In [195]:
sample_df = reduce_sz(df)

  0%|                                       | 90/138493 [00:01<31:08, 74.07it/s]


In [198]:
part_df = reduce_sz(df,sz=1000000)

  5%|█▊                                   | 6742/138493 [01:28<28:56, 75.85it/s]


In [199]:
sample_df.to_csv(data_dir/'sample_rating.csv',index=False)
part_df.to_csv(data_dir/'partial_rating.csv',index=False)

# Splitting data into - train,valid,test

In [3]:
global seed 
import random
seed = 0
def set_seeds(seed=0):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # multi-GPU
set_seeds(seed)
    
def get_data(data_dir,split:list,mode,data_type,save_to_disk=False):
    #split = [0.6,0.5]
    #mode = 'random' or 'seq_aware'
    #save_to_disk = flag to specify if we also want to save the splitted data to disk to avoid future computation.
    # We are currently using 'random' split-> for each user, we take all the rows(movies he rated ) and randomly split
    #them between trn,vld,tst
    trn,vld,tst = defaultdict(list),defaultdict(list),defaultdict(list)
    df = pd.read_csv(data_dir/(data_type+'_'+'rating.csv'))
    #Reorder rows
    perm = random.sample(range(len(df)),len(df))
    df = df.iloc[perm].reset_index()
    df['date'] = pd.to_datetime(df['timestamp'])
    user_ids = set(df['userId'])
    for user in tqdm(user_ids):
        tmp = df[df['userId'] == user]
        if split == 'seq_aware':
            tmp = tmp.sort_values(by='date')
        sz = len(tmp)
        #splitting-ids
        t,v = int(split[0]*sz),int(split[0]*sz)+int((sz-int(split[0]*sz))*split[-1])
        trn_ids,vld_ids,tst_ids = slice(0,t),slice(t,v),slice(v,sz)
        
        trn['user_id'].extend(tmp[trn_ids]['userId'].tolist())
        trn['rating'].extend(tmp[trn_ids]['rating'].tolist())
        trn['movie_id'].extend(tmp[trn_ids]['movieId'].tolist())
        
        
        vld['user_id'].extend(tmp[vld_ids]['userId'].tolist())
        vld['rating'].extend(tmp[vld_ids]['rating'].tolist())
        vld['movie_id'].extend(tmp[vld_ids]['movieId'].tolist())
        
        tst['user_id'].extend(tmp[tst_ids]['userId'].tolist())
        tst['rating'].extend(tmp[tst_ids]['rating'].tolist())
        tst['movie_id'].extend(tmp[tst_ids]['movieId'].tolist())
    trn,vld,tst = pd.DataFrame(trn),pd.DataFrame(vld),pd.DataFrame(tst)
    if save_to_disk:
        trn.to_csv(data_dir/(data_type+'_'+mode+'_trn.csv'))
        vld.to_csv(data_dir/(data_type+'_'+mode+'_vld.csv'))
        tst.to_csv(data_dir/(data_type+'_'+mode+'_tst.csv'))
    return trn,vld,tst
        
    
        
            
            
            


    

Note : In the above split we might not have the situation where there exist some movies in test set that are not
trained(seen by anyone). We will obviously not be able to predict ratings there. This is clearly evident below.

In [240]:
t,v,tt = get_data(data_dir,[0.6,0.5],'random','sample')

100%|█████████████████████████████████████████| 91/91 [00:00<00:00, 1643.87it/s]


In [243]:
set(t['user_id'])== set(tt['user_id'])

True

In [246]:
len(set(tt['movie_id'])-set(t['movie_id']))/len(set(tt['movie_id']))

0.30165289256198347

We may be not able to use all this data.Let's make smaller data-sets

In [247]:
data_type = 'sample'
split = 'random'
t,v,tt = get_data(data_dir,[0.6,0.5],split,data_type,save_to_disk=True)

100%|█████████████████████████████████████████| 91/91 [00:00<00:00, 1564.59it/s]


In [248]:
data_type = 'partial'
t,v,tt = get_data(data_dir,[0.6,0.5],split,data_type,save_to_disk=True)

100%|██████████████████████████████████████| 6743/6743 [00:07<00:00, 921.97it/s]


In [211]:
len(set(t['user_id'])),len(set(t['movie_id']))

(6743, 12649)

In [213]:
assert set(t['user_id']) == set(tt['user_id'])
assert set(t['movie_id']) != set(tt['movie_id'])

In [249]:
len(set(tt['movie_id'])-set(t['movie_id']))/len(set(tt['movie_id']))

0.07414910858995137

Note that 7 percent of movies are not there in training data

## Testing on dummy data

In [6]:
class RecsysDataset(torch.utils.data.Dataset):
    def __init__(self,df,usr_dict=None,mov_dict=None):
        self.df = df
        self.usr_dict = usr_dict
        self.mov_dict = mov_dict
    def __getitem__(self,index):
        if self.usr_dict and self.mov_dict:
            return [self.usr_dict[int(self.df.iloc[index]['user_id'])],self.mov_dict[int(self.df.iloc[index]['movie_id'])]],self.df.iloc[index]['rating']
        else:
            return [int(self.df.iloc[index]['user_id']-1),int(self.df.iloc[index]['movie_id']-1)],self.df.iloc[index]['rating']
            
    def __len__(self):
        return len(self.df)
        

In [7]:
sample = pd.DataFrame({'user_id':[1,2,3,2,2,3,2,2],'movie_id':[1,2,3,3,3,2,1,1],'rating':[2.0,1.0,4.0,5.0,1.3,3.5,3.0,4.5]})
trn_ids = random.sample(range(8),4,)
valid_ids = [i for i in range(8) if i not in trn_ids]


In [8]:
import copy

In [9]:
sample_trn,sample_vld = copy.deepcopy(sample.iloc[trn_ids].reset_index()),copy.deepcopy(sample.iloc[valid_ids].reset_index())


In [10]:
sample_vld = RecsysDataset(sample_vld)
sample_trn = RecsysDataset(sample_trn)

In [11]:
sample_vld[0]


([1, 1], 1.0)

In [12]:
train_loader = torch.utils.data.DataLoader(sample_trn, batch_size=2, shuffle=True)

In [13]:
valid_loader = torch.utils.data.DataLoader(sample_vld, batch_size=2, shuffle=True)

In [24]:
for u,r in train_loader:
    #user,item = u
    print(f'user:{u[0]},item:{u[-1]} and rating:{r}')
    #print(u)
    break

user:tensor([1, 1]),item:tensor([2, 0]) and rating:tensor([5.0000, 4.5000], dtype=torch.float64)


In [25]:
embedding = nn.Embedding(3, 5)
a = torch.tensor([1,2,0])
embedding(a)

tensor([[ 0.8380, -0.7193, -0.4033, -0.5966,  0.1820],
        [-0.8567,  1.1006, -1.0712,  0.1227, -0.5663],
        [-2.1788,  0.5684, -1.0845, -1.3986,  0.4033]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
for ele in train

## Model

In [22]:
class NCF(nn.Module):
    
    def __init__(self,user_sz,item_sz,embd_sz,dropout_fac,min_r=0.0,max_r=5.0,alpha=0.5,with_variable_alpha=False):
        super().__init__()
        self.dropout_fac = dropout_fac
        self.user_embd_mtrx = nn.Embedding(user_sz,embd_sz)
        self.item_embd_mtrx = nn.Embedding(item_sz,embd_sz)
        #bias = torch.zeros(size=(user_sz, 1), requires_grad=True)
        self.h =  nn.Linear(embd_sz,1)
        self.fst_lyr = nn.Linear(embd_sz*2,embd_sz)
        self.snd_lyr = nn.Linear(embd_sz,embd_sz//2)
        self.thrd_lyr = nn.Linear(embd_sz//2,embd_sz//4)
        self.out_lyr = nn.Linear(embd_sz//4,1)
        self.alpha = torch.tensor(alpha)
        self.min_r,self.max_r = min_r,max_r
        if with_variable_alpha:
            self.alpha = torch.tensor(alpha,requires_grad=True)
        
    def forward(self,x):
        user_emd = self.user_embd_mtrx(x[0])
        item_emd = self.item_embd_mtrx(x[-1])
        #hadamard-product
        gmf = user_emd*item_emd
        gmf = self.h(gmf)
        
        
        mlp = torch.cat([user_emd,item_emd],dim=-1)
        mlp = self.out_lyr(F.relu(self.thrd_lyr(F.relu(self.snd_lyr(F.dropout(F.relu(self.fst_lyr(mlp)),p=self.dropout_fac))))))
        fac = torch.clip(self.alpha,min=0.0,max=1.0)
        out = fac*gmf+ (1-fac)*mlp
        out = torch.clip(out,min=self.min_r,max=self.max_r)
        return out
        
        

In [23]:
#does it work
model = NCF(3,3,4,0.5)
for u,r in train_loader:
    #user,item = u
    print(f'user:{u[0]},item:{u[-1]} and rating:{r}')
    #print(u)
    out = model(u)
    print(f'output of the network=> out:{out},shape:{out.shape}')
    break

user:tensor([1, 0]),item:tensor([2, 0]) and rating:tensor([5., 2.], dtype=torch.float64)
output of the network=> out:tensor([[0.3493],
        [0.0404]], grad_fn=<ClampBackward1>),shape:torch.Size([2, 1])


## Trainer Class

In [24]:
class Trainer(object):
    def __init__(self, model, device,loss_fn=None, optimizer=None, scheduler=None,artifacts_loc=None,exp_tracker=None):

        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.store_loc = artifacts_loc
        self.exp_tracker = exp_tracker

    def train_step(self, dataloader):
        """Train step."""
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):
            #batch = [item.to(self.device) for item in batch]  # Set device
            inputs,targets = batch
            inputs = [item.to(self.device) for item in inputs]
            targets = targets.to(self.device)
            #inputs, targets = batch[:-1], batch[-1]
            #import pdb;pdb.set_trace()
            self.optimizer.zero_grad()  # Reset gradients
            z = self.model(inputs)  # Forward pass
            targets = targets.reshape(z.shape)
            J = self.loss_fn(z.float(), targets.float())  # Define loss
            J.backward()  # Backward pass
            self.optimizer.step()  # Update weights

            # Cumulative Metrics
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step."""
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):
                inputs,y_true = batch
                inputs = [item.to(self.device) for item in inputs]
                y_true = y_true.to(self.device).float()

                # Step
                z = self.model(inputs).float()  # Forward pass
                y_true = y_true.reshape(z.shape)
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = z.cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step."""
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs
                inputs, targets = batch
                z = self.model(inputs).float()

                # Store outputs
                y_prob = z.cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)
    
    def train(self, num_epochs, patience, train_dataloader, val_dataloader, 
              tolerance=1e-5):
        best_val_loss = np.inf
        training_stats = defaultdict(list)
        for epoch in tqdm(range(num_epochs)):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            #store stats
            training_stats['epoch'].append(epoch)
            training_stats['train_loss'].append(train_loss)
            training_stats['val_loss'].append(val_loss)
            #log-stats
            if self.exp_tracker == 'wandb':
                log_metrics = {'epoch':epoch,'train_loss':train_loss,'val_loss':val_loss}
                wandb.log(log_metrics,step=epoch)
            
            self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss - tolerance:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Tracking
            #mlflow.log_metrics({"train_loss": train_loss, "val_loss": val_loss}, step=epoch)

            # Logging
            if epoch%5 == 0:
                print(
                    f"Epoch: {epoch+1} | "
                    f"train_loss: {train_loss:.5f}, "
                    f"val_loss: {val_loss:.5f}, "
                    f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                    f"_patience: {_patience}"
                )
        if self.store_loc:
            pd.DataFrame(training_stats).to_csv(self.store_loc/'training_stats.csv',index=False)
        return best_model, best_val_loss

In [25]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=5)

trainer = Trainer(model,'cpu',loss_fn,optimizer,scheduler)

In [31]:
trainer.train(100,10,train_loader,valid_loader)

100%|████████████████████████████████████████| 100/100 [00:00<00:00, 443.39it/s]

Epoch: 1 | train_loss: 11.97598, val_loss: 6.06628, lr: 1.00E-03, _patience: 10
Epoch: 11 | train_loss: 11.71282, val_loss: 5.94650, lr: 1.00E-03, _patience: 10
Epoch: 21 | train_loss: 11.45816, val_loss: 5.83341, lr: 1.00E-03, _patience: 10
Epoch: 31 | train_loss: 11.21199, val_loss: 5.72781, lr: 1.00E-03, _patience: 10
Epoch: 41 | train_loss: 10.96701, val_loss: 5.62273, lr: 1.00E-03, _patience: 10
Epoch: 51 | train_loss: 10.70814, val_loss: 5.51317, lr: 1.00E-03, _patience: 9
Epoch: 61 | train_loss: 10.41593, val_loss: 5.38095, lr: 1.00E-03, _patience: 10
Epoch: 71 | train_loss: 10.07872, val_loss: 5.30314, lr: 1.00E-03, _patience: 9
Epoch: 81 | train_loss: 9.57599, val_loss: 5.07092, lr: 1.00E-03, _patience: 10
Epoch: 91 | train_loss: 9.02137, val_loss: 4.92003, lr: 1.00E-03, _patience: 8





(NCF(
   (user_embd_mtrx): Embedding(3, 4)
   (item_embd_mtrx): Embedding(3, 4)
   (h): Linear(in_features=4, out_features=1, bias=True)
   (fst_lyr): Linear(in_features=8, out_features=4, bias=True)
   (snd_lyr): Linear(in_features=4, out_features=2, bias=True)
   (thrd_lyr): Linear(in_features=2, out_features=1, bias=True)
   (out_lyr): Linear(in_features=1, out_features=1, bias=True)
 ),
 4.7468710243701935)

The code -> Network+trainingloop works. Let's write experiment tracking system.

In [26]:
def run(args):
    #main function that handles everything.
    config_dict = vars(args)
    if args.exp_tracker == 'wandb':
        wandb.init(project=f"{args.trail_id}_{args.dataset}_{args.data_type}",config=config_dict)
        
    base_dir = Path(args.base_dir)
    if not args.base_dir:
        base_dir = Path(os.getcwd())
        
    if args.dataset == 'movielens':
        data_dir = base_dir/'data'/'archive'
    else:
        Print('Unknown dataset')
        exit()
 
    store_dir = base_dir/'artifacts'
    base_dir = args.base_dir
    os.makedirs(store_dir,exist_ok=True)
    
    #Check if all the necessary data is already there
    is_data_premade = True
    for d_type in args.on:
        file_name = args.data_type+'_'+args.split_type+'_'+d_type+'.csv'
        if file_name not in os.listdir(data_dir):
            is_data_premade=False
   
    #load the dataset
    df = pd.read_csv(data_dir/(args.data_type+'_'+'rating.csv'))
    mov_dict = dict(zip(sorted(set(df['movieId'])),range(len(sorted(set(df['movieId']))))))
    usr_dict = dict(zip(sorted(set(df['userId'])),range(len(sorted(set(df['userId']))))))
        
    if not is_data_premade:
        pass
        #get-data
        #?
        #trn_df,vld_df,tst_df = get_data()
    else:
        #load the csv's
        trn_file= args.data_type+'_'+args.split_type+'_'+'trn'+'.csv'
        vld_file = args.data_type+'_'+args.split_type+'_'+'vld'+'.csv'
        tst_file = args.data_type+'_'+args.split_type+'_'+'tst'+'.csv'
        trn_df,vld_df,tst_df = pd.read_csv(data_dir/trn_file),pd.read_csv(data_dir/vld_file),pd.read_csv(data_dir/tst_file)
        
    train_artifacts_loc = store_dir/(str(args.trail_id)+'_'+args.dataset+'_'+args.data_type)
    exp_tracker = args.exp_tracker
    if args.mode == 'train':
         
        #train-from scratch
        trn = RecsysDataset(trn_df,usr_dict,mov_dict)
        vld = RecsysDataset(vld_df,usr_dict,mov_dict)
        tst = RecsysDataset(tst_df,usr_dict,mov_dict)
        
        trn_dl,vld_dl,tst_dl = dl(trn, batch_size=args.batch_size, shuffle=True),dl(vld, batch_size=args.batch_size),dl(tst, batch_size=args.batch_size)
        
        #get model
        emb_len_usr = len(set(df['userId']))
        emb_len_itm = len(set(df['movieId']))
        model = NCF(emb_len_usr,emb_len_itm,args.embd_sz,args.dropout_p,alpha=args.alpha,with_variable_alpha=args.with_variable_alpha)
        
        device = 'cpu'
        loss_fn = nn.MSELoss()
        # Define optimizer & scheduler
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.1, patience=5)
        
        
        os.makedirs(train_artifacts_loc,exist_ok=True)
        
        #train
        trainer = Trainer(model,'cpu',loss_fn,optimizer,scheduler,artifacts_loc=train_artifacts_loc,exp_tracker=exp_tracker)
        best_model, best_val_loss = trainer.train(args.num_epochs,args.patience,trn_dl,vld_dl)
        #save the model
        PATH = train_artifacts_loc/'model.pth'
        torch.save(best_model.state_dict(), PATH)
        
        test_ratings = tst_df['rating']
        #loss, np.vstack(y_trues), np.vstack(y_probs)
        tst_loss,y_trues,y_predict = trainer.eval_step(tst_dl)
        test_rslts = pd.DataFrame({'y_true':y_trues.squeeze(),'y_pred':y_predict.squeeze()},index=range(len(y_trues)))
        
        #also do for train-dataset
        trn_loss,y_trues_trn,y_predict_trn = trainer.eval_step(trn_dl)
        trn_rslts = pd.DataFrame({'y_true':y_trues_trn.squeeze(),'y_pred':y_predict_trn.squeeze()},index=range(len(y_trues_trn)))
        
        #also do for valid-dataset
        vld_loss,y_trues_vld,y_predict_vld = trainer.eval_step(trn_dl)
        vld_rslts = pd.DataFrame({'y_true':y_trues_vld.squeeze(),'y_pred':y_predict_vld.squeeze()},index=range(len(y_trues_vld)))
        
        #save test predictions
        test_rslts.to_csv(str(train_artifacts_loc/('inference'+'_'+args.split_type+'_tst'+'.csv')))
        #save trn predictions
        trn_rslts.to_csv(str(train_artifacts_loc/('inference'+'_'+args.split_type+'_trn'+'.csv')))
        vld_rslts.to_csv(str(train_artifacts_loc/('inference'+'_'+args.split_type+'_vld'+'.csv')))
        dump(args,train_artifacts_loc/'args.joblib')
        
    if args.mode == 'inference':
        #? This code is not written yet - buggy
        losses = {}
        for split_type in args.on:
            #'trn','vld','tst'
            if split_type == 'trn':
                data = RecsysDataset(trn_df)
            elif split_type == 'vld':
                data = RecsysDataset(vld_df)
            elif split_type == 'tst':
                data = RecsysDataset(tst_df)
            data_dl = dl(data, batch_size=args.batch_size)
            #?load it from the path
            dir_loc = store_dir/(str(trail_id)+'_'+dataset+'_'+data_type)
            #model  = 
            trainer = Trainer(model,'cpu',loss_fn,optimizer,scheduler,artifacts_loc=train_artifacts_loc)
            tst_loss,y_trues,y_predict = trainer.eval_step(data_dl)
            test_rslts = pd.DataFrame({'y_true':y_trues.squeeze(),'y_pred':y_predict})
            
            losses[split_type] = tst_loss
            test_rslts.to_csv(str(dir_loc)/('inference'+'_'+split_type+'.csv'))
                
     
                
                
        
        
        
        
        
        
        



        


In [39]:
df_part = pd.read_csv(data_dir/'partial_rating.csv')

In [46]:
f"dataset has {len(set(df_part['movieId']))} movies and {len(set(df_part['userId']))} users"

'dataset has 13950 movies and 6743 users'

Below I we will study 3 models and their performance

- alpha == 1,CONSTANT ALPHA -> GMF (1)
- alpha == 0,CONSTANT ALPHA -> MLP(2)
- alpha == 0.5,Constant alpha -> equal contribution(3)
- alpha == 0.5,variable alpha-> dynamically figure out the value of optimal value(4)

In [27]:
#alpha == 1,CONSTANT ALPHA -> GMF (trail_id=1)

#mode = 'train','inference'
#data_type = 'sample','full','partial'

args_1 = Namespace(
    exp_tracker = 'wandb',
    base_dir = '/Users/vinay/Projects/Recsys',
    model_path = '',
    trail_id = 1,
    dataset = 'movielens',
    mode = 'train',
    on = ['trn','vld','tst'],
    alpha=1.0,
    with_variable_alpha=False,
    data_type = 'partial',
    split_type = 'random',
    split = [0.6,0.5],
    embd_sz = 32,
    batch_size=64, 
    dropout_p=0.5,
    lr=2e-4,
    num_epochs=100,
    patience=10
)

args_2 = Namespace(
    exp_tracker = 'wandb',
    base_dir = '/Users/vinay/Projects/Recsys',
    model_path = '',
    trail_id = 2,
    dataset = 'movielens',
    mode = 'train',
    on = ['trn','vld','tst'],
    alpha=0.0,
    with_variable_alpha=False,
    data_type = 'partial',
    split_type = 'random',
    split = [0.6,0.5],
    embd_sz = 32,
    batch_size=64, 
    dropout_p=0.5,
    lr=2e-4,
    num_epochs=100,
    patience=10
)

args_3 = Namespace(
    exp_tracker = 'wandb',
    base_dir = '/Users/vinay/Projects/Recsys',
    model_path = '',
    trail_id = 3,
    dataset = 'movielens',
    mode = 'train',
    on = ['trn','vld','tst'],
    alpha=0.5,
    with_variable_alpha=False,
    data_type = 'partial',
    split_type = 'random',
    split = [0.6,0.5],
    embd_sz = 32,
    batch_size=64, 
    dropout_p=0.5,
    lr=2e-4,
    num_epochs=100,
    patience=10
)


args_4 = Namespace(
    exp_tracker = 'wandb',
    base_dir = '/Users/vinay/Projects/Recsys',
    model_path = '',
    trail_id = 4,
    dataset = 'movielens',
    mode = 'train',
    on = ['trn','vld','tst'],
    alpha=0.5,
    with_variable_alpha=True,
    data_type = 'partial',
    split_type = 'random',
    split = [0.6,0.5],
    embd_sz = 32,
    batch_size=128, 
    dropout_p=0.5,
    lr=2e-4,
    num_epochs=100,
    patience=10
)

In [28]:
args_1

Namespace(alpha=1.0, base_dir='/Users/vinay/Projects/Recsys', batch_size=64, data_type='partial', dataset='movielens', dropout_p=0.5, embd_sz=32, exp_tracker='wandb', lr=0.0002, mode='train', model_path='', num_epochs=100, on=['trn', 'vld', 'tst'], patience=10, split=[0.6, 0.5], split_type='random', trail_id=1, with_variable_alpha=False)

In [29]:
run(args_1)

[34m[1mwandb[0m: Currently logged in as: [33mvin136[0m (use `wandb login --relogin` to force relogin)


  1%|▍                                       | 1/100 [01:49<3:00:54, 109.64s/it]

Epoch: 1 | train_loss: 8.38704, val_loss: 5.54270, lr: 2.00E-04, _patience: 10


  6%|██▍                                     | 6/100 [10:39<2:46:06, 106.03s/it]

Epoch: 6 | train_loss: 1.10523, val_loss: 1.11294, lr: 2.00E-04, _patience: 7


 11%|████▎                                  | 11/100 [19:26<2:36:31, 105.52s/it]

Epoch: 11 | train_loss: 1.09611, val_loss: 1.12085, lr: 2.00E-05, _patience: 2


 12%|████▋                                  | 12/100 [22:58<2:48:27, 114.86s/it]

Stopping early!





In [30]:
run(args_2)
run(args_3)
run(args_4)

0,1
epoch,▁▂▂▃▃▄▅▅▆▆▇▇█
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,12.0
train_loss,1.09541
val_loss,1.12162


  1%|▍                                      | 1/100 [10:39<17:34:22, 639.01s/it]

Epoch: 1 | train_loss: 1.61496, val_loss: 1.08814, lr: 2.00E-04, _patience: 10


  6%|██▍                                     | 6/100 [20:17<3:42:25, 141.97s/it]

Epoch: 6 | train_loss: 0.83313, val_loss: 0.83841, lr: 2.00E-04, _patience: 10


 11%|████▎                                  | 11/100 [29:11<2:46:55, 112.54s/it]

Epoch: 11 | train_loss: 0.76910, val_loss: 0.79568, lr: 2.00E-04, _patience: 10


 16%|██████▏                                | 16/100 [38:04<2:30:21, 107.40s/it]

Epoch: 16 | train_loss: 0.74284, val_loss: 0.77361, lr: 2.00E-04, _patience: 10


 21%|████████▏                              | 21/100 [46:55<2:20:07, 106.43s/it]

Epoch: 21 | train_loss: 0.72822, val_loss: 0.76295, lr: 2.00E-04, _patience: 10


 26%|██████████▏                            | 26/100 [55:50<2:11:42, 106.79s/it]

Epoch: 26 | train_loss: 0.71906, val_loss: 0.76202, lr: 2.00E-04, _patience: 8


 31%|███████████▍                         | 31/100 [1:31:41<6:04:45, 317.19s/it]

Epoch: 31 | train_loss: 0.71264, val_loss: 0.75723, lr: 2.00E-04, _patience: 10


 36%|█████████████▎                       | 36/100 [1:40:36<2:31:39, 142.18s/it]

Epoch: 36 | train_loss: 0.70780, val_loss: 0.75592, lr: 2.00E-04, _patience: 7


 41%|███████████████▏                     | 41/100 [1:49:31<1:51:07, 113.01s/it]

Epoch: 41 | train_loss: 0.70361, val_loss: 0.75714, lr: 2.00E-04, _patience: 6


 46%|█████████████████                    | 46/100 [1:58:26<1:37:06, 107.89s/it]

Epoch: 46 | train_loss: 0.70077, val_loss: 0.76179, lr: 2.00E-04, _patience: 8


 51%|██████████████████▊                  | 51/100 [2:22:00<3:58:34, 292.12s/it]

Epoch: 51 | train_loss: 0.69142, val_loss: 0.75424, lr: 2.00E-05, _patience: 3


 56%|████████████████████▋                | 56/100 [2:31:02<1:42:19, 139.54s/it]

Epoch: 56 | train_loss: 0.68999, val_loss: 0.75318, lr: 2.00E-05, _patience: 6


 61%|██████████████████████▌              | 61/100 [2:40:02<1:13:35, 113.21s/it]

Epoch: 61 | train_loss: 0.68844, val_loss: 0.75427, lr: 2.00E-06, _patience: 1


 61%|██████████████████████▌              | 61/100 [2:41:50<1:43:28, 159.18s/it]

Stopping early!





0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,61.0
train_loss,0.68889
val_loss,0.7533


  1%|▍                                       | 1/100 [01:46<2:55:23, 106.30s/it]

Epoch: 1 | train_loss: 1.73742, val_loss: 1.09607, lr: 2.00E-04, _patience: 10


  2%|▊                                       | 2/100 [03:33<2:54:18, 106.72s/it]wandb: Network error (ConnectionError), entering retry loop.
  6%|██▏                                  | 6/100 [1:06:11<17:13:12, 659.50s/it]

Epoch: 6 | train_loss: 0.83325, val_loss: 0.83875, lr: 2.00E-04, _patience: 10


 11%|████                                 | 11/100 [1:15:13<4:44:08, 191.56s/it]

Epoch: 11 | train_loss: 0.76169, val_loss: 0.78499, lr: 2.00E-04, _patience: 10


 16%|█████▉                               | 16/100 [1:24:18<2:51:54, 122.80s/it]

Epoch: 16 | train_loss: 0.73569, val_loss: 0.76714, lr: 2.00E-04, _patience: 10


 21%|███████▊                             | 21/100 [1:33:15<2:24:30, 109.76s/it]

Epoch: 21 | train_loss: 0.71912, val_loss: 0.75746, lr: 2.00E-04, _patience: 10


 26%|█████████▌                           | 26/100 [1:42:13<2:13:03, 107.88s/it]

Epoch: 26 | train_loss: 0.71021, val_loss: 0.75357, lr: 2.00E-04, _patience: 10


 31%|███████████▍                         | 31/100 [1:51:05<2:02:33, 106.58s/it]

Epoch: 31 | train_loss: 0.70306, val_loss: 0.75139, lr: 2.00E-04, _patience: 8


 36%|█████████████▎                       | 36/100 [2:03:25<2:59:51, 168.61s/it]

Epoch: 36 | train_loss: 0.69734, val_loss: 0.74907, lr: 2.00E-04, _patience: 9


 41%|███████████████▏                     | 41/100 [2:12:19<1:55:23, 117.34s/it]

Epoch: 41 | train_loss: 0.69165, val_loss: 0.74875, lr: 2.00E-05, _patience: 4


 46%|█████████████████                    | 46/100 [2:21:15<1:37:50, 108.71s/it]

Epoch: 46 | train_loss: 0.68290, val_loss: 0.74689, lr: 2.00E-05, _patience: 8


 51%|██████████████████▊                  | 51/100 [2:30:05<1:26:55, 106.45s/it]

Epoch: 51 | train_loss: 0.68166, val_loss: 0.74721, lr: 2.00E-06, _patience: 8


 56%|████████████████████▋                | 56/100 [2:39:05<1:19:06, 107.88s/it]

Epoch: 56 | train_loss: 0.68133, val_loss: 0.74757, lr: 2.00E-06, _patience: 8


 61%|██████████████████████▌              | 61/100 [2:48:02<1:09:41, 107.22s/it]

Epoch: 61 | train_loss: 0.68093, val_loss: 0.74607, lr: 2.00E-06, _patience: 10


 66%|████████████████████████▍            | 66/100 [2:56:59<1:00:45, 107.23s/it]

Epoch: 66 | train_loss: 0.68127, val_loss: 0.74638, lr: 2.00E-06, _patience: 5


 70%|█████████████████████████▉           | 70/100 [3:05:57<1:19:41, 159.39s/it]

Stopping early!





0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▇▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,70.0
train_loss,0.6812
val_loss,0.74726


  1%|▍                                        | 1/100 [01:36<2:38:51, 96.27s/it]

Epoch: 1 | train_loss: 2.34668, val_loss: 1.14362, lr: 2.00E-04, _patience: 10


  6%|██▍                                      | 6/100 [09:39<2:31:07, 96.46s/it]

Epoch: 6 | train_loss: 0.90191, val_loss: 0.89937, lr: 2.00E-04, _patience: 10


 11%|████▍                                   | 11/100 [17:36<2:21:47, 95.59s/it]

Epoch: 11 | train_loss: 0.79898, val_loss: 0.81825, lr: 2.00E-04, _patience: 10


 16%|██████▍                                 | 16/100 [25:34<2:13:45, 95.54s/it]

Epoch: 16 | train_loss: 0.76106, val_loss: 0.79204, lr: 2.00E-04, _patience: 10


 21%|████████▍                               | 21/100 [33:32<2:05:46, 95.53s/it]

Epoch: 21 | train_loss: 0.74149, val_loss: 0.77620, lr: 2.00E-04, _patience: 10


 26%|██████████▍                             | 26/100 [41:30<1:57:55, 95.62s/it]

Epoch: 26 | train_loss: 0.73098, val_loss: 0.77035, lr: 2.00E-04, _patience: 10


 31%|████████████                           | 31/100 [54:57<3:43:09, 194.05s/it]

Epoch: 31 | train_loss: 0.72467, val_loss: 0.76732, lr: 2.00E-04, _patience: 10


 36%|█████████████▎                       | 36/100 [1:03:08<2:02:20, 114.69s/it]

Epoch: 36 | train_loss: 0.71883, val_loss: 0.76680, lr: 2.00E-04, _patience: 7


 41%|███████████████▏                     | 41/100 [1:11:16<1:38:39, 100.32s/it]

Epoch: 41 | train_loss: 0.71447, val_loss: 0.76860, lr: 2.00E-04, _patience: 9


 46%|█████████████████▍                    | 46/100 [1:19:25<1:28:57, 98.84s/it]

Epoch: 46 | train_loss: 0.70849, val_loss: 0.76384, lr: 2.00E-04, _patience: 8


 51%|██████████████████▊                  | 51/100 [1:28:19<1:25:57, 105.26s/it]

Epoch: 51 | train_loss: 0.69951, val_loss: 0.76228, lr: 2.00E-05, _patience: 10


 56%|████████████████████▋                | 56/100 [1:37:10<1:17:52, 106.19s/it]

Epoch: 56 | train_loss: 0.69867, val_loss: 0.76240, lr: 2.00E-05, _patience: 7


 61%|██████████████████████▌              | 61/100 [1:46:05<1:09:39, 107.17s/it]

Epoch: 61 | train_loss: 0.69753, val_loss: 0.76207, lr: 2.00E-05, _patience: 9


 66%|██████████████████████████▍             | 66/100 [1:54:24<56:30, 99.71s/it]

Epoch: 66 | train_loss: 0.69737, val_loss: 0.76260, lr: 2.00E-05, _patience: 7


 71%|████████████████████████████▍           | 71/100 [2:02:25<46:46, 96.76s/it]

Epoch: 71 | train_loss: 0.69667, val_loss: 0.76193, lr: 2.00E-06, _patience: 2


 72%|████████████████████████████           | 72/100 [2:05:42<48:53, 104.76s/it]

Stopping early!





## To DO-> COMPARE TEST PERFORMANCE

In [47]:
out = pd.read_csv('/Users/vinay/Projects/Recsys/artifacts/0_movielens_sample/inference_random_tst.csv')

In [48]:
out

Unnamed: 0.1,Unnamed: 0,y_true,y_pred
0,0,4.0,4.289770
1,1,3.5,4.038643
2,2,4.0,4.109258
3,3,3.5,3.687514
4,4,3.5,4.484389
...,...,...,...
2088,2088,2.5,3.660824
2089,2089,0.5,3.312373
2090,2090,3.5,3.657814
2091,2091,4.0,3.827990


- IF we recommend based on `y_pred`(define some notion of `high` and `low` rating..say all those that are greater than some value)and compare between various models.
- A good way to choose this threshold is use `valid` results and choose the metric.
- other simple way is just look at the squared error on test set and compare. 