## Denoising AutoEncoder

### Import Packages

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from scipy import sparse
from tqdm import tqdm
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader 
import torch
import torch.nn.functional as F

### Loading data

In [2]:
# Held-out validation ratings; the evaluation code further down reads its
# userId, movieId and rating columns.
df_val = pd.read_csv('df_val.csv')

In [3]:
# Sparse training utility matrix; its entries are compared against 1 and -1
# when the per-user index lists are built.
R = load_npz('R_train.npz')

In [4]:
# Rows of R are treated as users and columns as movies.
n_users,n_movies = R.shape

### Prepare data for PyTorch model

In [5]:
class MovieDatasetUsers(Dataset):
    """Dataset serving one dense row of a sparse utility matrix per sample."""

    def __init__(self, utility_matrix):
        # The matrix is stored as given; rows are densified only on access.
        self.utility = utility_matrix

    def __len__(self):
        """Number of rows in the utility matrix."""
        return self.utility.shape[0]

    def __getitem__(self, idx):
        """Return row ``idx`` as a 1-D ``float32`` tensor."""
        row_block = self.utility[[idx], :]
        dense_row = row_block.toarray()[0]
        return torch.tensor(dense_row, dtype=torch.float32)

In [6]:
# Wrap the training matrix so that each sample is one row of R as a dense tensor.
train_dataset = MovieDatasetUsers(R)

In [7]:
# Mini-batches of user vectors, reshuffled on every pass over the dataset.
BATCH_SIZE = 128
train_loader = DataLoader(dataset=train_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         )

### Construct accuracy function

In [8]:
# Per-user lookup lists (position i belongs to user i), filled by the loop
# over users that follows.
one_idxs=[]  # training columns whose entry equals 1
minus_one_idxs=[]  # training columns whose entry equals -1
hold_out=[]  # validation movieIds with rating 1
hold_out_minus=[]  # validation movieIds with rating -1

In [9]:
# Group the validation ratings by user once, instead of re-scanning the whole
# frame with DataFrame.query twice per user.
_no_movies = df_val.movieId.values[:0]  # empty array with the movieId dtype
_val_liked = {
    uid: ids.values
    for uid, ids in df_val[df_val.rating == 1].groupby('userId').movieId
}
_val_disliked = {
    uid: ids.values
    for uid, ids in df_val[df_val.rating == -1].groupby('userId').movieId
}

for i in range(n_users):
    # Densify the user's row a single time and reuse it for both lookups.
    user_row = R[[i],:].toarray()[0]
    one_idxs.append(np.where(user_row == 1)[0])
    minus_one_idxs.append(np.where(user_row == -1)[0])
    # Users without validation ratings of a given sign get an empty array.
    hold_out.append(_val_liked.get(i, _no_movies))
    hold_out_minus.append(_val_disliked.get(i, _no_movies))

In [10]:
def accuracy_func(model,k=10):
    """Score the model's top-``k`` recommendations on the validation ratings.

    For every user the model is fed that user's training vector. Items the
    user already rated in training (1 or -1) are masked with ``-inf`` so they
    cannot be recommended, and the ``k`` highest-scoring remaining items are
    compared with the validation set. The per-user score is
    ``max(0, (c - nc) / k)``, where ``c`` counts top-k items held out with
    rating 1 and ``nc`` counts top-k items held out with rating -1 (the
    original notebook labels this HR@K).

    Reads the notebook globals ``n_users``, ``train_dataset``, ``device``,
    ``one_idxs``, ``minus_one_idxs``, ``hold_out`` and ``hold_out_minus``.
    The model's train/eval mode is not changed here; the training loop calls
    ``model.eval()`` before invoking this function.

    Args:
        model: callable mapping a ``(1, n_movies)`` tensor to item scores.
        k: number of recommendations considered per user.

    Returns:
        Mean of the per-user scores.
    """
    accuracy = []
    # Evaluation only, so skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for i in range(n_users):
            user_vec = train_dataset[i].to(torch.float32).to(device)
            output = model(torch.unsqueeze(user_vec, dim=0))
            output = output.to('cpu').detach().numpy()[0]
            # Never recommend something the user has already rated.
            np.put(output, one_idxs[i], -np.inf)
            np.put(output, minus_one_idxs[i], -np.inf)
            # Rank once and reuse the top-k for both intersections.
            top_k = np.argsort(output)[::-1][:k]
            c = len(np.intersect1d(top_k, hold_out[i]))
            nc = len(np.intersect1d(top_k, hold_out_minus[i]))
            accuracy.append(np.max([0, (c - nc) / k]))
    return np.mean(accuracy)

### DAE model

In [11]:
class Encoder(nn.Module):
    """Encoder half of the denoising autoencoder.

    Each input row is L2-normalised, corrupted with dropout, and then passed
    through two tanh-activated linear layers
    (``input_dim -> hidden_dim -> latent_dim``).
    """

    def __init__(self, input_dim, hidden_dim, latent_dim,dropout_rate=0.5):
        super().__init__()

        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=latent_dim)
        self.activation = nn.Tanh()
        # Dropout on the input is the "noise" of the denoising autoencoder.
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self,x):
        """Return the latent code for a batch of input rows."""
        corrupted = self.dropout(F.normalize(x, p=2.0, dim=1))
        hidden = self.activation(self.fc1(corrupted))
        return self.activation(self.fc_out(hidden))

In [12]:
class Decoder(nn.Module):
    """Decoder half of the denoising autoencoder.

    Maps a latent code back to item space:
    ``latent_dim -> hidden_dim`` with tanh, then a linear ``hidden_dim ->
    input_dim`` layer with no output activation.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()

        self.fc1 = nn.Linear(in_features=latent_dim, out_features=hidden_dim)
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=input_dim)
        self.activation = nn.Tanh()
        # Not referenced by forward(); kept so the module's attributes are unchanged.
        self.tanh = nn.Tanh()

    def forward(self,z):
        """Return the unbounded reconstruction scores for latent codes ``z``."""
        hidden = self.fc1(z)
        hidden = self.activation(hidden)
        return self.fc_out(hidden)

In [13]:
class DAE(nn.Module):
    """Denoising autoencoder: encoder, dropout on the latent code, decoder.

    Args:
        input_dim: size of the input and of the reconstruction.
        hidden_dim: width of the hidden layer in both encoder and decoder.
        latent_dim: size of the latent code.
        dropout_rate: dropout probability used both inside the encoder and on
            the latent code. Defaults to 0.5, the value that was previously
            hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim, dropout_rate=0.5):
        super().__init__()

        self.encoder = Encoder(input_dim, hidden_dim, latent_dim,dropout_rate=dropout_rate)
        self.decoder = Decoder(input_dim, hidden_dim, latent_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self,x):
        """Encode ``x``, apply dropout to the latent code, and decode it."""
        z = self.encoder(x)
        x_rec = self.decoder(self.dropout(z))

        return x_rec

In [14]:
# Multinomial loss function for implementing MultiDAE
class MultinomialLoss(nn.Module):
    """Negative multinomial log-likelihood of ``x`` under ``softmax(x_rec)``.

    Computes ``-mean_over_rows(sum_over_items(log_softmax(x_rec) * x))``.
    The softmax and the sum are both taken over the last axis, so the result
    for 2-D ``(batch, items)`` input is the same as before, and the two
    reductions can no longer disagree on other shapes.
    """

    def __init__(self):
        super().__init__()

    def forward(self,x_rec,x):
        """Return the scalar loss for scores ``x_rec`` and targets ``x``."""
        log_probs = F.log_softmax(x_rec, dim=-1)
        construction_error = -torch.mean(torch.sum(log_probs * x, dim=-1))
        return construction_error

In [15]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
# Autoencoder over all n_movies inputs with hidden_dim=256 and latent_dim=128.
model_DAE = DAE(n_movies,256,128).to(device)

In [17]:
# Training objective: mean squared error between the reconstruction and the input.
loss_func = nn.MSELoss()

In [18]:
# Adam over all model parameters with a learning rate of 3e-4.
optimizer = optim.Adam(model_DAE.parameters(),lr=0.0003)

In [19]:
epochs = 20

for epoch in range(epochs):
    # Training mode enables the dropout layers that provide the denoising corruption.
    model_DAE.train()
    train_losses = []
    for x in train_loader:
        x = x.to(device).to(torch.float32)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()
        x_rec = model_DAE(x)
        # The target is the uncorrupted batch; the corruption happens inside the model.
        cost = loss_func(x_rec,x)
        cost.backward()
        optimizer.step()

        train_losses.append(cost.item())

    # Eval mode disables dropout for the validation pass.
    model_DAE.eval()
    acc = accuracy_func(model_DAE,10)
    mean_loss = torch.tensor(train_losses).mean()

    print(f"Epoch {epoch + 1},train loss: {mean_loss:.4f}, val accuracy: {acc:.4f}")


Epoch 1,train loss: 0.0344, val accuracy: 0.1513
Epoch 2,train loss: 0.0316, val accuracy: 0.1730
Epoch 3,train loss: 0.0311, val accuracy: 0.1953
Epoch 4,train loss: 0.0308, val accuracy: 0.2086
Epoch 5,train loss: 0.0306, val accuracy: 0.2275
Epoch 6,train loss: 0.0302, val accuracy: 0.2372
Epoch 7,train loss: 0.0300, val accuracy: 0.2470
Epoch 8,train loss: 0.0298, val accuracy: 0.2535
Epoch 9,train loss: 0.0297, val accuracy: 0.2581
Epoch 10,train loss: 0.0296, val accuracy: 0.2629
Epoch 11,train loss: 0.0295, val accuracy: 0.2661
Epoch 12,train loss: 0.0295, val accuracy: 0.2708
Epoch 13,train loss: 0.0294, val accuracy: 0.2751
Epoch 14,train loss: 0.0293, val accuracy: 0.2791
Epoch 15,train loss: 0.0293, val accuracy: 0.2822
Epoch 16,train loss: 0.0292, val accuracy: 0.2840
Epoch 17,train loss: 0.0291, val accuracy: 0.2868
Epoch 18,train loss: 0.0290, val accuracy: 0.2885
Epoch 19,train loss: 0.0291, val accuracy: 0.2900
Epoch 20,train loss: 0.0289, val accuracy: 0.2914
