# Variational AutoEncoder

## Import

In [102]:
import learning2read
learning2read.reload_all()
from learning2read.b05 import *

## Load data

In [103]:
# Load the five raw tables by name, in a fixed order, then expose them
# both as individual variables and through one lookup dictionary.
raw_train, raw_test, raw_implicit, raw_user, raw_book = (
    Data(table_name)
    for table_name in ('btrain', 'btest', 'implicit', 'user', 'book')
)
raw_dataset = dict(
    raw_train=raw_train,
    raw_test=raw_test,
    raw_implicit=raw_implicit,
    raw_user=raw_user,
    raw_book=raw_book,
)


In [104]:
# Run the project's TotalDataFrame preprocessing step over the train, test and
# implicit tables and keep the entry stored under its 'output' key.
df_total = learning2read.preprocessing.TotalDataFrame.run([raw_train, raw_test, raw_implicit])['output']

In [105]:
# Per-user rating statistics: how many rows each user has, plus min/max rating.
dfg_user = df_total.groupby('User-ID')
gdf_user = dfg_user.agg({'Book-Rating': ['count', 'min', 'max']})
gdf_user.columns = ['count', 'min', 'max']  # flatten the two-level column index

# Keep only users with at least 1000 rows, most active first.
# NOTE: the variable is named "gte10" but the threshold applied here is 1000.
is_heavy_user = gdf_user['count'] >= 1000
gdf_user_gte10 = gdf_user.loc[is_heavy_user, 'count'].sort_values(ascending=False)
print(len(gdf_user_gte10))
gdf_user_gte10[:5]

117


User-ID
188ec05cf3    13602
58a34dcfe5     7550
91a5b876c9     6109
997dc62862     5891
67a302acdf     5850
Name: count, dtype: int64

In [106]:
# Count how many rows reference each ISBN.
gdf_book = df_total.groupby('ISBN').agg({'User-ID': 'count'})
gdf_book.columns = ['count']

# Drop books that appear fewer than twice, then order by popularity.
# (An alternative, stricter cut at >= 300 was tried and left disabled.)
has_enough_rows = gdf_book['count'] >= 2
gdf_book = gdf_book.loc[has_enough_rows, :].sort_values('count', ascending=False)

# Matrix dimensions: one row per retained book, one column per retained user.
dim1 = len(gdf_book.index)
dim2 = len(gdf_user_gte10.index)
dim1, dim2

(143511, 117)

In [107]:
from collections import defaultdict

# Map each retained user / book key to its position in the corresponding index.
# Both stay defaultdicts returning -1 so later `mapping[key]` lookups keep working.
user_vector_id = defaultdict(lambda: -1)
user_vector_id.update(
    (user_key, position) for position, user_key in enumerate(gdf_user_gte10.index)
)
book_id = defaultdict(lambda: -1)
book_id.update(
    (isbn, position) for position, isbn in enumerate(gdf_book.index)
)

# Collect one [book_row, user_column] pair per row of df_total whose user and
# book were both retained.  Iterating the two columns directly avoids
# `to_dict('record')`, an abbreviated orient that recent pandas rejects, and
# `.get` avoids inserting every unknown key into the defaultdicts.
index_list = []
for user_key, isbn in zip(df_total['User-ID'], df_total['ISBN']):
    uid = user_vector_id.get(user_key, -1)
    bid = book_id.get(isbn, -1)
    if uid >= 0 and bid >= 0:
        index_list.append([bid, uid])

## Parsing CLI arguments

In [108]:
import argparse

# Training hyper-parameters are declared as command-line options, but they are
# parsed from an empty argument list below, so the defaults always apply here.
parser = argparse.ArgumentParser(description='VAE MNIST Example')
parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                    help='input batch size for training (default: 128)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
# With action='store_true' and default=True this option is always True,
# so `args.cuda` below is always False and training runs on the CPU.
parser.add_argument('--no-cuda', action='store_true', default=True,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
args = parser.parse_args([])
args.cuda = not args.no_cuda and torch.cuda.is_available()
# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")
# Extra DataLoader options are only supplied when training on CUDA.
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

In [109]:
# Build a (2, nnz) index tensor of [book_row, user_column] pairs.  The reshape
# keeps the shape valid even when index_list is empty.
index_tns = torch.tensor(index_list, dtype=torch.long).reshape(-1, 2).t()
# Every recorded interaction contributes a value of 1.
value_tns = torch.ones(index_tns.size(1))
# Sparse book-by-user matrix; torch.sparse_coo_tensor replaces the legacy
# torch.sparse.FloatTensor constructor.
# NOTE(review): repeated (book, user) pairs are presumably summed when the
# tensor is densified, which would give entries above 1 - confirm pairs are unique.
train_tns = torch.sparse_coo_tensor(index_tns, value_tns, torch.Size([dim1, dim2]))

In [110]:
# Convert the sparse matrix to a dense tensor so rows can be indexed directly.
train_tns=train_tns.to_dense()

In [111]:
# Display the dense tensor's shape as a sanity check.
train_tns.size()

torch.Size([143511, 117])

In [112]:
class BookVectorData(torch.utils.data.Dataset):
    """Dataset wrapper that serves one row of a feature container per sample.

    There are no labels: each item is simply ``train_features[index]``.

    Args:
        train_features: Any indexable container that supports ``len()``;
            the notebook passes it a 2-D tensor of feature rows.
    """

    def __init__(self, train_features):
        self.train_features = train_features

    def __getitem__(self, index):
        """Return the feature row stored at position ``index``.

        Args:
            index (int): Position of the sample.

        Returns:
            The element ``train_features[index]``, unchanged.
        """
        return self.train_features[index]

    def __len__(self):
        """Return the number of samples, i.e. ``len(train_features)``."""
        return len(self.train_features)

In [113]:
# Wrap the dense feature tensor in a dataset and serve it in shuffled
# mini-batches; `kwargs` carries the device-dependent loader options.
train_dataset = BookVectorData(train_tns)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=args.batch_size,
    **kwargs,
)

In [114]:
from torch import nn, optim
from torch.nn import functional as F
import datetime
# Short alias for datetime.datetime.now, handy for ad-hoc timing.
now=datetime.datetime.now

In [115]:
class VAE(nn.Module):
    """Fully connected variational autoencoder with one hidden layer per side.

    Args:
        input_dim (int): Width of each input row (default 117, the original
            hard-coded value).
        hidden_dim (int): Width of the hidden layers (default 64).
        latent_dim (int): Width of the latent code (default 16).

    Calling ``VAE()`` with no arguments builds exactly the original network.
    """

    def __init__(self, input_dim=117, hidden_dim=64, latent_dim=16):
        super(VAE, self).__init__()
        # Remembered so forward() can flatten inputs to the right width.
        self.input_dim = input_dim

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)  # produces mu
        self.fc22 = nn.Linear(hidden_dim, latent_dim)  # produces logvar
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Map inputs to the pair ``(mu, logvar)`` of the latent distribution."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * std`` while training; return ``mu`` in eval mode."""
        if self.training:
            std = torch.exp(0.5*logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def decode(self, z):
        """Map latent codes back to input space, squashed into (0, 1)."""
        h3 = F.relu(self.fc3(z))
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for a batch ``x``.

        ``x`` is reshaped to ``(-1, input_dim)`` before encoding.
        """
        mu, logvar = self.encode(x.view(-1, self.input_dim))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [116]:
# Instantiate the network, place it on the selected device, and attach an
# Adam optimizer over all of its parameters.
model = VAE()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [117]:
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function(recon_x, x, mu, logvar):
    """Return the summed VAE loss: binary cross-entropy plus KL divergence.

    Args:
        recon_x: Reconstructed batch with values in [0, 1], one row per sample.
        x: Target batch; reshaped to the row width of ``recon_x``.
        mu: Latent means produced by the encoder.
        logvar: Latent log-variances produced by the encoder.

    Returns:
        A scalar tensor, summed (not averaged) over every element and sample.
    """
    # Flatten targets to the reconstruction's row width instead of a fixed 117;
    # reduction='sum' replaces the deprecated size_average=False.
    BCE = F.binary_cross_entropy(
        recon_x, x.view(-1, recon_x.size(-1)), reduction='sum')

    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    return BCE + KLD

In [118]:
def train(epoch):
    """Run one training pass over ``train_loader`` and print progress.

    Uses the module-level ``model``, ``optimizer``, ``device``, ``args`` and
    ``loss_function``.  A status line is printed every ``args.log_interval``
    batches, followed by the epoch's average loss per sample.

    Args:
        epoch: Epoch number, used only in the printed messages.
    """
    model.train()
    sample_count = len(train_loader.dataset)
    batch_count = len(train_loader)
    running_total = 0
    for step, batch in enumerate(train_loader):
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(batch)
        batch_loss = loss_function(reconstruction, batch, mu, logvar)
        batch_loss.backward()
        running_total += batch_loss.item()
        optimizer.step()

        # Only report on every log_interval-th batch.
        if step % args.log_interval != 0:
            continue
        status = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(batch), sample_count,
            100. * step / batch_count,
            batch_loss.item() / len(batch))
        print(status)

    summary = '====> Epoch: {} Average loss: {:.4f}'.format(
        epoch, running_total / sample_count)
    print(summary)

In [119]:
# Train for args.epochs passes, numbering epochs from 1 for the log output.
for epoch in range(1, args.epochs + 1):
    train(epoch)

====> Epoch: 1 Average loss: 10.2543


====> Epoch: 2 Average loss: 6.3411


====> Epoch: 3 Average loss: 6.2945
====> Epoch: 4 Average loss: 6.2417


====> Epoch: 5 Average loss: 6.2047


====> Epoch: 6 Average loss: 6.1812
====> Epoch: 7 Average loss: 6.1543


====> Epoch: 8 Average loss: 6.1409


====> Epoch: 9 Average loss: 6.1109


====> Epoch: 10 Average loss: 6.0967
