In [1]:
import sys
import os
import pandas as pd
import numpy as np

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
import torch
from torch import nn
from torch import functional as F

batch_size = 32
block_size = 4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
DATA_PATH = os.path.join(PROJECT_ROOT, "data")
PATH_RATINGS = os.path.join(DATA_PATH, "raw_data", "kaggle_second_sem", "books_rating.csv")
PATH_BOOKS = os.path.join(DATA_PATH, "raw_data", "kaggle_second_sem", "books_data.csv")
PATH_EMBDS = os.path.join(DATA_PATH, "embeddings", "expanded_embds_ss.npy")

df_books = pd.read_csv(PATH_BOOKS)
df_ratings = pd.read_csv(PATH_RATINGS)

# Book embds has shape like
#(Title, (count, 1), (review/score, 1) (author, 1), (categories, 1), (publisher, 1), (description_embd, 384))

In [55]:
max_title = book_embds[:, 1].max()
max_score = book_embds[:, 2].max()
max_author = book_embds[:, 3].max()
max_categories = book_embds[:, 4].max()
max_publisher = book_embds[:, 5].max()

print(f"max_title: {max_title}, max_score: {max_score}, max_author: {max_author}, max_categories: {max_categories}, max_publisher: {max_publisher}")

max_title: 22023.0, max_score: 5.0, max_author: 127278, max_categories: 10883, max_publisher: 16016


  return umr_maximum(a, axis, None, out, keepdims, initial, where)


In [127]:
book_embds = np.load(PATH_EMBDS, allow_pickle=True)

ratings_seqs = df_ratings.groupby("User_id")["Title"].apply(lambda x: list(set(x.tolist()))).loc[lambda x: x.str.len() > 4].reset_index()
ratings_seqs = ratings_seqs["Title"]

def clear_nan(sample):
    return sample[np.array([all(isinstance(j, str) for j in seq) for seq in sample])]

ratings_seqs = clear_nan(ratings_seqs)
train = ratings_seqs.iloc[:30000]
test = ratings_seqs.iloc[30000:]

def normalize(x):
    x[0] = x[0] / max_title
    x[1] = x[1] / max_score
    x[2] = x[2] / max_author
    x[3] = x[3] / max_categories
    x[4] = x[4] / max_publisher
    return x
book_embds_dict = {row[0]: np.array(normalize(row[1:]), dtype=np.float32) for row in book_embds}

In [125]:
def encode(book_title):
    return torch.tensor(book_embds_dict[book_title], dtype=torch.float32)
def encode_seq(seq):
    return torch.cat([encode(title).unsqueeze(0) for title in seq], dim=0)

In [131]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

def get_batch(split):
    data = train.reset_index(drop=True) if split == 'train' else test.reset_index(drop=True)
    ix = torch.randint(0, len(data), (batch_size,))

    seq = []
    y = []

    for i in ix:
        i = int(i)
        s = encode_seq(data[i][:-1])
        seq.append(s)
        y.append(encode(data[i][-1]))

    
    lengths = torch.tensor([len(s) for s in seq], dtype=torch.long)
    padded_seq = pad_sequence(seq, batch_first=True)
    packed_seq = pack_padded_sequence(padded_seq, lengths, batch_first=True, enforce_sorted=False)

    x = packed_seq.to(device)
    y = torch.stack(y).to(device)

    return x, y


In [233]:
input_dim = 389
output_dim = 389
hidden_dim = 128
num_layers = 3
dropout = 0.2


class Rekomendatel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        lstm_out1, (h_n1, c_n1) = self.lstm1(x)
        lstm_out2, (h_n2, c_n2) = self.lstm2(lstm_out1)
        last_hidden_state = h_n2[-1]
        out = self.dropout(last_hidden_state)
        out = self.fc(out)
        return out


In [235]:
import torch
import torch.optim as optim
from torch import nn

model = Rekomendatel()
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for _ in range(len(train) // batch_size):
        inputs, targets = get_batch('train')

        inputs, targets = inputs.cuda() if torch.cuda.is_available() else inputs, targets.cuda() if torch.cuda.is_available() else targets
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train):.4f}')
    
    scheduler.step(running_loss / len(train))

torch.save(model.state_dict(), 'model.pth')


Epoch [1/10], Loss: 0.0017
Epoch [2/10], Loss: 0.0015
Epoch [3/10], Loss: 0.0015
Epoch [4/10], Loss: 0.0015
Epoch [5/10], Loss: 0.0015
Epoch [6/10], Loss: 0.0014
Epoch [7/10], Loss: 0.0014
Epoch [8/10], Loss: 0.0014
Epoch [9/10], Loss: 0.0014
Epoch [10/10], Loss: 0.0014


In [134]:
import torch
torch.save(model.state_dict(), 'model.pth')


In [229]:
loaded_model = Rekomendatel()
loaded_model.eval()

loaded_model.load_state_dict(torch.load('model.pth'))
total_loss = 0
for _ in range(len(test) // batch_size):
    x, y = get_batch('test')
    with torch.no_grad():
        pred = loaded_model(x)
        loss = criterion(pred, y)
        total_loss += loss.item()
    
print(f"Test loss: {total_loss/len(test):.4f}")

Test loss: 0.0014
