## Classical model
A multilayer fully connected (FC) model should be able to estimate a user's rating of a movie from the movie's release year and genres and the user's age, gender and occupation. While that is not a large amount of information, it should be enough to give a rough estimate.

In [134]:
import pandas as pd

# Load the feature table from 'classical.csv', using the first CSV column as the row index.
data = pd.read_csv('classical.csv', index_col=0)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,49,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,39,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,0,3
2,25,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,28,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,2
4,47,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Year has a much higher variance and mean than any other value in this dataset, so it has to be normalized before it can be used.

In [135]:
# Column '23' is the movie release year; inspect its range before rescaling it.
data['23'].describe()

count    99991.000000
mean      1987.956216
std         14.155523
min       1922.000000
25%       1986.000000
50%       1994.000000
75%       1996.000000
max       1998.000000
Name: 23, dtype: float64

In [136]:
def norm_year(year):
    """Return ``year`` expressed as an offset from 1900 (e.g. 1997 -> 97)."""
    base_year = 1900
    return year - base_year
# Overwrite the release-year column ('23') with its shifted values.
# Gotcha: the column is replaced in place, so re-running this cell shifts it again.
data['23'] = norm_year(data['23'])
data['23']

0        97
1        97
2        94
3        94
4        97
         ..
99995    96
99996    85
99997    93
99998    96
99999    92
Name: 23, Length: 99991, dtype: int64

In [139]:
# Column '43' is the rating to predict; every remaining column is a model input.
y = data['43']
X = data.drop(columns='43')
display(X.head(), y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,49,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,39,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,25,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
4,47,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


0        3
1        3
2        1
3        2
4        1
        ..
99995    3
99996    5
99997    1
99998    2
99999    3
Name: 43, Length: 99991, dtype: int64

In [154]:
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import random_split
import torch
torch.manual_seed(1337)

# Build both tensors from the underlying NumPy arrays. Passing the pandas Series
# itself makes torch walk it as a generic Python sequence, which is slow and
# depends on the Series' index labels (the index here has gaps).
Xt = torch.tensor(X.values, dtype=torch.float)
yt = torch.tensor(y.values, dtype=torch.float)

processed_dataset = TensorDataset(Xt, yt)

# set proportion and split dataset into train and validation parts
# (fractions are given to random_split; the split is driven by torch's global RNG)
proportion = 0.2
train_dataset, val_dataset = random_split(processed_dataset, [1-proportion, proportion])

In [155]:
batch_size = 512
# Training batches are reshuffled every epoch; validation batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size)

In [156]:
from tqdm.notebook import tqdm

def train(
    model,
    optimizer,
    scheduler,
    loss_fn,
    train_loader,
    val_loader,
    epochs=1,
    device="cpu",
    ckpt_path="best.pt",
):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. The validation loss is the
    unweighted mean of the per-batch losses; it is printed, passed to
    ``scheduler.step`` and used to decide whether to save a checkpoint.

    Args:
        model: module to train; it is moved to ``device``.
        optimizer: optimizer that updates the parameters of ``model``.
        scheduler: scheduler whose ``step`` accepts the mean validation loss.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        epochs: number of passes over ``train_loader``.
        device: device the model and every batch are moved to.
        ckpt_path: file that receives ``model.state_dict()`` whenever the mean
            validation loss improves.

    Returns:
        None.
    """
    # lowest mean validation loss seen so far (used for checkpointing)
    best_loss = float("inf")

    # the model has to be moved only once, not on every epoch
    model.to(device)

    # iterating over epochs
    for epoch in range(epochs):
        # training loop description
        train_loop = tqdm(
            enumerate(train_loader, 0), total=len(train_loader), desc=f"Epoch {epoch}"
        )
        model.train()
        train_loss = 0.0
        # iterate over dataset
        for i, data in train_loop:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass and loss calculation
            outputs = model(inputs)

            # drop singleton dimensions so outputs and labels have matching shapes
            labels = torch.squeeze(labels)
            outputs = torch.squeeze(outputs)

            loss = loss_fn(outputs, labels)

            # backward pass
            loss.backward()

            # optimizer run
            optimizer.step()

            train_loss += loss.item()
            train_loop.set_postfix({"loss": train_loss/(i+1)})

        # validation
        model.eval()  # evaluation mode
        eval_loss = 0.0
        n_batches = 0
        with torch.no_grad():
            val_loop = tqdm(enumerate(val_loader, 0), total=len(val_loader), desc="Val")
            for _, data in val_loop:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                labels = torch.squeeze(labels)
                outputs = torch.squeeze(outputs)

                loss = loss_fn(outputs, labels)

                eval_loss += loss.item()
                n_batches += 1

        if n_batches == 0:
            # no validation batches: there is no metric to report, checkpoint on,
            # or hand to the scheduler for this epoch
            continue

        mean_eval_loss = eval_loss / n_batches
        print(f'eval_loss: {mean_eval_loss}')

        # compare losses directly; an inverse-loss score would divide by zero
        # for a perfect fit and never improve for non-positive losses
        if mean_eval_loss < best_loss:
            torch.save(model.state_dict(), ckpt_path)
            best_loss = mean_eval_loss

        scheduler.step(mean_eval_loss)



In [157]:
from torch import nn

class RatingModel(nn.Module):
    """Fully connected regressor mapping a feature vector to a single value.

    The stack narrows through hidden layers of 64, 32 and 16 units down to one
    output, with a ReLU after every linear layer, including the last one.

    NOTE(review): the trailing ReLU clamps predictions to >= 0 -- confirm this
    is intended for the rating target.
    """

    def __init__(self, input_dim):
        super().__init__()

        # consecutive pairs of widths define the Linear layers, in order
        widths = (input_dim, 64, 32, 16, 1)
        layers = []
        for n_in, n_out in zip(widths, widths[1:]):
            layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return its output."""
        return self.model(x)


In [161]:
import torch.optim as optim

# 43 inputs: every column of the feature table except the rating target ('43').
model = RatingModel(43)
# L1 loss is the mean absolute error between predicted and true ratings.
loss_fn = nn.L1Loss()
# `is_available` has to be called: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [162]:
# Adam starts at a learning rate of 0.01; the plateau scheduler is driven by the
# validation loss that `train` passes to `scheduler.step`.
# NOTE(review): the scheduler uses its default patience, which equals the number
# of epochs requested below, so it will probably never lower the rate -- confirm.
optimizer = optim.Adam(params=model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)

train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_fn=loss_fn,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
    device=device,
)

Epoch 0:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.9395965531468391


Epoch 1:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.8972798213362694


Epoch 2:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.8936260148882866


Epoch 3:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.877587367594242


Epoch 4:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.87840576171875


Epoch 5:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.8545227691531181


Epoch 6:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.847923320531845


Epoch 7:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.8603326112031937


Epoch 8:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.8683356061577797


Epoch 9:   0%|          | 0/157 [00:00<?, ?it/s]

Val:   0%|          | 0/40 [00:00<?, ?it/s]

eval_loss: 0.8671587467193603


The resulting model reaches a Mean Absolute Error (MAE) of about 0.85 on the validation part of the dataset (0.848 at the best epoch), which is a decent result considering the limited information available to the model, and given that a recommender model does not need to aim for perfect accuracy.