# Performance

In [1]:
import numpy as np
import matplotlib.pyplot as plt

def mse(y: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the mean squared error (MSE) between two sets of values.

    Both inputs are converted to float64 arrays before the subtraction, so
    array-likes such as plain lists are accepted and unsigned / narrow integer
    dtypes cannot wrap around or overflow when differenced and squared.

    Note that inputs of different shapes are still combined with NumPy
    broadcasting rules; pass arrays of identical shape.

    Arguments:
        y {[np.ndarray]} -- 1D float values.
        y_pred {[np.ndarray]} -- 1D float values.

    Returns:
        float -- MSE value.
    """
    y_arr = np.asarray(y, dtype=np.float64)
    y_pred_arr = np.asarray(y_pred, dtype=np.float64)
    residual = y_pred_arr - y_arr
    return float(np.mean(residual * residual))

In [3]:
# Training split: the last column of the rated-embeddings array is used as the
# true rating, the first column of the prediction array as the prediction.
ground_truth_train = np.load(file="./data/rated_embeddings_train.npy")
rating_pred_train = np.load(file="./data/rating_pred_train_nn.npy")
train_true_ratings = ground_truth_train[:, -1]
train_pred_ratings = rating_pred_train[:, 0]
# Guard against misaligned files before scoring.
assert train_true_ratings.shape == train_pred_ratings.shape, "train truth/prediction shape mismatch"
# Keyword arguments keep the call aligned with mse(y, y_pred).
print("train_mse=", mse(y=train_true_ratings, y_pred=train_pred_ratings))

# Validation split: same column layout as the training split.
ground_truth_valid = np.load(file="./data/rated_embeddings_valid.npy")
rating_pred_valid = np.load(file="./data/rating_pred_valid_nn.npy")
valid_true_ratings = ground_truth_valid[:, -1]
valid_pred_ratings = rating_pred_valid[:, 0]
assert valid_true_ratings.shape == valid_pred_ratings.shape, "valid truth/prediction shape mismatch"
print("valid_mse=", mse(y=valid_true_ratings, y_pred=valid_pred_ratings))

NameError: name 'ground_truth_ratings' is not defined

### Rating distribution

#### Training (ground truth)

In [None]:
# Explicit figure/axes so the histogram never lands on a previously active
# axes, and so the plot is labelled when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(x=train_true_ratings, bins=10)
ax.set(title="Training ratings (ground truth)", xlabel="rating", ylabel="count")
print("num_data_points=", train_true_ratings.shape[0])
print("mean=", np.mean(train_true_ratings))

#### Training (prediction)

In [None]:
# Explicit figure/axes so the histogram never lands on a previously active
# axes, and so the plot is labelled when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(x=train_pred_ratings, bins=10)
ax.set(title="Training ratings (prediction)", xlabel="predicted rating", ylabel="count")
print("num_data_points=", train_pred_ratings.shape[0])
print("mean=", np.mean(train_pred_ratings))

On the training set, the distribution of predicted ratings does not resemble the ground-truth distribution. The model only captures the mean.

#### Validation (ground truth)

In [None]:
# Explicit figure/axes so the histogram never lands on a previously active
# axes, and so the plot is labelled when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(x=valid_true_ratings, bins=10)
ax.set(title="Validation ratings (ground truth)", xlabel="rating", ylabel="count")
print("num_data_points=", valid_true_ratings.shape[0])
print("mean=", np.mean(valid_true_ratings))

#### Validation (prediction)

In [None]:
# Explicit figure/axes so the histogram never lands on a previously active
# axes, and so the plot is labelled when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(x=valid_pred_ratings, bins=10)
ax.set(title="Validation ratings (prediction)", xlabel="predicted rating", ylabel="count")
print("num_data_points=", valid_pred_ratings.shape[0])
print("mean=", np.mean(valid_pred_ratings))

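The same holds for the validation set: the distribution of predicted ratings does not resemble the ground truth, and the model only captures the mean.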

# Prediction plot

In [None]:
# Slice ground_truth_train by column: the first two columns are kept as
# id_pairs, and every column between them and the last one becomes train_x.
id_pairs = ground_truth_train[:, :2]
train_x = ground_truth_train[:, 2:-1]

num_features = train_x.shape[1]
print("num_features=", num_features)

In [None]:
# First 1000 training samples in the plane of feature columns 0 and 30,
# coloured by the ground-truth rating (colour scale fixed to 0..5).
fig, ax = plt.subplots()
points = ax.scatter(x=train_x[:1000, 0], y=train_x[:1000, 30], c=train_true_ratings[:1000], vmin=0, vmax=5, s=50)
fig.colorbar(points, ax=ax, label="true rating")
ax.set(title="Training samples (ground truth)", xlabel="feature 0", ylabel="feature 30");

In [None]:
# First 1000 training samples in the plane of feature columns 0 and 30,
# coloured by the predicted rating (colour scale fixed to 0..5).
fig, ax = plt.subplots()
points = ax.scatter(x=train_x[:1000, 0], y=train_x[:1000, 30], c=train_pred_ratings[:1000], vmin=0, vmax=5, s=50)
fig.colorbar(points, ax=ax, label="predicted rating")
ax.set(title="Training samples (prediction)", xlabel="feature 0", ylabel="feature 30");

In [None]:
from sklearn.manifold import TSNE

# Fix the seed so the 2-D embedding of the first 1000 training samples (and
# every plot built on it) is reproducible across kernel restarts.
tsne = TSNE(n_components=2, random_state=0)
tsne_train_x = tsne.fit_transform(train_x[:1000, :])

In [None]:
# t-SNE embedding of the first 1000 training samples, coloured by the
# ground-truth rating (colour scale fixed to 0..5).
fig, ax = plt.subplots()
points = ax.scatter(x=tsne_train_x[:, 0], y=tsne_train_x[:, 1], c=train_true_ratings[:1000], vmin=0, vmax=5, s=50)
fig.colorbar(points, ax=ax, label="true rating")
ax.set(title="t-SNE of training samples (ground truth)", xlabel="t-SNE component 1", ylabel="t-SNE component 2");

In [None]:
# t-SNE embedding of the first 1000 training samples, coloured by the
# predicted rating (colour scale fixed to 0..5).
fig, ax = plt.subplots()
points = ax.scatter(x=tsne_train_x[:, 0], y=tsne_train_x[:, 1], c=train_pred_ratings[:1000], vmin=0, vmax=5, s=50)
fig.colorbar(points, ax=ax, label="predicted rating")
ax.set(title="t-SNE of training samples (prediction)", xlabel="t-SNE component 1", ylabel="t-SNE component 2");

# Prediction samples

In [None]:
# Peek at the first 20 ground-truth training ratings.
first_true_ratings = train_true_ratings[:20]
print("training_ground_truth=", first_true_ratings)

In [None]:
# First 20 training predictions, rounded to one decimal place.
first_pred_ratings = train_pred_ratings[:20]
print("training_prediction=", np.round(first_pred_ratings, 1))

### When ratings are low

In [None]:
# Predictions for samples whose true rating equals 2 (first 20 shown,
# rounded to one decimal place).
is_rated_two = train_true_ratings == 2
preds_for_rated_two = train_pred_ratings[is_rated_two][:20]
print("training_prediction(2)=", np.round(preds_for_rated_two, 1))