# Given

3 datasets:
* $X_u$ for user's preferences/features
* $X_m$ for items's features
* $Y$ for each item rating

# Find 
the content based filtering model, and predict movies to a new user

# Solution

In [125]:
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

Read inputs

In [137]:
# items

x_m_data = np.load("./Lab4_data_items_orig.npz")
x_m = x_m_data['arr_0']
m_features = x_m_data['arr_1']
xm_start = 1 # exclude movie id
xm_num_features = 16

# users
# there are really 300-ish users, but the qty of rows is boosted to match all other tables

x_u = np.load("./Lab4_data_users_orig.npz")['arr_0']
xu_start = 3 # exclude user id, average rating and other not useful info
xu_num_features = 14

# targets

targets = np.load("./Lab4_data_targets_orig.npz")['arr_0']

# check shapes

x_m.shape, x_u.shape, targets.shape

((50884, 17), (50884, 17), (50884,))

Scale data

In [138]:
m_scaler = StandardScaler()
m_scaler.fit(x_m)
x_m_scaled = m_scaler.transform(x_m)

u_scaler = StandardScaler()
u_scaler.fit(x_u)
x_u_scaled = u_scaler.transform(x_u)

target_scaler = MinMaxScaler()
target_scaler.fit(targets.reshape(-1,1))
targets_scaled = target_scaler.transform(targets.reshape(-1,1))

x_m_scaled.shape, x_u_scaled.shape, targets_scaled.shape

((50884, 17), (50884, 17), (50884, 1))

Train test split

In [139]:
m_train, m_test = train_test_split(x_m_scaled, train_size =0.8, shuffle=True, random_state=36)
u_train, u_test = train_test_split(x_u_scaled, train_size=0.8, shuffle=True, random_state=36)
y_train, y_test = train_test_split(targets_scaled, train_size=0.8, shuffle=True, random_state=36)

m_train.shape, m_test.shape, u_train.shape, u_test.shape, y_train.shape, y_test.shape

((40707, 17), (10177, 17), (40707, 17), (10177, 17), (40707, 1), (10177, 1))

NN convertion

In [140]:
u_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(units = 256, activation='relu', name='layer1'),
    tf.keras.layers.Dense(units = 128, activation='relu', name='layer2'),
    tf.keras.layers.Dense(units = 32, activation='linear', name='layer3'),
])

m_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(units = 256, activation='relu', name='layer1'),
    tf.keras.layers.Dense(units = 128, activation='relu', name='layer2'),
    tf.keras.layers.Dense(units = 32, activation='linear', name='layer3'),
])

u_input_layer = tf.keras.layers.Input(shape=xu_num_features)
vu = u_NN(u_input_layer)
vu = tf.linalg.l2_normalize(vu, axis=1)

m_input_layer = tf.keras.layers.Input(shape=xm_num_features)
vm = m_NN(m_input_layer)
vm = tf.linalg.l2_normalize(vm, axis=1)

result = tf.keras.layers.Dot(axes=1)([vu, vm])

model = tf.keras.Model([u_input_layer, m_input_layer], result)

model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_19 (InputLayer)       [(None, 14)]                 0         []                            
                                                                                                  
 input_20 (InputLayer)       [(None, 16)]                 0         []                            
                                                                                                  
 sequential_20 (Sequential)  (None, 32)                   40864     ['input_19[0][0]']            
                                                                                                  
 sequential_21 (Sequential)  (None, 32)                   41376     ['input_20[0][0]']            
                                                                                            

In [141]:
cost_fn = tf.keras.losses.MeanSquaredError()
opt = tf.keras.optimizers.Adadelta(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn)

In [142]:
model.fit([u_train[:,xu_start:], m_train[:,xm_start:]], y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x12e557450>

In [144]:
model.evaluate([u_train[:,xu_start:], m_train[:,xm_start:]], y_train)



0.02618410438299179

Prediction for a new user

In [149]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0 # preference 1
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0 # preference 2
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

Use user vector and predict on a catalogue of items:

In [173]:
item_vecs = np.genfromtxt('./Lab4_data_catalogue.csv', delimiter=',')

x_u_predict = np.tile(user_vec, (len(item_vecs),1))
x_u_predict_scaled = u_scaler.transform(x_u_predict)
x_m_predict_scaled = u_scaler.transform(item_vecs)

prediction_scaled = model.predict([x_u_predict_scaled[:,xu_start:], x_m_predict_scaled[:,xm_start:]])
prediction = target_scaler.inverse_transform(prediction_scaled)

 1/27 [>.............................] - ETA: 1s



In [174]:
sorted_index = np.argsort(-prediction, axis=0).reshape(-1).tolist()
sorted_prediction = prediction[sorted_index]
sorted_items = item_vecs[sorted_index]

sorted_items[0:5,:]

array([[ 5618.  ,  2001.  ,     4.16,     0.  ,     1.  ,     1.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ,     1.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ],
       [31658.  ,  2004.  ,     4.08,     0.  ,     1.  ,     1.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ,     1.  ,
            0.  ,     0.  ,     1.  ,     0.  ,     0.  ],
       [ 7153.  ,  2003.  ,     4.12,     1.  ,     1.  ,     0.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     1.  ,     1.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ],
       [ 4993.  ,  2001.  ,     4.11,     0.  ,     1.  ,     0.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ,     1.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ],
       [92535.  ,  2011.  ,     4.3 ,     0.  ,     0.  ,     0.  ,
            0.  ,     1.  ,     0.  ,     0.  ,     0.  ,     0.  ,
            0.  ,     0.  ,     0.  ,     0.  ,     0.  ]])

As shown above, selection was done according to user's preferences 