In [1]:
###model.py

import numpy as np
import tensorflow as tf
from keras import layers, regularizers, optimizers
import polars as pl
from keras.saving import register_keras_serializable

@register_keras_serializable()
class ColaborativeFiltering(tf.keras.Model):
    def __init__(self, num_user_features, num_movie_features, user_layers=[128, 64], movie_layers=[128, 64], embedding=32, learning_rate=0.001, user_reg = None, movie_reg = None, **kwargs):
        super().__init__(**kwargs)
        self.num_user_features = num_user_features
        self.num_movie_features = num_movie_features
        self.embedding = embedding
        self.learning_rate = learning_rate

        # User branch
        user_dense_layers = []
        for i, units in enumerate(user_layers):
            reg = user_reg[i] if user_reg is not None else None
            user_dense_layers.append(layers.Dense(units, activation='tanh', kernel_initializer='glorot_uniform', kernel_regularizer=reg))

        user_dense_layers.append(layers.Dense(self.embedding, activation='tanh', kernel_initializer='glorot_uniform'))
        self.user_net = tf.keras.Sequential(user_dense_layers)

        # movie branch
        movie_dense_layers = []
        for units in movie_layers:
            movie_dense_layers.append(layers.Dense(units, activation='tanh', kernel_initializer='glorot_uniform'))
        movie_dense_layers.append(layers.Dense(self.embedding, activation='tanh', kernel_initializer='glorot_uniform'))
        self.movie_net = tf.keras.Sequential(movie_dense_layers)

        self.dot = layers.Dot(axes=1, name='cosine_similarity')

        # Save architecture parameters for serialization
        self.user_layers = user_layers
        self.movie_layers = movie_layers
        self.user_reg = user_reg
        self.movie_reg = movie_reg

        self.compile(
            optimizer=optimizers.Nadam(learning_rate=self.learning_rate),
            loss='mse',
            metrics=['mae', 'mse']
        )

    def get_config(self):
        config = super().get_config()
        config.update({
            'num_user_features': self.num_user_features,
            'num_movie_features': self.num_movie_features,
            'embedding': self.embedding,
            'learning_rate': self.learning_rate,
            'user_layers': self.user_layers,
            'movie_layers': self.movie_layers,
            'user_reg': self.user_reg,
            'movie_reg': self.movie_reg
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

    def call(self, inputs):
        user_input, movie_input = inputs
        user_embedding = tf.nn.l2_normalize(self.user_net(user_input), axis=1)
        movie_embedding = tf.nn.l2_normalize(self.movie_net(movie_input), axis=1)
        cos_sim = self.dot([user_embedding, movie_embedding])
        return cos_sim

    def recommend(self, user_vec, movie_matrix, user_seen_movie_indices = None, k=10, movie_titles=None):
        user_vecs = tf.repeat(tf.reshape(user_vec, (1, -1)), tf.shape(movie_matrix)[0], axis=0)
        preds = self.predict([user_vecs, movie_matrix])
        # mask_indices = tf.constant(list(user_seen_movie_indices), dtype=tf.int32)
        # preds = tf.tensor_scatter_nd_update(
        #     tf.squeeze(preds),
        #     tf.expand_dims(mask_indices, 1),
        #     tf.fill([tf.size(mask_indices)], tf.constant(-float('inf'), dtype=preds.dtype))
        # )
        top_k_idx = tf.argsort(preds, direction='DESCENDING')[:k]
        if movie_titles is not None:
            return [(movie_titles[int(i)], float(preds[i])) for i in top_k_idx]
        else:
            return [(int(i), float(preds[i])) for i in top_k_idx]

    def get_user_seen_movie_indices(self, user_id, ratings, movies):
        gledani_movieid = set(ratings.filter(pl.col('userid') == user_id)['movieid'].to_list())
        movieid_to_idx = {movie_id: idx for idx, movie_id in enumerate(movies['movieid'].to_list())}
        return {movieid_to_idx[movie_id] for movie_id in gledani_movieid if movie_id in movieid_to_idx}

In [470]:
###prep.py


import polars as pl
import os
from sqlalchemy import create_engine
import tensorflow as tf

'''Funkcije za pripremu podataka za collaborative filtering model'''

def read_data_lake():
    '''
    Data lake --> Polars.DataFrame
    '''
    engine = create_engine(f"postgresql+psycopg2://postgres:{os.getenv('POSTGRES_PASSWORD')}@localhost:5432/movie_recommendation")
    conn = engine.connect()
    ratings = pl.read_database(query='SELECT * FROM data_lake.ratings', connection=conn)
    movies = pl.read_database(query='SELECT * FROM raw.movies', connection=conn)
    conn.close()
    return ratings, movies

def prep_pipeline(ratings, movies, user_id = None):
    '''
    Priprema za model
    '''
    #PROSECAN BROJ OCENA PO FILMU
    num_ratings = ratings.group_by('movieid').agg(pl.len().alias('#ratings_film'))
    user = ratings.join(num_ratings, on = 'movieid', how = 'inner').sort(['movieid', 'userid'])
    movies, unique_genres = get_genres(movies, prep = True)
    #LAZY!
    user = user.lazy()
    movies = movies.lazy()
    #SVI ZANROVI
    for genre in unique_genres:
        movies = movies.with_columns(pl.col("genres").list.contains(genre).cast(pl.Int8).alias(genre))
    movies = movies.drop('genres')
    #KOLONA GODINA
    movies = movies.with_columns(pl.col("title").str.extract(r"\((\d{4})\)", 1).cast(pl.Int16).alias("year"))

    #ISTI FORMAT TABELE KAO MOVIES
    user_zanr_train = user.join(movies, on='movieid', how='inner')

    #PIVOT LONGER --> ZANROVE PREBACUJEM U JEDNU KOLONU
    user_longer = (user_zanr_train.unpivot(index=['userid', 'rating'],
                                           on=unique_genres).filter(pl.col('value') == 1).rename({'variable': 'genre', 'value': 'is_genre'}))

    #RACUNAM PROSEK ZA SVAKOG USERA ZA SVAKI ZANR I VRACAM U WIDE FORMAT
    user_feature = user_longer.group_by('userid').agg([(pl.when(pl.col('genre') == genre).then(pl.col('rating')).mean().alias(genre)) for genre in unique_genres]).fill_null(0)
    movie_avg_rating = (user.group_by('movieid').agg(pl.col('rating').mean().alias('avg_rating')))
    movie_features = movies.join(movie_avg_rating, on='movieid', how='left').fill_null(0)
    movie_features = movie_features.select(['movieid', 'title','year','avg_rating', *unique_genres])
    df = user.join(user_feature, on="userid", how="inner").join(movie_features, on="movieid", how="inner")
    df = df.collect()
    # movie_features = movie_features.rename({"(no genres listed)": "no genres listed"})
    # user_feature = user_feature.rename({"(no genres listed)": "no genres listed"})
    # df = df.rename({"(no genres listed)": "no genres listed"})
    user_feature = user_feature.sort('userid')
    df = df.sort('userid')

    return user_feature.collect(), movie_features.collect(), df

def get_genres(movies, prep = False):
    movies = movies.with_columns(pl.col("genres").str.split("|"))
    unique_genres = sorted(set(g for genre in movies["genres"] for g in genre))
    unique_genres[0] = unique_genres[0].replace('(', '').replace(')', '')
    if prep == True:
      return movies, unique_genres
    else:
      return unique_genres

def global_scalers():
    engine = create_engine(f"postgresql+psycopg2://postgres:{os.getenv('POSTGRES_PASSWORD')}@localhost:5432/movie_recommendation")
    conn = engine.connect()
    df = pl.read_database(query='SELECT * FROM raw.ratings', connection=conn)
    user, movies_feat, df = prep_pipeline(df, pl.read_database(query='SELECT * FROM raw.movies', connection=conn))
    _, _ , _, scalers = scale(df, user, movies_feat)
    conn.close()
    return scalers

def scale(df, user, movies, user_id = None):
    '''
    Skaliranje numeričkih karakteristika i prebacivanje u tenzore
    df - Polars DataFrame sa svim podacima
    user - Polars DataFrame sa korisničkim karakteristikama
    movies - Polars DataFrame sa filmskim karakteristikama
    user_id - ako je None, onda se vracaju svi korisnici, ako je lista (ili int) onda se vraca samo taj korisnik

    '''
    y = tf.convert_to_tensor(df.select(pl.col('rating')).to_numpy(), dtype=tf.float16)

    prva_user = df.columns.index('no genres listed')
    poslednja_user = df.columns.index('Western')
    ###prva kolona u X_user_ud je userid!!!, trebace za preporuke, za treniranje koristiti X_user
    X_user_id = tf.convert_to_tensor(df.select(['userid'] + df.columns[prva_user : poslednja_user + 1]).to_numpy(), dtype=tf.float32)
    X_movie_df = df.select(['movieid','year','avg_rating', '#ratings_film'] + [col for col in df.columns if col.endswith('_right')])
    movie_num = tf.convert_to_tensor(X_movie_df.select(['#ratings_film', 'year', 'avg_rating']).to_numpy(), dtype=tf.float32)
    ###u movie_cat ima i movieid
    movie_cat = tf.convert_to_tensor(X_movie_df.select(pl.all().exclude(['#ratings_film', 'year', 'avg_rating'])).to_numpy(), dtype=tf.float32)
    # Standardizacija user i movie numeričkih
    X_user = X_user_id[:, 1:]
    user_mean = tf.reduce_mean(X_user, axis=0)
    user_std = tf.math.reduce_std(X_user, axis=0)
    X_user_scaled = (X_user - user_mean) / (user_std+ 1e-8)
    X_user_id_scaled = tf.concat([X_user_id[:, :1], X_user_scaled], axis=1)  # Skalirano sa ID kolonom
    movie_mean = tf.reduce_mean(movie_num, axis=0)
    movie_std = tf.math.reduce_std(movie_num, axis=0)
    movie_num_scaled = (movie_num - movie_mean) / (movie_std)
    X_movie_scaled = tf.concat([movie_cat[:,1:], movie_num_scaled], axis=1)
    # Target skaliranje na [-1, 1]
    y_scaled = 2 * (y - tf.reduce_min(y)) / (tf.reduce_max(y) - tf.reduce_min(y)) - 1
    scalers = {"user_mean": user_mean, "user_std": user_std,"movie_mean": movie_mean,"movie_std": movie_std, "y_min": tf.reduce_min(y), "y_max": tf.reduce_max(y)}
    if user_id is not None:
        ###Ako je dat user id, filtriramo X_user_id_scaled i X_movie_scaled i vracamo samo korisnika sa tim user_id-om, ako nije vracamo sve korisnike
        maska = tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)
        X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)  #prva kolona je userid
        y_scaled = tf.boolean_mask(y_scaled, maska)

    #   return X_user_id_scaled,X_movie_scaled maska , y_scaled, scalers
    # # Ako user_id nije naveden, vracamo sve korisnike bez filtriranja user_id-a
    # else:
    #   return X_user_scaled, X_movie_scaled, y_scaled, scalers


def batch_generator(movies, batch_size=1000000, total = 2e7):
    '''
    Pravi skupove od batch_size (milion) iz nasumicnih total (20 miliona) redova u tabeli ratings
    '''
    engine = create_engine(f"postgresql+psycopg2://postgres:{os.getenv('POSTGRES_PASSWORD')}@localhost:5432/movie_recommendation")
    conn = engine.connect()
    offset = 0
    while offset < total:
        query = f"SELECT * FROM raw.ratings LIMIT {batch_size} OFFSET {offset}"
        batch = pl.read_database(query=query, connection=conn)
        if batch.height == 0:
            break
        user, movies_feat, df = prep_pipeline(batch, movies, batch)
        X_user, X_movie, y, scalers = scale(df, user, movies_feat)
        yield (X_user, X_movie), tf.squeeze(y)
        offset += batch_size
    conn.close()

# def train_test_split(X_user, X_movie, y, test_size=0.2, random_state= 42):
#     N = X_user.shape[0]
#     tf.random.set_seed(random_state)
#     idx = tf.random.shuffle(tf.range(N))
#     split = int(N * (1 - test_size))
#     train_idx = idx[:split]
#     dev_idx = idx[split:]

#     X_user_train, X_movie_train, y_train = tf.gather(X_user, train_idx), tf.gather(X_movie, train_idx), tf.gather(y, train_idx)

#     X_user_dev, X_movie_dev, y_dev = tf.gather(X_user, dev_idx), tf.gather(X_movie, dev_idx), tf.gather(y, dev_idx)

#     return (X_user_train, X_movie_train), y_train, (X_user_dev, X_movie_dev), y_dev




# def NN_prep(df, user, movies, user_id = None):
#     '''
#     Prebacivanje u tenzore i skaliranje --> tf.Tensor
#     user_id - za listu usera, ako je None onda vraca tf.Tensor sa svim userima
#     '''
#     y = tf.convert_to_tensor(df.select(pl.col('rating')).to_series().to_list(), dtype=tf.float32)
#     prva_user = df.columns.index('no genres listed')
#     poslednja_user = df.columns.index('Western')
#     if user_id is None:
#         X_user = tf.convert_to_tensor(df.select(df.columns[prva_user : poslednja_user + 1])
# .to_numpy(), dtype=tf.float32)
#     else:
#         X_user = tf.convert_to_tensor(df.filter(pl.col('userid') == user_id).select(df.columns[prva_user : poslednja_user + 1]).to_numpy(), dtype=tf.float32)
#     X_movie_df = df.select(['year','avg_rating', '#ratings_film'] + [col for col in df.columns if col.endswith('_right')])
#     movie_num = tf.convert_to_tensor(X_movie_df.select(['#ratings_film', 'year', 'avg_rating']).to_numpy(), dtype=tf.float32)
#     movie_cat = tf.convert_to_tensor(X_movie_df.select(pl.all().exclude(['#ratings_film', 'year', 'avg_rating'])).to_numpy(), dtype=tf.float32)
#     # Skaliranje (standardizacija) user i movie numeričkih karakteristika
#     user_mean = tf.reduce_mean(X_user, axis=0)
#     user_std = tf.math.reduce_std(X_user, axis=0)
#     X_user_scaled = (X_user - user_mean) / (user_std + 1e-7)
#     movie_mean = tf.reduce_mean(movie_num, axis=0)
#     movie_std = tf.math.reduce_std(movie_num, axis=0)
#     movie_num_scaled = (movie_num - movie_mean) / (movie_std)
#     X_movie_scaled = tf.concat([movie_num_scaled, movie_cat], axis=1)
#     # Target skaliranje na [-1, 1]
#     y_min = tf.reduce_min(y)
#     y_max = tf.reduce_max(y)
#     y_scaled = 2 * (y - y_min) / (y_max - y_min) - 1

#     # Vrati i transformatore za kasniju upotrebu
#     scalers = {"user_mean": user_mean, "user_std": user_std, "movie_mean": movie_mean, "movie_std": movie_std, "y_min": y_min, "y_max": y_max}

#     return X_user_scaled, X_movie_scaled, y_scaled, scalers

def inverse_transform_y(y_scaled, scalers):
    """
    Inverzna transformacija za y skaliran na [-1, 1].
    """
    y_min = scalers["y_min"]
    y_max = scalers["y_max"]
    y = (y_scaled + 1) * (y_max - y_min) / 2 + y_min
    return y

def inverse_transform_X_user(X_user_scaled, scalers):
    """
    Inverzna transformacija za X_user.
    """
    user_mean = scalers["user_mean"]
    user_std = scalers["user_std"]
    return X_user_scaled * (user_std + 1e-8) + user_mean

def inverse_transform_X_movie_num(X_movie_num_scaled, scalers):
    """
    Inverzna transformacija za numeričke karakteristike filma.
    """
    movie_mean = scalers["movie_mean"]
    movie_std = scalers["movie_std"]
    return X_movie_num_scaled * (movie_std + 1e-8) + movie_mean





In [480]:
prep()

<tf.Tensor: shape=(10000, 20), dtype=float32, numpy=
array([[-0.03860969, -0.67639303, -0.5786355 , ..., -0.6316742 ,
        -0.24418038, -0.15034235],
       [-0.03860969, -0.67639303, -0.5786355 , ..., -0.6316742 ,
        -0.24418038, -0.15034235],
       [-0.03860969, -0.67639303, -0.5786355 , ..., -0.6316742 ,
        -0.24418038, -0.15034235],
       ...,
       [-0.03860969, -0.67639303, -0.5786355 , ..., -0.6316742 ,
        -0.24418038, -0.15034235],
       [-0.03860969, -0.67639303, -0.5786355 , ..., -0.6316742 ,
        -0.24418038, -0.15034235],
       [-0.03860969, -0.67639303,  1.5425181 , ..., -0.6316742 ,
        -0.24418038, -0.15034235]], dtype=float32)>

In [459]:
def get_genres(movies):
    movies = movies.with_columns(pl.col("genres").str.split("|"))
    unique_genres = sorted(set(g for genre in movies["genres"] for g in genre))
    unique_genres[0] = unique_genres[0].replace('(', '').replace(')', '')

In [3]:
###train.py

# from prep import *
# from model import ColaborativeFiltering
from sqlalchemy import create_engine
import polars as pl
import os
import tensorflow as tf
from keras import layers, Input, regularizers, Model, optimizers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import joblib

# # engine = create_engine(f"postgresql+psycopg2://postgres:{os.getenv('POSTGRES_PASSWORD')}@localhost:5432/movie_recommendation")
# # conn = engine.connect()
# # movies = pl.read_database(query='SELECT * FROM raw.movies', connection=conn)
# # conn.close()

# total = 50000

# data = tf.data.Dataset.from_generator(
#     lambda: batch_generator(movies, batch_size=4096, total = total),
#     output_signature=(
#         (tf.TensorSpec(shape=(None, 20), dtype=tf.float32, name= 'X_user'),
#          tf.TensorSpec(shape=(None, 23), dtype=tf.float32, name='X_movie')),
#         tf.TensorSpec(shape=(None,), dtype=tf.float32, name='y')
#     )
# )

# (X_user_dev, X_movie_dev), y_dev = next(iter(data))
# training_batch = 4096
# train_data = data.unbatch().batch(training_batch).skip(1).prefetch(tf.data.AUTOTUNE).repeat()

# # import numpy as np
# # np.isnan(X_user_dev.numpy()).any()
# # np.isnan(X_movie_dev.numpy()).any()
# # np.isnan(y_dev.numpy()).any()



# # for batch in train_data.take(5):
# #     (X_user_batch, X_movie_batch), y_batch = batch
# #     print("X_user_batch shape:", X_user_batch.shape)
# #     print("X_movie_batch shape:", X_movie_batch.shape)
# #     print("y_batch shape:", y_batch.shape)
# #     print("X_user_batch:", X_user_batch.numpy()[0])
# #     print("X_movie_batch:", X_movie_batch.numpy()[0])
# #     print("y_batch:", y_batch.numpy()[0])

# for batch in train_data.take(15):
#     (X_user_batch, X_movie_batch), y_batch = batch
#     print(np.isnan(X_user_batch.numpy()).any(), np.isnan(X_movie_batch.numpy()).any(), np.isnan(y_batch.numpy()).any())
#     print(np.isinf(X_user_batch.numpy()).any(), np.isinf(X_movie_batch.numpy()).any(), np.isinf(y_batch.numpy()).any())
#     print("y_batch min/max:", y_batch.numpy().min(), y_batch.numpy().max())

# # for i, batch in enumerate(train_data.take(8)):
# #     (X_user_batch, X_movie_batch), y_batch = batch
# #     nan_mask = np.isnan(X_user_batch.numpy()).any(axis=1)
# #     if nan_mask.any():
# #         print(f"Batch {i+1} ima NaN u X_user_batch na indeksima:", np.where(nan_mask)[0])
# #         print("Redovi sa NaN:", X_user_batch.numpy()[nan_mask])
# #         # Opcionalno: pogledaj i y_batch[nan_mask], X_movie_batch[nan_mask]



# model = ColaborativeFiltering(20, 23 ,user_layers = [256, 128, 64],embedding=64, learning_rate=0.001)#, user_reg = [regularizers.l2(0.01), None, None])
# model.summary()
# callbacks = [EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True), ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)]
# history = model.fit(train_data, epochs=20, validation_data=([X_user_dev,  X_movie_dev], y_dev), callbacks=callbacks, steps_per_epoch = int(total // training_batch))


# model.save('model_proba.keras')
# joblib.dump(history, 'history_proba.pkl')
# joblib.dump(scalers, 'scalers_proba.pkl')

# # from tensorflow.keras.models import load_model
# # load_model('model_proba.keras')





In [495]:
user, movies, df = prep_pipeline(ratings,movies)

In [496]:
kolone = user.columns + movies.columns + ['rating']

In [504]:
f'AVG({i}) as mean_{i}, STDDEV({i}) as sd_{i},'

'AVG(rating) as mean_rating, STDDEV(rating) as sd_rating,'

In [515]:
q = [f'AVG({i}) as mean_{i}, STDDEV({i}) as sd_{i}' for i in kolone]

query = f'SELECT {", ".join(q)} FROM data_storage.ratings;'

In [516]:
query

'SELECT AVG(userid) as mean_userid, STDDEV(userid) as sd_userid, AVG(no genres listed) as mean_no genres listed, STDDEV(no genres listed) as sd_no genres listed, AVG(Action) as mean_Action, STDDEV(Action) as sd_Action, AVG(Adventure) as mean_Adventure, STDDEV(Adventure) as sd_Adventure, AVG(Animation) as mean_Animation, STDDEV(Animation) as sd_Animation, AVG(Children) as mean_Children, STDDEV(Children) as sd_Children, AVG(Comedy) as mean_Comedy, STDDEV(Comedy) as sd_Comedy, AVG(Crime) as mean_Crime, STDDEV(Crime) as sd_Crime, AVG(Documentary) as mean_Documentary, STDDEV(Documentary) as sd_Documentary, AVG(Drama) as mean_Drama, STDDEV(Drama) as sd_Drama, AVG(Fantasy) as mean_Fantasy, STDDEV(Fantasy) as sd_Fantasy, AVG(Film-Noir) as mean_Film-Noir, STDDEV(Film-Noir) as sd_Film-Noir, AVG(Horror) as mean_Horror, STDDEV(Horror) as sd_Horror, AVG(IMAX) as mean_IMAX, STDDEV(IMAX) as sd_IMAX, AVG(Musical) as mean_Musical, STDDEV(Musical) as sd_Musical, AVG(Mystery) as mean_Mystery, STDDEV(Myst

'AVG(userid) as mean_userid, STDDEV(userid) as sd_userid,AVG(no genres listed) as mean_no genres listed, STDDEV(no genres listed) as sd_no genres listed,AVG(Action) as mean_Action, STDDEV(Action) as sd_Action,AVG(Adventure) as mean_Adventure, STDDEV(Adventure) as sd_Adventure,AVG(Animation) as mean_Animation, STDDEV(Animation) as sd_Animation,AVG(Children) as mean_Children, STDDEV(Children) as sd_Children,AVG(Comedy) as mean_Comedy, STDDEV(Comedy) as sd_Comedy,AVG(Crime) as mean_Crime, STDDEV(Crime) as sd_Crime,AVG(Documentary) as mean_Documentary, STDDEV(Documentary) as sd_Documentary,AVG(Drama) as mean_Drama, STDDEV(Drama) as sd_Drama,AVG(Fantasy) as mean_Fantasy, STDDEV(Fantasy) as sd_Fantasy,AVG(Film-Noir) as mean_Film-Noir, STDDEV(Film-Noir) as sd_Film-Noir,AVG(Horror) as mean_Horror, STDDEV(Horror) as sd_Horror,AVG(IMAX) as mean_IMAX, STDDEV(IMAX) as sd_IMAX,AVG(Musical) as mean_Musical, STDDEV(Musical) as sd_Musical,AVG(Mystery) as mean_Mystery, STDDEV(Mystery) as sd_Mystery,AVG

In [466]:
###TRAIN.py

###
#samo za RZS
movies = pl.read_csv(r'https://raw.githubusercontent.com/BogdanSliskovic/ML/refs/heads/main/film/movies.csv')
movies.name = 'Movies'
ratings = pl.read_csv(r'https://raw.githubusercontent.com/BogdanSliskovic/ML/refs/heads/main/film/ratings_RZS.csv')
ratings.name = 'Ratings'

for df in [movies, ratings]:
  print(df.name , df.schema, df.shape)

user, movies_feat, df = prep_pipeline(ratings, movies)
X_user, X_movie, y, scalers = scale(df, user, movies_feat)

def prep_tf(user, movies, training_batch = 16):
  user, movies_feat, df = prep_pipeline(ratings, movies)
  X_user, X_movie, y, scalers = scale(df, user, movies_feat)
  data = (X_user, X_movie), y
  data = tf.data.Dataset.from_tensor_slices(data).batch(training_batch)
  return data
###
def split(data):
  (X_user_test, X_movie_test), y_test = next(iter(data))
  (X_user_dev, X_movie_dev), y_dev = next(iter(data.skip(1)))
  train_data = data.skip(2).prefetch(tf.data.AUTOTUNE).repeat()
  return ((X_user_test, X_movie_test, y_test), (X_user_dev, X_movie_dev, y_dev), train_data)

data = prep_tf(ratings, movies)
test_set, dev_set, train_data = split(data)

X_user_test, X_movie_test, y_test = test_set
X_user_dev, X_movie_dev, y_dev = dev_set


model = ColaborativeFiltering(20, 23 ,user_layers = [256, 128, 64],embedding=64, learning_rate=0.001)#, user_reg = [regularizers.l2(0.01), None, None])
callbacks = [EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True), ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)]
history = model.fit(train_data, epochs=5,validation_data = ((X_user_dev, X_movie_dev), y_dev), callbacks=callbacks, steps_per_epoch = int(10000/16))




Movies Schema([('movieid', Int64), ('title', String), ('genres', String)]) (87585, 3)
Ratings Schema([('userid', Int64), ('movieid', Int64), ('rating', Float64)]) (10000, 3)


SchemaError: invalid series dtype: expected `List`, got `str`

In [433]:
movies = pl.read_csv(r'https://raw.githubusercontent.com/BogdanSliskovic/ML/refs/heads/main/film/movies.csv')
movies.name = 'Movies'
ratings = pl.read_csv(r'https://raw.githubusercontent.com/BogdanSliskovic/ML/refs/heads/main/film/ratings_RZS.csv')
ratings.name = 'Ratings'

for df in [movies, ratings]:
  print(df.name , df.schema, df.shape)

user, movies_feat, df = prep_pipeline(ratings, movies)
X_user, X_movie, y, scalers = scale(df, user, movies_feat)


data = prep_tf(ratings, movies)


(X_user_test, X_movie_test, y_test), (X_user_dev, X_movie_dev, y_dev), train_data = split(data)


model = ColaborativeFiltering(20, 23 ,user_layers = [256, 128, 64],embedding=64, learning_rate=0.001)#, user_reg = [regularizers.l2(0.01), None, None])
callbacks = [EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True), ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6, verbose=1)]
history = model.fit(train_data, epochs=5,validation_data = ((X_user_dev, X_movie_dev), y_dev), callbacks=callbacks, steps_per_epoch = int(10000/16))



Movies Schema([('movieid', Int64), ('title', String), ('genres', String)]) (87585, 3)
Ratings Schema([('userid', Int64), ('movieid', Int64), ('rating', Float64)]) (10000, 3)
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 0.0861 - mae: 0.2184 - mse: 0.0861 - val_loss: 0.0262 - val_mae: 0.1290 - val_mse: 0.0262 - learning_rate: 0.0010
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0291 - mae: 0.1271 - mse: 0.0291 - val_loss: 0.0176 - val_mae: 0.1037 - val_mse: 0.0176 - learning_rate: 0.0010
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0212 - mae: 0.1041 - mse: 0.0212 - val_loss: 0.0140 - val_mae: 0.0875 - val_mse: 0.0140 - learning_rate: 0.0010
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 0.0169 - mae: 0.0897 - mse: 0.0169 - val_loss: 0.0119 - val_mae: 0.0822 - val_mse: 0.0119 - learning_rate: 0.001

In [425]:
def split(data):
    test_data = list(data.take(10))
    dev_data = list(data.skip(10).take(10))
    train_data = data.skip(20).prefetch(tf.data.AUTOTUNE).repeat()

    X_user_test = tf.concat([b[0][0] for b in test_data], axis=0)
    X_movie_test = tf.concat([b[0][1] for b in test_data], axis=0)
    y_test = tf.concat([b[1] for b in test_data], axis=0)

    X_user_dev = tf.concat([b[0][0] for b in dev_data], axis=0)
    X_movie_dev = tf.concat([b[0][1] for b in dev_data], axis=0)
    y_dev = tf.concat([b[1] for b in dev_data], axis=0)

    return ((X_user_test, X_movie_test, y_test),
            (X_user_dev, X_movie_dev, y_dev),
            train_data)

In [431]:
(X_user_test, X_movie_test, y_test), (X_user_dev, X_movie_dev, y_dev), train_data = split(data)

In [430]:
X_user_test.shape, X_movie_test.shape, y_test.shape

(TensorShape([160, 20]), TensorShape([160, 23]), TensorShape([160, 1]))

In [416]:
test_data = data.take(10)  # uzmi prvih 10 batch-eva

# Pretvori u listu i raspakuj
X_user_test_list, X_movie_test_list, y_test_list = [], [], []

for (X_user, X_movie), y in test_data:
    X_user_test_list.append(X_user)
    X_movie_test_list.append(X_movie)
    y_test_list.append(y)

# Spoji sve batcheve u jedan tensor
X_user_test = tf.concat(X_user_test_list, axis=0)
X_movie_test = tf.concat(X_movie_test_list, axis=0)
y_test = tf.concat(y_test_list, axis=0)

In [419]:
X_movie_test

<tf.Tensor: shape=(160, 23), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        , ..., -0.8354533 ,
         0.09542876, -2.0351791 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.8354533 ,
         0.12262866,  0.6113578 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.23720531,
         0.17702846,  0.4790311 ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.08764333,
        -0.04057075, -1.0427278 ],
       [ 0.        ,  0.        ,  1.        , ...,  2.1557865 ,
         0.12262866,  0.8319023 ],
       [ 0.        ,  0.        ,  0.        , ..., -0.23720531,
         0.027429  ,  0.4790311 ]], dtype=float32)>

In [385]:
test_data

<_TakeDataset element_spec=((TensorSpec(shape=(None, 20), dtype=tf.float32, name=None), TensorSpec(shape=(None, 23), dtype=tf.float32, name=None)), TensorSpec(shape=(None, 1), dtype=tf.float16, name=None))>

In [366]:
### inference.py
m_net = model.movie_net
m_embed = m_net.predict(X_movie)

X_user_id, maska ,y_id = scale(df, user, movies_feat, user_id = 28)
u_id = X_user_id[0,0]
X_user_id = X_user_id[0:1,1:]  ##SVAKI RED JE ISTI, A PRVA KOL JE USER_ID, MORA 0:1 DA BI SHAPE BIO (1,-1)

u_net = model.user_net
u_embed = u_net.predict(X_user_id)
print(u_embed.shape)

pred = tf.linalg.matmul(u_embed, m_embed, transpose_b= True)
pred_negledani = tf.boolean_mask(pred, ~maska, axis = 1)
val, idx = tf.math.top_k(pred_negledani, k = 10)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
(1, 64)


(10000, 64)

In [359]:
y_id

<tf.Tensor: shape=(4, 1), dtype=float16, numpy=
array([[-0.3335],
       [ 0.5557],
       [ 0.333 ],
       [ 0.7773]], dtype=float16)>

In [343]:
tf.boolean_mask(pred, maska, axis = 1)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[2.6139293, 4.6882124, 5.3946023, 5.656996 ]], dtype=float32)>

In [353]:
ratings.filter(pl.col('userid') == 28)

userid,movieid,rating
i64,i64,f64
28,165549,4.5
28,47099,3.5
28,4926,2.0
28,6268,4.0


In [342]:
user_id

88647

In [341]:
pred_negledani

<tf.Tensor: shape=(1, 9996), dtype=float32, numpy=
array([[ 1.0678334 , -7.1745405 , -0.10552454, ...,  5.006638  ,
         5.900188  , -2.6300309 ]], dtype=float32)>

In [324]:
pred = tf.linalg.matmul(u_embed, m_embed, transpose_b= True)

In [326]:
tf.mask

AttributeError: module 'tensorflow' has no attribute 'mask'

In [312]:
X_user_id, X_movie_id,maska, y_id, _ = scale(df, user, movies_feat, user_id = 28)
u_id = X_user_id[0,0]
# m_id =
X_user_id = X_user_id[0:1,1:]  ##SVAKI RED JE ISTI, A PRVA KOL JE USER_ID, MORA 0:1 DA BI SHAPE BIO (1,-1)

u_net = model.user_net
u_embed = u_net.predict(X_user_id,0)
print(u_embed.shape)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
(1, 64)


In [328]:
 tf.boolean_mask(pred, ~maska, axis = 1)

<tf.Tensor: shape=(1, 9998), dtype=float32, numpy=
array([[ 2.6139293,  4.6882124,  5.3946023, ...,  5.006638 ,  5.900188 ,
        -2.6300309]], dtype=float32)>

In [None]:
152948

In [315]:
X_user_id, X_movie_id,maska_152948, y_id, _ = scale(df, user, movies_feat, user_id = 152948)
maska_152948

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([False, False, False, ..., False, False, False])>

In [317]:
X_user_id, X_movie_id,maska_28, y_id, _ = scale(df, user, movies_feat, user_id = 28)
maska_28

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([ True,  True,  True, ..., False, False, False])>

In [318]:
tf.reduce_sum(tf.cast(tf.logical_not(maska_28), tf.int32))

<tf.Tensor: shape=(), dtype=int32, numpy=9996>

In [319]:
tf.reduce_sum(tf.cast(tf.logical_not(maska_152948), tf.int32))

<tf.Tensor: shape=(), dtype=int32, numpy=9998>

In [322]:
ratings.filter(pl.col('userid') == 28)

userid,movieid,rating
i64,i64,f64
28,165549,4.5
28,47099,3.5
28,4926,2.0
28,6268,4.0


In [323]:
ratings.filter(pl.col('userid') == 152948)

userid,movieid,rating
i64,i64,f64
152948,592,4.0
152948,78266,3.5


In [105]:
pred_negledani = tf.boolean_mask(pred, maska_movie, axis = 1)

In [103]:
pred

<tf.Tensor: shape=(1, 10000), dtype=float32, numpy=
array([[ 2.6139293,  4.6882124,  5.3946023, ...,  5.006638 ,  5.900188 ,
        -2.6300309]], dtype=float32)>

In [133]:
X_user_id, X_movie_id, maska_movie, y_id, _ = scale(df, user, movies_feat, user_id = 36683)
maska_movie = tf.reduce_all(tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop), 1) ##Vrati true samo gde su sve kolone true
negledani_filmovi = tf.boolean_mask(X_movie_id, maska_movie)

In [144]:
X_user_id_scaled,X_movie_id, maska_movie, y_scaled, scalers = scale(df, user, movies_feat, user_id = 36683)

In [146]:
maska_movie

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([ True,  True,  True, ...,  True,  True,  True])>

In [139]:
tf.reduce_all(tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop), 1)

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([False, False, False, ...,  True,  True,  True])>

In [138]:
tf.reduce_sum(tf.cast(tf.logical_not(maska_movie), tf.int32))

<tf.Tensor: shape=(), dtype=int32, numpy=7>

In [135]:
tf.boolean_mask(X_movie_id, maska_movie)

<tf.Tensor: shape=(9993, 24), dtype=float32, numpy=
array([[ 4.92600000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01,  9.54287574e-02, -2.03517914e+00],
       [ 6.26800000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01,  1.22628659e-01,  6.11357808e-01],
       [ 4.70990000e+04,  0.00000000e+00,  0.00000000e+00, ...,
        -2.37205312e-01,  1.77028462e-01,  4.79031086e-01],
       ...,
       [ 1.21300000e+03,  0.00000000e+00,  0.00000000e+00, ...,
         1.25841451e+00, -4.05707508e-02,  1.09655643e+00],
       [ 5.17700000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01, -6.93368375e-01,  1.27299201e+00],
       [ 4.30600000e+03,  0.00000000e+00,  0.00000000e+00, ...,
         1.10885262e+00,  1.09028704e-01,  8.94915521e-01]], dtype=float32)>

In [130]:
X_movie_id

<tf.Tensor: shape=(10000, 24), dtype=float32, numpy=
array([[ 4.92600000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01,  9.54287574e-02, -2.03517914e+00],
       [ 6.26800000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01,  1.22628659e-01,  6.11357808e-01],
       [ 4.70990000e+04,  0.00000000e+00,  0.00000000e+00, ...,
        -2.37205312e-01,  1.77028462e-01,  4.79031086e-01],
       ...,
       [ 1.21300000e+03,  0.00000000e+00,  0.00000000e+00, ...,
         1.25841451e+00, -4.05707508e-02,  1.09655643e+00],
       [ 5.17700000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01, -6.93368375e-01,  1.27299201e+00],
       [ 4.30600000e+03,  0.00000000e+00,  0.00000000e+00, ...,
         1.10885262e+00,  1.09028704e-01,  8.94915521e-01]], dtype=float32)>

In [141]:
X_user_id_scaled[0][0]

<tf.Tensor: shape=(), dtype=float32, numpy=36683.0>

In [142]:
ratings.filter(pl.col('userid') == 36683)

userid,movieid,rating
i64,i64,f64
36683,3741,4.0
36683,4103,3.5
36683,129659,3.5
36683,91707,3.5


In [40]:
X_user_id[0:1,1:]

<tf.Tensor: shape=(1, 20), dtype=float32, numpy=
array([[-0.03860969, -0.67639303, -0.5786355 , -0.28050056, -0.31071845,
         0.95101756, -0.46632147, -0.13443817,  0.89678454, -0.36559492,
        -0.10993176, -0.31134704, -0.22322536,  2.5499384 , -0.31943297,
         2.2691705 , -0.4885134 , -0.6316742 , -0.24418038, -0.15034235]],
      dtype=float32)>

In [41]:
X_movie_id[:,0]

<tf.Tensor: shape=(10000,), dtype=float32, numpy=array([ 4926.,  6268., 47099., ...,  1213.,  5177.,  4306.], dtype=float32)>

In [152]:
user_id = 36683

maska = tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)
X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)  #prva kolona je userid
movie_ids_to_drop = tf.boolean_mask(X_movie_scaled_id[:,0], maska)
maska_movie = tf.reduce_all(tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop), 1) ##Vrati true samo gde su sve kolone true
# negledani_filmovi = tf.boolean_mask(X_movie_id, maska_movie)
y_scaled = tf.boolean_mask(y_scaled, maska)

NameError: name 'X_movie_scaled_id' is not defined

In [156]:
X_user_id

<tf.Tensor: shape=(4, 21), dtype=float32, numpy=
array([[ 3.6683000e+04, -3.8609695e-02,  1.3148389e+00,  1.5425181e+00,
        -2.8050056e-01, -3.1071845e-01, -7.2803879e-01,  2.1733897e+00,
        -1.3443817e-01,  9.6070606e-01, -3.6559492e-01, -1.0993176e-01,
        -3.1134704e-01, -2.2322536e-01, -1.9584285e-01, -3.1943297e-01,
         1.9271525e+00, -4.8851341e-01,  1.6873536e+00,  3.5186434e+00,
        -1.5034235e-01],
       [ 3.6683000e+04, -3.8609695e-02,  1.3148389e+00,  1.5425181e+00,
        -2.8050056e-01, -3.1071845e-01, -7.2803879e-01,  2.1733897e+00,
        -1.3443817e-01,  9.6070606e-01, -3.6559492e-01, -1.0993176e-01,
        -3.1134704e-01, -2.2322536e-01, -1.9584285e-01, -3.1943297e-01,
         1.9271525e+00, -4.8851341e-01,  1.6873536e+00,  3.5186434e+00,
        -1.5034235e-01],
       [ 3.6683000e+04, -3.8609695e-02,  1.3148389e+00,  1.5425181e+00,
        -2.8050056e-01, -3.1071845e-01, -7.2803879e-01,  2.1733897e+00,
        -1.3443817e-01,  9.6070606e-0

In [261]:
y = tf.convert_to_tensor(df.select(pl.col('rating')).to_numpy(), dtype=tf.float16)

prva_user = df.columns.index('no genres listed')
poslednja_user = df.columns.index('Western')
###prva kolona u X_user_ud je userid!!!, trebace za preporuke, za treniranje koristiti X_user
X_user_id = tf.convert_to_tensor(df.select(['userid'] + df.columns[prva_user : poslednja_user + 1]).to_numpy(), dtype=tf.float32)
X_movie_df = df.select(['movieid','year','avg_rating', '#ratings_film'] + [col for col in df.columns if col.endswith('_right')])
movie_num = tf.convert_to_tensor(X_movie_df.select(['#ratings_film', 'year', 'avg_rating']).to_numpy(), dtype=tf.float32)
###u movie_cat ima i movieid
movie_cat = tf.convert_to_tensor(X_movie_df.select(pl.all().exclude(['#ratings_film', 'year', 'avg_rating'])).to_numpy(), dtype=tf.float32)
# Standardizacija user i movie numeričkih
X_user = X_user_id[:, 1:]
user_mean = tf.reduce_mean(X_user, axis=0)
user_std = tf.math.reduce_std(X_user, axis=0)
X_user_scaled = (X_user - user_mean) / (user_std+ 1e-8)
X_user_id_scaled = tf.concat([X_user_id[:, :1], X_user_scaled], axis=1)  # Skalirano sa ID kolonom
movie_mean = tf.reduce_mean(movie_num, axis=0)
movie_std = tf.math.reduce_std(movie_num, axis=0)
movie_num_scaled = (movie_num - movie_mean) / (movie_std)
X_movie_scaled = tf.concat([movie_cat[:,1:], movie_num_scaled], axis=1)
# Target skaliranje na [-1, 1]
y_scaled = 2 * (y - tf.reduce_min(y)) / (tf.reduce_max(y) - tf.reduce_min(y)) - 1
scalers = {"user_mean": user_mean, "user_std": user_std,"movie_mean": movie_mean,"movie_std": movie_std, "y_min": tf.reduce_min(y), "y_max": tf.reduce_max(y)}


In [230]:
ratings

userid,movieid,rating
i64,i64,f64
36683,3741,4.0
16712,4082,2.5
131952,116161,2.5
147475,8360,4.0
131743,42004,4.0
…,…,…
40967,1007,3.5
16508,196,3.0
152948,78266,3.5
144987,182529,3.0


In [231]:
ratings.filter(pl.col('userid') == 152948)

userid,movieid,rating
i64,i64,f64
152948,592,4.0
152948,78266,3.5


In [249]:
ratings['userid'].value_counts()

userid,count
i64,u32
19293,1
59851,1
16507,1
168363,1
97713,1
…,…
172718,1
5425,1
92626,1
48975,1


In [250]:

user_id = 88647

In [251]:
maska = tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)

In [252]:
tf.reduce_sum(tf.cast(tf.logical_not(maska), tf.int32))

<tf.Tensor: shape=(), dtype=int32, numpy=2>

In [253]:
X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)

In [254]:
X_movie_scaled_id = tf.concat([movie_cat, movie_num_scaled], axis=1)  #prva kolona je movieid
X_movie_scaled_id

<tf.Tensor: shape=(10000, 24), dtype=float32, numpy=
array([[ 4.92600000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01,  9.54287574e-02, -2.03517914e+00],
       [ 6.26800000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01,  1.22628659e-01,  6.11357808e-01],
       [ 4.70990000e+04,  0.00000000e+00,  0.00000000e+00, ...,
        -2.37205312e-01,  1.77028462e-01,  4.79031086e-01],
       ...,
       [ 1.21300000e+03,  0.00000000e+00,  0.00000000e+00, ...,
         1.25841451e+00, -4.05707508e-02,  1.09655643e+00],
       [ 5.17700000e+03,  0.00000000e+00,  0.00000000e+00, ...,
        -8.35453272e-01, -6.93368375e-01,  1.27299201e+00],
       [ 4.30600000e+03,  0.00000000e+00,  0.00000000e+00, ...,
         1.10885262e+00,  1.09028704e-01,  8.94915521e-01]], dtype=float32)>

In [255]:
movie_ids_to_drop = tf.boolean_mask(X_movie_scaled_id[:,0], maska)

ValueError: Shapes (10000,) and (2,) are incompatible

In [193]:
ratings.filter(pl.col('userid') == user_id)

userid,movieid,rating
i64,i64,f64
152948,592,4.0
152948,78266,3.5


In [198]:
tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop)

<tf.Tensor: shape=(10000, 4), dtype=bool, numpy=
array([[False,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True, False,  True],
       ...,
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])>

In [207]:
X_movie_id[:,0]

<tf.Tensor: shape=(10000,), dtype=float32, numpy=array([ 4926.,  6268., 47099., ...,  1213.,  5177.,  4306.], dtype=float32)>

In [206]:
movie_ids_to_drop

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([  4926.,   6268.,  47099., 165549.], dtype=float32)>

In [205]:
tf.where(X_movie_id[:,0],movie_ids_to_drop)

ValueError: x and y must both be non-None or both be None.

In [196]:
X_movie_id[:,0:1]

<tf.Tensor: shape=(10000, 1), dtype=float32, numpy=
array([[ 4926.],
       [ 6268.],
       [47099.],
       ...,
       [ 1213.],
       [ 5177.],
       [ 4306.]], dtype=float32)>

In [None]:
maska = tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)
X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)  #prva kolona je userid
X_movie_scaled_id = tf.concat([movie_cat, movie_num_scaled], axis=1)  #prva kolona je movieid
movie_ids_to_drop = tf.boolean_mask(X_movie_scaled_id[:,0], maska)
maska_movie = tf.reduce_all(tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop), 1) ##Vrati true samo gde su sve kolone true
# negledani_filmovi = tf.boolean_mask(X_movie_id, maska_movie)
y_scaled = tf.boolean_mask(y_scaled, maska)

In [209]:
match_mask = tf.reduce_any(tf.equal(tf.expand_dims(X_movie_id[:,0], 1), movie_ids_to_drop), axis=1)

# Izbroj koliko puta je True (tj. koliko se puta pojavio bilo koji od tih ID-jeva)
count = tf.reduce_sum(tf.cast(match_mask, tf.int32))

print("Broj pojavljivanja ID-jeva koje treba izbaciti:", count.numpy())

Broj pojavljivanja ID-jeva koje treba izbaciti: 8


In [210]:
match_mask

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([ True,  True,  True, ..., False, False, False])>

In [300]:
y = tf.convert_to_tensor(df.select(pl.col('rating')).to_numpy(), dtype=tf.float16)

prva_user = df.columns.index('no genres listed')
poslednja_user = df.columns.index('Western')
###prva kolona u X_user_ud je userid!!!, trebace za preporuke, za treniranje koristiti X_user
X_user_id = tf.convert_to_tensor(df.select(['userid'] + df.columns[prva_user : poslednja_user + 1]).to_numpy(), dtype=tf.float32)
X_movie_df = df.select(['movieid','year','avg_rating', '#ratings_film'] + [col for col in df.columns if col.endswith('_right')])
movie_num = tf.convert_to_tensor(X_movie_df.select(['#ratings_film', 'year', 'avg_rating']).to_numpy(), dtype=tf.float32)
###u movie_cat ima i movieid
movie_cat = tf.convert_to_tensor(X_movie_df.select(pl.all().exclude(['#ratings_film', 'year', 'avg_rating'])).to_numpy(), dtype=tf.float32)
# Standardizacija user i movie numeričkih
X_user = X_user_id[:, 1:]
user_mean = tf.reduce_mean(X_user, axis=0)
user_std = tf.math.reduce_std(X_user, axis=0)
X_user_scaled = (X_user - user_mean) / (user_std+ 1e-8)
X_user_id_scaled = tf.concat([X_user_id[:, :1], X_user_scaled], axis=1)  # Skalirano sa ID kolonom
movie_mean = tf.reduce_mean(movie_num, axis=0)
movie_std = tf.math.reduce_std(movie_num, axis=0)
movie_num_scaled = (movie_num - movie_mean) / (movie_std)
X_movie_scaled = tf.concat([movie_cat[:,1:], movie_num_scaled], axis=1)
# Target skaliranje na [-1, 1]
y_scaled = 2 * (y - tf.reduce_min(y)) / (tf.reduce_max(y) - tf.reduce_min(y)) - 1
scalers = {"user_mean": user_mean, "user_std": user_std,"movie_mean": movie_mean,"movie_std": movie_std, "y_min": tf.reduce_min(y), "y_max": tf.reduce_max(y)}


In [301]:
tf.expand_dims(X_user_id_scaled[:, 0], 1)

<tf.Tensor: shape=(10000, 1), dtype=float32, numpy=
array([[2.80000e+01],
       [2.80000e+01],
       [2.80000e+01],
       ...,
       [2.00809e+05],
       [2.00875e+05],
       [2.00895e+05]], dtype=float32)>

In [303]:
maska = tf.reduce_any(tf.equal(X_user_id_scaled[:, 0:1], tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)

In [304]:
X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)[0]  #prva kolona je userid


In [305]:
X_user_id_scaled

<tf.Tensor: shape=(21,), dtype=float32, numpy=
array([ 8.8647000e+04, -3.8609695e-02,  1.5993006e+00,  1.8455400e+00,
       -2.8050056e-01, -3.1071845e-01, -7.2803879e-01, -4.6632147e-01,
       -1.3443817e-01, -8.9301729e-01, -3.6559492e-01, -1.0993176e-01,
       -3.1134704e-01, -2.2322536e-01, -1.9584285e-01, -3.1943297e-01,
       -4.6697444e-01,  2.1798651e+00,  1.6873536e+00, -2.4418038e-01,
       -1.5034235e-01], dtype=float32)>

In [306]:
tf.reduce_any(tf.equal(X_movie_id[:,0:1], tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([False, False, False, ..., False, False, False])>

In [309]:
tf.boolean_mask(X_movie_id[:,0:1], ~maska)

<tf.Tensor: shape=(9998, 1), dtype=float32, numpy=
array([[ 4926.],
       [ 6268.],
       [47099.],
       ...,
       [ 1213.],
       [ 5177.],
       [ 4306.]], dtype=float32)>

In [307]:
~maska

<tf.Tensor: shape=(10000,), dtype=bool, numpy=array([ True,  True,  True, ...,  True,  True,  True])>

In [None]:
maska = tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)
X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)  #prva kolona je userid
X_movie_scaled_id = tf.concat([movie_cat, movie_num_scaled], axis=1)  #prva kolona je movieid
movie_ids_to_drop = tf.boolean_mask(X_movie_scaled_id[:,0], maska)
maska_movie = tf.reduce_all(tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop), 1) ##Vrati true samo gde su sve kolone true
# negledani_filmovi = tf.boolean_mask(X_movie_id, maska_movie)
y_scaled = tf.boolean_mask(y_scaled, maska)


In [264]:
tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)

<tf.Tensor: shape=(2,), dtype=bool, numpy=array([ True,  True])>

In [271]:
tf.equal(X_user_id_scaled, tf.constant(user_id, dtype=X_user_id_scaled.dtype))

<tf.Tensor: shape=(2, 21), dtype=bool, numpy=
array([[ True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False],
       [ True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False]])>

In [272]:
X_user_id_scaled

<tf.Tensor: shape=(2, 21), dtype=float32, numpy=
array([[ 8.8647000e+04, -3.8609695e-02,  1.5993006e+00,  1.8455400e+00,
        -2.8050056e-01, -3.1071845e-01, -7.2803879e-01, -4.6632147e-01,
        -1.3443817e-01, -8.9301729e-01, -3.6559492e-01, -1.0993176e-01,
        -3.1134704e-01, -2.2322536e-01, -1.9584285e-01, -3.1943297e-01,
        -4.6697444e-01,  2.1798651e+00,  1.6873536e+00, -2.4418038e-01,
        -1.5034235e-01],
       [ 8.8647000e+04, -3.8609695e-02,  1.5993006e+00,  1.8455400e+00,
        -2.8050056e-01, -3.1071845e-01, -7.2803879e-01, -4.6632147e-01,
        -1.3443817e-01, -8.9301729e-01, -3.6559492e-01, -1.0993176e-01,
        -3.1134704e-01, -2.2322536e-01, -1.9584285e-01, -3.1943297e-01,
        -4.6697444e-01,  2.1798651e+00,  1.6873536e+00, -2.4418038e-01,
        -1.5034235e-01]], dtype=float32)>

In [263]:
maska = tf.reduce_any(tf.equal(tf.expand_dims(X_user_id_scaled[:, 0], 1), tf.constant(user_id, dtype=X_user_id_scaled.dtype)), axis=1)
X_user_id_scaled = tf.boolean_mask(X_user_id_scaled, maska)  #prva kolona je userid
X_movie_scaled_id = tf.concat([movie_cat, movie_num_scaled], axis=1)  #prva kolona je movieid
movie_ids_to_drop = tf.boolean_mask(X_movie_scaled_id[:,0], maska)
maska_movie = tf.reduce_all(tf.math.not_equal(X_movie_id[:,0:1], movie_ids_to_drop), 1) ##Vrati true samo gde su sve kolone true
# negledani_filmovi = tf.boolean_mask(X_movie_id, maska_movie)
y_scaled = tf.boolean_mask(y_scaled, maska)


ValueError: Shapes (10000,) and (2,) are incompatible

In [260]:
tf.boolean_mask(X_movie_scaled_id[:,0:1], maska)

ValueError: Shapes (10000,) and (0,) are incompatible

In [440]:
movies = movies.with_columns(pl.col("genres").str.split("|"))
unique_genres = sorted(set(g for genre in movies["genres"] for g in genre))
unique_genres[0] = unique_genres[0].replace('(', '').replace(')', '')

In [457]:
unique_genres

['no genres listed',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']