# Deep Learning Algorithms for Movie Recommendations
## Filipe Gonçalves, 98083
## Pedro Lopes, 97827

In [128]:
from __future__ import print_function
import numpy as np
import pandas as pd
import collections
from IPython import display
import sklearn
import sklearn.manifold
import tensorflow as tf

In [129]:
# Read the users and ratings datasets

users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./ml-100k/u.user', sep='|', names=users_cols)

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('./ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
    
movies_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genre_cols
movies = pd.read_csv('./ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')
#movies = movies.drop(columns=['release_date', 'video_release_date', 'imdb_url'])

#print(users)
#print(ratings)
#print(movies)

In [130]:
movies['year'] = movies['release_date'].apply(lambda x: str(x).split())

In [131]:
def split_dataframe(df, holdout_fraction=0.1):
    
    test = df.sample(frac=0.1, replace=False)
    train = df[~df.index.isin(test.index)]

    return train, test

In [132]:
# Group the rated movies with the user id
# https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/

rated_movies = (ratings[["user_id", "movie_id"]]
                .groupby("user_id", as_index=False)
                .aggregate(lambda x: list(x)))

In [133]:
# Creating the individual tensors for the batch input of the Neural Network model

yearsDict = {
    movie: year for movie, year in zip(movies["movie_id"], movies["year"])
}
genreDict = {}
active = []

In [134]:
# Getting the available genre for that particular movie

def foo_(movies, genres):
    
    def all_genres(gs):
        active = [genre for genre, g in zip(genres, gs) if g==1]
        return '-'.join(active)
    
    movies['all_genres'] = [all_genres(gs) for gs in zip(*[movies[genre] for genre in genres])]

    
foo_(movies, genre_cols)
genres_dict = {movie: genres.split('-') for movie, genres in zip(movies["movie_id"], movies["all_genres"])}

In [135]:
#or movie_id in movie_ids for x in genres_dict[movie_id]

movie = []
year = []
genre = []
label = []

for movie_ids in ratings['movie_id'].values:
    movie.append(movie_ids)
    genre.append([x for x in genres_dict[movie_ids]])
    
#genre

In [148]:
def make_batch(ratings, batch_size):

    # Function to fill null values to form sparse tensor
    def pad(x, fill):
        return pd.DataFrame.from_dict(x).fillna(fill).values

    movie = []
    year = []
    genre = []
    label = []

      # Fill the input with 4 features
    for movie_ids in ratings["movie_id"].values:
        movie.append([chr(movie_id) for movie_id in movie_ids])
        genre.append([x for movie_id in movie_ids for x in genres_dict[movie_id]])
        year.append([yearsDict[movie_id][0] for movie_id in movie_ids])
        label.append([chr(movie_id) for movie_id in movie_ids])

    # Creating the input tensors
    features = {
        "movie_id": pad(movie, ""),
        "year": pad(year, ""),
        "genre": pad(genre, ""),
        "label": pad(label, "")
    }
  
    # Creating a single batch for each iteraton
    batch = (
        tf.data.Dataset.from_tensor_slices(features)
        .shuffle(1000)
        .repeat()
        .batch(batch_size)
        # one_shot_iterator only in TF1.X
        #.make_one_shot_iterator()
        #.get_next()
        )
  
    return batch

In [137]:
def softmax_loss(user_embeddings, movie_embeddings, labels):
  
    # Verify that the embddings have compatible dimensions
    user_emb_dim = user_embeddings.shape[1].value
    movie_emb_dim = movie_embeddings.shape[1].value
    logits = tf.matmul(user_embeddings, movie_embeddings, transpose_b=True)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=labels))
    
    return loss

In [138]:
# Creating the training class

class Model(object):

    def __init__(self, embedding_vars, loss, metrics=None):
        self.embedding_vars = embedding_vars
        self.loss = loss
        self.metrics = metrics
        self.embeddings = {k: None for k in embedding_vars}
        self.session = None

    def embeddings(self):
        return self.embeddings

    def train(self, num_iterations=100, learning_rate=1.0, plot_results=True, optimizer=tf.keras.optimizers.SGD()):
        with self.loss.graph.as_default():

            # Minimize loss function
            train_op = optimizer(learning_rate).minimize(self.loss)

            # Initialise the operation
            local_init_op = tf.group(tf.variables_initializer(optimizer(learning_rate).variables()), tf.local_variables_initializer())

            if self.session is None:
                self.session = tf.Session()

            with self.session.as_default():
                self.session.run([tf.global_variables_initializer(), tf.tables_initializer()])

        with self.session.as_default():
            local_init_op.run()
            iterations = []
            metrics = {}
            metrics_vals = {}

        # Train and append results.
        for i in range(num_iterations + 1):
            _, results = self.session.run((train_op, metrics))
            if i == num_iterations:
                for metric_val, result in zip(metrics_vals, results):
                    # Embeddings are u and k respectively
                    for k, v in result.items():
                        metric_val[k].append(v)

        for k, v in self.embedding_vars.items():
            self.embeddings[k] = v.eval()

In [154]:
def build_softmax_model(rated_movies, embedding_cols, hidden_dims):
    
    def create_network(features):
        # Create a bag-of-words embedding for each sparse feature.
        inputs = tf.compat.v1.feature_column.input_layer(features, embedding_cols)
        # Hidden layers.
        input_dim = inputs.shape[1].value
        for i, output_dim in enumerate(hidden_dims):
            w = tf.get_variable(
                "hidden%d_w_" % i, shape=[input_dim, output_dim],
                initializer=tf.truncated_normal_initializer(
                    stddev=1./np.sqrt(output_dim))) / 10.
            
            outputs = tf.matmul(inputs, w)
            input_dim = output_dim
            inputs = outputs
        return outputs

    train_rated_movies, test_rated_movies = split_dataframe(rated_movies)
    train_batch = make_batch(train_rated_movies, 200)
    test_batch = make_batch(test_rated_movies, 100)

    with tf.compat.v1.variable_scope("model", reuse=False):
        # Train
        train_user_embeddings = create_network(train_batch)
        train_labels = select_random(train_batch["label"])
    with tf.compat.v1.variable_scope("model", reuse=True):
        # Test
        test_user_embeddings = create_network(test_batch)
        test_labels = select_random(test_batch["label"])
        movie_embeddings = tf.get_variable(
            "input_layer/movie_id_embedding/embedding_weights")

    test_loss = softmax_loss(
      test_user_embeddings, movie_embeddings, test_labels)
    train_loss = softmax_loss(
      train_user_embeddings, movie_embeddings, train_labels)
    _, test_precision_at_10 = tf.metrics.precision_at_k(
      labels=test_labels,
      predictions=tf.matmul(test_user_embeddings, movie_embeddings, transpose_b=True),
      k=10)

    metrics = (
      {"train_loss": train_loss, "test_loss": test_loss},
      {"test_precision_at_10": test_precision_at_10}
    )
    embeddings = {"movie_id": movie_embeddings}
    return CFModel(embeddings, train_loss, metrics)

In [155]:
# Create feature embedding columns
def make_embedding_col(key, embedding_dim):
    categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(key=key, vocabulary_list=list(set(movies[key].values)), num_oov_buckets=0)
    return tf.feature_column.embedding_column(categorical_column=categorical_col, dimension=embedding_dim, combiner='mean')

# Initialise graph for training 
with tf.Graph().as_default():
    softmax_model = build_softmax_model(
                  rated_movies,  # Input
                  embedding_cols=[ # Embeddings learned
                  make_embedding_col("movie_id", 35)],
                  hidden_dims=[35]) # Dimension of the embedding

# Perform training 
softmax_model.train(learning_rate=8., num_iterations=3000, optimizer=tf.train.AdagradOptimizer)

AttributeError: 'BatchDataset' object has no attribute 'values'