In [2]:
import tensorflow as tf
from tensorflow import feature_column
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# category columns
genres_list = ['Sci-Fi', 'Fantasy', "Children's", 'Horror', 'Action', 'Mystery', 'Film-Noir', 'Musical', 'Crime', 'Adventure', 'Animation', 'Comedy', 'Drama', 'Romance', 'Documentary', 'Thriller', 'Western', 'War']
genres = feature_column.categorical_column_with_vocabulary_list('genres', genres_list)

age_list = [1, 18, 25, 35, 45, 50, 56]
age = feature_column.categorical_column_with_vocabulary_list('age', age_list)

gender = feature_column.categorical_column_with_vocabulary_list('gender', [0, 1])

occupation_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
occupation = feature_column.categorical_column_with_vocabulary_list('occupation', occupation_list)

user_cate = feature_column.categorical_column_with_vocabulary_list('userId', np.arange(1, 6041))
movie_cate = feature_column.categorical_column_with_vocabulary_list('movieId', np.arange(1, 3884))


In [4]:
# continuous columns
user_cont = feature_column.numeric_column('userId', dtype=tf.int32)
movie_cont = feature_column.numeric_column('movieId', dtype=tf.int32)
year_cont = feature_column.numeric_column('year', dtype=tf.int32)
age_cont = feature_column.numeric_column('age', dtype=tf.int32)
gender_cont = feature_column.numeric_column('gender', dtype=tf.int32)

In [5]:
# wide columns
crossed_columns = [
    feature_column.crossed_column([genres, age], hash_bucket_size=1000),
    feature_column.crossed_column([genres, occupation], hash_bucket_size=1000),
]
wide_columns = [user_cate, movie_cate, gender, occupation, crossed_columns[0], crossed_columns[1]]


In [6]:
# deep columns
deep_columns = [
    feature_column.embedding_column(user_cont, dimension=32),
    feature_column.embedding_column(movie_cont, dimension=32),
    gender_cont,
    year_cont,
    age_cont,
    feature_column.indicator_column(genres),
]

In [31]:
def config_column_genres_in_dict(df_dict, max_feature=3):
    gen = df_dict.pop('genres', None)
    new_gen = []
    for i in gen:
        tmp = i.split('|')
        while(len(tmp) < max_feature):
            tmp.append('null')
        new_gen.append(tmp)
    df_dict['genres'] = new_gen
    return df_dict

In [20]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('rating')
    df_dict = config_column_genres_in_dict(dataframe.to_dict(orient='list'))
    ds = tf.data.Dataset.from_tensor_slices((df_dict, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [34]:
# read csv
# fix genres columns in dataframe
dataframe = pd.read_csv('./cleaned-data/ml-1m.csv')


In [35]:
dataframe

Unnamed: 0,userId,movieId,rating,title,genres,year,gender,age,occupation
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama,1975,0,1,10
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical,1996,0,1,10
2,1,914,3,My Fair Lady (1964),Musical|Romance,1964,0,1,10
3,1,3408,4,Erin Brockovich (2000),Drama,2000,0,1,10
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998,0,1,10
...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,Weekend at Bernie's (1989),Comedy,1989,1,25,6
1000205,6040,1094,5,"Crying Game, The (1992)",Drama|Romance|War,1992,1,25,6
1000206,6040,562,5,Welcome to the Dollhouse (1995),Comedy|Drama,1995,1,25,6
1000207,6040,1096,4,Sophie's Choice (1982),Drama,1982,1,25,6


In [36]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(dataframe, test_size=0.25)
print(len(train))
print(len(val))
print(len(test))

750156
250053
200042


In [37]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [38]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['genres'])
  print('A batch of targets:', label_batch )

Every feature: ['userId', 'movieId', 'title', 'year', 'gender', 'age', 'occupation', 'genres']
A batch of ages: tf.Tensor(
[[b'Comedy' b'null' b'null']
 [b'Drama' b'Mystery' b'null']
 [b'Comedy' b'Romance' b'null']
 [b'Crime' b'Drama' b'Romance']
 [b'Action' b'Sci-Fi' b'Thriller']
 [b'Action' b'Adventure' b'Sci-Fi']
 [b'Horror' b'Thriller' b'null']
 [b'Adventure' b"Children's" b'null']
 [b'Comedy' b'Crime' b'null']
 [b'Comedy' b'null' b'null']
 [b'Drama' b'Thriller' b'null']
 [b'Action' b'Drama' b'War']
 [b'Action' b'Adventure' b'Sci-Fi']
 [b'Drama' b'null' b'null']
 [b'Action' b'War' b'null']
 [b'Musical' b'Romance' b'War']
 [b'Action' b'Adventure' b'Sci-Fi']
 [b'Action' b'Adventure' b'null']
 [b'Action' b"Children's" b'Fantasy']
 [b'Drama' b'null' b'null']
 [b'Adventure' b'Sci-Fi' b'null']
 [b'Comedy' b'null' b'null']
 [b'Action' b'Sci-Fi' b'Thriller']
 [b'Comedy' b'Drama' b'null']
 [b'Comedy' b'null' b'null']
 [b'Comedy' b'Romance' b'null']
 [b'Drama' b'null' b'null']
 [b'Drama' b'n