
# Deep Learning Recommendations system

In [40]:
from itertools import chain
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.layers import (
    Input, Reshape, Flatten, Add, Activation, Lambda, Concatenate, Dense, Dropout, Conv1D, GlobalMaxPooling1D
)

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2, l1_l2

In [3]:
%run Functions.ipynb

In [4]:
# Varibles
is_test = True

sample_data = 500_000 #None

if is_test:
    filter_date = dt.datetime(2021, 3, 10, 0, 0, 0).date()
else:
    filter_date = dt.datetime(2021, 4, 1, 0, 0, 0).date()

categories_list = [
                   'accion', 'animacion', 'animales', 'aventura', 'belico', 'biografia', 'ciencia',
                   'ciencia ficcion', 'cocina', 'comedia', 'competencia', 'crimen', 'cultura', 'deporte',
                   'dibujos animados', 'documental', 'drama', 'entretenimiento', 'entrevistas', 'espectaculo',
                   'familia', 'fantasia', 'historia', 'humor', 'infantil', 'interes general', 'investigacion',
                   'magazine', 'moda', 'musica', 'naturaleza', 'periodistico', 'policial', 'politico', 'reality',
                   'religion', 'restauracion', 'romance', 'suspenso', 'teatro', 'terror', 'viajes', 'western'
                  ]

In [5]:
if is_test:
    df, df_test = create_dfs(sample_data=sample_data, clean=True)  # to test
else:
    df, _ = create_dfs(sample_data=sample_data, ret_test=False, clean=True)  # to submit
df.tail(3)

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,resume,min_watching,tunein_hour,content_id,released_year,...,restauracion,romance,suspenso,teatro,terror,viajes,western,title,keywords,ranking
438259,27657,48596,STB,13230.0,2021-01-15 23:05:00,1,2.0,23,2992.0,2017.0,...,0,0,0,0,0,0,0,jordskott,"thriller,crimen,detectives",1
2824123,70924,89386,STATIONARY,29950.0,2021-02-28 15:19:00,0,26.0,15,1409.0,2021.0,...,0,0,0,0,0,0,0,02/08 - corte y confeccion famosos,"competencia,belleza,diseño,moda,celebridades",2
2189254,561,26480,STB,28602.0,2021-01-15 11:06:00,0,12.0,11,2163.0,2016.0,...,0,0,0,0,0,0,0,gallina pintadita mini,educativo,10


In [6]:
def tokenize_string(data, vocab_size=200):
    
    # Map strngs to numbers
    tokens = [one_hot(words, vocab_size) for words in data]
    
    max_len = np.max(list(map(len, tokens)))
    
    pad_corp = pad_sequences(tokens, maxlen=max_len, padding='post', value=0.0)
    
    return pad_corp, max_len

In [60]:
class EmbeddingLayer1D:
    def __init__(self, n_items, n_factors):
        self.n_items = n_items
        self.n_factors = n_factors

    def __call__(self, x):
        x = Embedding(input_dim=self.n_items, 
                      output_dim=self.n_factors,
                      embeddings_initializer='he_normal',
                      embeddings_regularizer=l2(1e-6))(x)
        x = Reshape((self.n_factors,))(x)
#         x = Conv1D(self.n_factors, 3,  activation="relu", padding="valid", strides=3)(x)
#         x = GlobalMaxPooling1D()(x)
        return x


class EmbeddingLayer2D:
    def __init__(self, max_len, embedding_size, vocab_size):
        self.max_len = max_len
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size

    def __call__(self, x):
        x = Embedding(input_dim=self.vocab_size,
                      output_dim=self.embedding_size,
                      embeddings_initializer='he_normal',
                      embeddings_regularizer=l2(1e-6),
                      input_length=self.max_len)(x)

        x = Flatten()(x)
        return x

In [61]:
class CollaborativeFilterKeras:
    def __init__(self, users, content, rating, keywords, n_factors, 
                 embedding_size, vocab_size):
        
        self.n_factors = n_factors
        
        self.embedding_size = embedding_size
        self.max_len = keywords.shape[1]
        self.vocab_size = vocab_size

        self.n_users = users.nunique()
        self.n_content = content.nunique()
        self.min_rating = min(rating)
        self.max_rating = max(rating)
        # Encode ids
        self.user_enc = LabelEncoder()
        self.content_enc = LabelEncoder()
        
        self.users = self.user_enc.fit_transform(users.values)
        self.content = self.content_enc.fit_transform(content.values)
        
        self.keywords = keywords.astype(np.float32)
        self.rating = rating.values.astype(np.float32)

        self.model = None
        self.history = None

    def compile_mode(self):
        user = Input(shape=(1,))
        u = EmbeddingLayer1D(self.n_users, self.n_factors)(user)

        content = Input(shape=(1,))
        c = EmbeddingLayer1D(self.n_content, self.n_factors)(content)

#         keyword = Input(shape=(max_len,))
#         w = EmbeddingLayer2D(self.max_len, self.embedding_size, self.vocab_size)(keyword)
        
        x = Concatenate()([u, c])
#         x = Concatenate()([x, w])
        x = Dropout(0.5)(x)

        x = Dense(self.n_factors, 
                  activation='relu',
                  kernel_initializer='he_normal',
                  kernel_regularizer=l1_l2()
                 )(x)
        x = Dropout(0.3)(x)

        x = Dense(1, kernel_initializer='he_normal')(x)
        x = Activation('sigmoid')(x)
        x = Lambda(lambda x: x * (self.max_rating - self.min_rating) + self.min_rating)(x)
        
        model = Model(inputs=[user, content], outputs=x, name="Flow")
        model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.01))
        self.model = model

    def summary(self):
        return self.model.summary()

    def fit_model(self, batch_size=64, epochs=10):
        self.history = self.model.fit(x=[self.users, self.content], y=self.rating, batch_size=batch_size, epochs=epochs, verbose=1)

In [44]:
agg_func = {
    'ranking': 'max',
    'keywords': lambda xs: ','.join(set(chain(*[x.split(',') for x in xs])))
}

df_views = df.groupby(['account_id', 'content_id'], 
                      as_index=False).agg(agg_func)

df_views.tail()

Unnamed: 0,account_id,content_id,ranking,keywords
206510,112214,657.0,1,"instituto,adolescentes"
206511,112237,3377.0,1,"guerra mundial,supervivencia,golden globe,viaj..."
206512,112254,1539.0,8,"de libros,fantasia,dimensiones,40s"
206513,112255,2043.0,10,"droga,robo,venganza,crimen,pandillas,mafia"
206514,112348,1983.0,8,"feminismo,mujeres,de libros,abusos,crimen"


In [19]:
vocab_size = 20
embeddings_size = 5
keywords_vec, max_len = tokenize_string(df_views.keywords.values, vocab_size)

keywords_vec[:5]

array([[ 4,  6,  7,  7,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 16,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [10,  7,  6, 13,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9,  1,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [62]:
flow_model = CollaborativeFilterKeras(users=df_views.account_id, 
                                      content=df_views.content_id, 
                                      rating=df_views.ranking,
                                      keywords=keywords_vec,
                                      n_factors=50,
                                      embedding_size=5,
                                      vocab_size=vocab_size)

flow_model.compile_mode()
flow_model.summary()

Model: "Flow"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_29 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_30 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_29 (Embedding)        (None, 1, 50)        3596450     input_29[0][0]                   
__________________________________________________________________________________________________
embedding_30 (Embedding)        (None, 1, 50)        179700      input_30[0][0]                   
_______________________________________________________________________________________________

In [63]:
%%time

flow_model.fit_model(batch_size=512, epochs=20)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 35min 3s, sys: 1min 23s, total: 36min 26s
Wall time: 6min 51s


In [140]:
# recommendations = flow_model.predict_all(df_views[df_views.account_id.isin(df_views.account_id.values[:1])])