# Обработка данных
Для валидации выбраны последние 25% оценок каждого пользователя. Если эти 25% содержат более 100 фильмов, берутся только последние 100: иначе отдельные пользователи с очень большим числом оценок вносили бы непропорционально большой вклад в валидацию, чего хотелось бы избежать.

In [1]:
import numpy as np
import pandas as pd

def get_movie_sequences(data, user_val_part=0.25, max_user_val_movies=100):
    """Split every user's rating history into train and validation parts.

    Each user's rows are ordered by ``timestamp``; the most recent
    ``user_val_part`` share of them (capped at ``max_user_val_movies``)
    becomes the validation part and everything earlier becomes the train part.

    Args:
        data: DataFrame with ``userId``, ``movieId``, ``rating`` and
            ``timestamp`` columns.
        user_val_part: Fraction of each user's ratings held out for validation.
        max_user_val_movies: Upper bound on held-out ratings per user, so that
            users with very long histories do not dominate validation.

    Returns:
        Tuple ``(train_user_ratings, val_user_ratings, train_user_movie_seq,
        val_user_movie_seq)``; each element is a list with one list per user,
        in ``groupby('userId')`` order.

    Side effects:
        Prints the overall share of rows that went to validation.
    """
    train_user_ratings, val_user_ratings = [], []
    train_user_movie_seq, val_user_movie_seq = [], []
    total_n_val = 0
    for _user_id, user_movies in data.groupby('userId'):
        user_movies = user_movies.sort_values(by=['timestamp'])
        user_ratings = user_movies['rating'].values.tolist()
        user_movie_seq = user_movies['movieId'].values.tolist()

        n_val = min(int(len(user_movie_seq) * user_val_part), max_user_val_movies)
        total_n_val += n_val

        # Validation is the *tail* of the chronologically sorted history, so
        # the split point is counted from the end (slicing with [:n_val] would
        # hand the oldest n_val items to train and everything else to val).
        split = len(user_movie_seq) - n_val

        train_user_ratings.append(user_ratings[:split])
        val_user_ratings.append(user_ratings[split:])

        train_user_movie_seq.append(user_movie_seq[:split])
        val_user_movie_seq.append(user_movie_seq[split:])

    # Guard against an empty frame, where the share is undefined.
    val_share = total_n_val / len(data) if len(data) else 0.0
    print('part of data that is taken for validation:', val_share)
    return train_user_ratings, val_user_ratings, train_user_movie_seq, val_user_movie_seq

def encode_movieId(data):
    """Re-index movie ids by popularity, in place.

    Movies are ranked by how often they occur in ``data['movieId']``: the most
    frequent movie gets index 0 and the rarest gets the largest index. The
    ``movieId`` column of ``data`` is overwritten with these indices.

    Returns:
        Tuple ``(data, ind2movie_id, n_movies)`` where ``data`` is the same
        (mutated) frame, ``ind2movie_id`` maps new index -> original movie id,
        and ``n_movies`` is the number of distinct movies.
    """
    movies_by_popularity = data['movieId'].value_counts().index
    ind2movie_id = dict(enumerate(movies_by_popularity))
    # Invert the lookup to translate original ids into their rank.
    movie_id2ind = {movie_id: ind for ind, movie_id in ind2movie_id.items()}
    data['movieId'] = data['movieId'].map(movie_id2ind)
    return data, ind2movie_id, len(movies_by_popularity)

def get_data():
    """Load the ratings CSV and return the prepared train/validation sequences.

    Reads ``data/ratings.csv``, re-indexes movie ids with ``encode_movieId``
    and splits each user's history with ``get_movie_sequences``.

    Returns:
        Tuple ``(train_user_ratings, val_user_ratings, train_user_movie_seq,
        val_user_movie_seq, ind2movie_id, n_movies)``.
    """
    ratings = pd.read_csv('data/ratings.csv')
    ratings, ind2movie_id, n_movies = encode_movieId(ratings)
    train_ratings, val_ratings, train_movies, val_movies = get_movie_sequences(ratings)
    return train_ratings, val_ratings, train_movies, val_movies, ind2movie_id, n_movies

# Load and split the data once for the rest of the notebook. The index -> movie id
# lookup is bound to an underscore-prefixed name because nothing below reads it.
train_user_ratings, val_user_ratings, train_user_movie_seq, val_user_movie_seq, _ind2movie_id, n_movies = get_data()

part of data that is taken for validation: 0.2011420049826345


# Movie2vec
С помощью skip-gram с негативным сэмплированием (SGNS) обучаем эмбеддинги фильмов и получаем по ним матрицу похожести фильмов.

In [5]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

def build_model(n_movies, embedding_size=300):
    """Build the SGNS training model and a cosine-similarity model.

    Both models take two integer inputs of shape ``(1,)`` (target movie index
    and context movie index) and share a single embedding layer named
    ``'embedding'``.

    Args:
        n_movies: Size of the embedding vocabulary (number of movie indices).
        embedding_size: Dimensionality of each movie embedding.

    Returns:
        Tuple ``(model, similarity_model)``. ``model`` is compiled with binary
        cross-entropy and RMSprop and outputs a sigmoid score for the pair;
        ``similarity_model`` is uncompiled and outputs the cosine similarity of
        the two embeddings.
    """
    input_target = Input((1,))
    input_context = Input((1,))

    # One embedding table is shared by the target and the context branch.
    embedding = Embedding(n_movies, embedding_size, input_length=1, name='embedding')
    target = Reshape((embedding_size, 1))(embedding(input_target))
    context = Reshape((embedding_size, 1))(embedding(input_context))

    # Training head: dot product over the embedding axis -> sigmoid.
    x = Dot(axes=1)([target, context])
    x = Reshape((1,))(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[input_target, input_context], outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # Similarity head: the embedding lives on axis 1 after the reshape, so the
    # normalized dot product must run over axis 1. axes=0 is the batch axis and
    # would L2-normalize across samples instead of across embedding components.
    similarity = Dot(axes=1, normalize=True)([target, context])
    similarity_model = Model(inputs=[input_target, input_context], outputs=similarity)

    return model, similarity_model

def get_train_pairs(train_user_movie_seq, val_user_movie_seq, sampling_table, window_size=2,
                    max_attempts=100):
    """Generate skip-gram (target, context, label) training pairs for all users.

    For every user the train and validation sequences are concatenated and fed
    to ``skipgrams``. Because ``skipgrams`` subsamples with ``sampling_table``,
    a call may return no pairs; it is retried up to ``max_attempts`` times and
    the user is skipped if it still yields nothing (e.g. a sequence too short
    to form any pair), instead of looping forever.

    Args:
        train_user_movie_seq: Per-user lists of movie indices (train part).
        val_user_movie_seq: Per-user lists of movie indices (validation part),
            aligned with ``train_user_movie_seq``.
        sampling_table: Table passed to ``skipgrams``; its length is also used
            as the vocabulary size.
        window_size: Skip-gram window size passed to ``skipgrams``.
        max_attempts: Maximum number of ``skipgrams`` calls per user.

    Returns:
        Tuple ``(targets, contexts, labels)``: two ``int32`` arrays and a plain
        list of labels, all of equal length.
    """
    # NOTE(review): validation-part movies are included in the embedding
    # training data here — confirm this is intended.
    # NOTE(review): the Keras docs state that skipgrams treats index 0 as a
    # non-word and skips it, while encode_movieId assigns 0 to the most
    # frequent movie — verify whether indices should be shifted by one.
    targets, contexts, labels = [], [], []
    for movie_seq1, movie_seq2 in zip(train_user_movie_seq, val_user_movie_seq):
        movie_seq = movie_seq1 + movie_seq2

        pairs, pairs_labels = [], []
        for _attempt in range(max_attempts):
            pairs, pairs_labels = skipgrams(movie_seq, len(sampling_table), window_size=window_size,
                                            sampling_table=sampling_table)
            if len(pairs) > 0:
                break
        if len(pairs) == 0:
            # Nothing could be sampled for this user; skip rather than hang.
            continue

        target, context = zip(*pairs)
        targets.extend(target)
        contexts.extend(context)
        labels.extend(pairs_labels)

    targets = np.array(targets, dtype="int32")
    contexts = np.array(contexts, dtype="int32")

    return targets, contexts, labels

# Build the trainable SGNS model and the similarity model that shares its embedding layer.
model, similarity_model = build_model(n_movies)
# Sampling table sized to the movie vocabulary; it is handed to skipgrams via get_train_pairs.
sampling_table = sequence.make_sampling_table(n_movies)

# n_epochs = 1
# for _epoch in range(epochs):
#     # target ~= anchor for window
#     targets, contexts, labels = get_train_pairs(train_user_movie_seq, val_user_movie_seq, sampling_table)
#     model.fit([targets, contexts], labels, batch_size=1)

# Train on one randomly chosen (target, context, label) pair per step.
# NOTE: despite its name, `epochs` counts single-sample updates, not full passes over the pairs.
epochs = 200000
# Reusable one-element buffers for the two model inputs and the label.
placeholder_1 = np.zeros((1,))
placeholder_2 = np.zeros((1,))
placeholder_3 = np.zeros((1,))
targets, contexts, labels = get_train_pairs(train_user_movie_seq, val_user_movie_seq, sampling_table)
for cnt in range(epochs):
    # np.random.randint excludes the upper bound, so pass len(labels) itself;
    # len(labels) - 1 would make the last pair unreachable.
    ind = np.random.randint(0, len(labels))
    placeholder_1[0,] = targets[ind]
    placeholder_2[0,] = contexts[ind]
    placeholder_3[0,] = labels[ind]
    loss = model.train_on_batch([placeholder_1, placeholder_2], placeholder_3)
    if cnt % 100 == 0:
        print("Iteration {}/{}, loss={}".format(cnt, epochs, loss))

Iteration 0/200000, loss=0.6951915621757507
Iteration 100/200000, loss=0.7001964449882507
Iteration 200/200000, loss=0.7202941179275513
Iteration 300/200000, loss=0.7075939178466797
Iteration 400/200000, loss=0.6857410073280334
Iteration 500/200000, loss=0.7215298414230347
Iteration 600/200000, loss=0.7082899808883667
Iteration 700/200000, loss=0.6965871453285217
Iteration 800/200000, loss=0.6947771906852722
Iteration 900/200000, loss=0.6861074566841125
Iteration 1000/200000, loss=0.6703883409500122
Iteration 1100/200000, loss=0.686747670173645
Iteration 1200/200000, loss=0.7007666826248169
Iteration 1300/200000, loss=0.7109870910644531
Iteration 1400/200000, loss=0.6915712356567383
Iteration 1500/200000, loss=0.6862387657165527
Iteration 1600/200000, loss=0.6964664459228516
Iteration 1700/200000, loss=0.6839592456817627
Iteration 1800/200000, loss=0.6990758180618286
Iteration 1900/200000, loss=0.6977675557136536
Iteration 2000/200000, loss=0.6762588024139404
Iteration 2100/200000, los

Iteration 17400/200000, loss=0.6838504672050476
Iteration 17500/200000, loss=0.6946412324905396
Iteration 17600/200000, loss=0.671798050403595
Iteration 17700/200000, loss=0.6957928538322449
Iteration 17800/200000, loss=0.6897964477539062
Iteration 17900/200000, loss=0.7141585350036621
Iteration 18000/200000, loss=0.6678675413131714
Iteration 18100/200000, loss=0.7016836404800415
Iteration 18200/200000, loss=0.6788962483406067
Iteration 18300/200000, loss=0.6845566630363464
Iteration 18400/200000, loss=0.7214685678482056
Iteration 18500/200000, loss=0.6979836821556091
Iteration 18600/200000, loss=0.7119431495666504
Iteration 18700/200000, loss=0.6975154876708984
Iteration 18800/200000, loss=0.6932817101478577
Iteration 18900/200000, loss=0.7248334288597107
Iteration 19000/200000, loss=0.6884815692901611
Iteration 19100/200000, loss=0.676348865032196
Iteration 19200/200000, loss=0.685945987701416
Iteration 19300/200000, loss=0.6843099594116211
Iteration 19400/200000, loss=0.727106153964

Iteration 34600/200000, loss=0.6865646839141846
Iteration 34700/200000, loss=0.7012349963188171
Iteration 34800/200000, loss=0.7029752135276794
Iteration 34900/200000, loss=0.7191710472106934
Iteration 35000/200000, loss=0.7136729955673218
Iteration 35100/200000, loss=0.6925614476203918
Iteration 35200/200000, loss=0.6935650110244751
Iteration 35300/200000, loss=0.695820152759552
Iteration 35400/200000, loss=0.6920179724693298
Iteration 35500/200000, loss=0.7051374912261963
Iteration 35600/200000, loss=0.695364773273468
Iteration 35700/200000, loss=0.691085159778595
Iteration 35800/200000, loss=0.6608214974403381
Iteration 35900/200000, loss=0.722712516784668
Iteration 36000/200000, loss=0.6997109055519104
Iteration 36100/200000, loss=0.6697403192520142
Iteration 36200/200000, loss=0.6992559432983398
Iteration 36300/200000, loss=0.6882035136222839
Iteration 36400/200000, loss=0.6776135563850403
Iteration 36500/200000, loss=0.6870596408843994
Iteration 36600/200000, loss=0.6903831958770

Iteration 51800/200000, loss=0.6777366399765015
Iteration 51900/200000, loss=0.6840250492095947
Iteration 52000/200000, loss=0.7137592434883118
Iteration 52100/200000, loss=0.7081481218338013
Iteration 52200/200000, loss=0.7054582834243774
Iteration 52300/200000, loss=0.6537220478057861
Iteration 52400/200000, loss=0.6871859431266785
Iteration 52500/200000, loss=0.6987343430519104
Iteration 52600/200000, loss=0.6859804391860962
Iteration 52700/200000, loss=0.6979339122772217
Iteration 52800/200000, loss=0.7159368991851807
Iteration 52900/200000, loss=0.689119279384613
Iteration 53000/200000, loss=0.7090778946876526
Iteration 53100/200000, loss=0.7101702094078064
Iteration 53200/200000, loss=0.6713685989379883
Iteration 53300/200000, loss=0.6981118321418762
Iteration 53400/200000, loss=0.6812922358512878
Iteration 53500/200000, loss=0.6781622767448425
Iteration 53600/200000, loss=0.7165370583534241
Iteration 53700/200000, loss=0.7037896513938904
Iteration 53800/200000, loss=0.6742451190

Iteration 69000/200000, loss=0.7139599323272705
Iteration 69100/200000, loss=0.6645857095718384
Iteration 69200/200000, loss=0.7001447677612305
Iteration 69300/200000, loss=0.6736506223678589
Iteration 69400/200000, loss=0.702328622341156
Iteration 69500/200000, loss=0.682843029499054
Iteration 69600/200000, loss=0.6751419305801392
Iteration 69700/200000, loss=0.7435110807418823
Iteration 69800/200000, loss=0.6957598328590393
Iteration 69900/200000, loss=0.6541163325309753
Iteration 70000/200000, loss=0.7155376076698303
Iteration 70100/200000, loss=0.681094229221344
Iteration 70200/200000, loss=0.6943890452384949
Iteration 70300/200000, loss=0.7025951147079468
Iteration 70400/200000, loss=0.7010681629180908
Iteration 70500/200000, loss=0.7180209159851074
Iteration 70600/200000, loss=0.6770004630088806
Iteration 70700/200000, loss=0.6904580593109131
Iteration 70800/200000, loss=0.653156042098999
Iteration 70900/200000, loss=0.7262290716171265
Iteration 71000/200000, loss=0.6950562000274

Iteration 86200/200000, loss=0.7359734773635864
Iteration 86300/200000, loss=0.6920121312141418
Iteration 86400/200000, loss=0.6633764505386353
Iteration 86500/200000, loss=0.7222808003425598
Iteration 86600/200000, loss=0.6875405311584473
Iteration 86700/200000, loss=0.6930172443389893
Iteration 86800/200000, loss=0.6982826590538025
Iteration 86900/200000, loss=0.6548715829849243
Iteration 87000/200000, loss=0.7381437420845032
Iteration 87100/200000, loss=0.6938593983650208
Iteration 87200/200000, loss=0.6806643009185791
Iteration 87300/200000, loss=0.7035122513771057
Iteration 87400/200000, loss=0.7219244837760925
Iteration 87500/200000, loss=0.748417317867279
Iteration 87600/200000, loss=0.7128223776817322
Iteration 87700/200000, loss=0.6968430280685425
Iteration 87800/200000, loss=0.6709698438644409
Iteration 87900/200000, loss=0.690081000328064
Iteration 88000/200000, loss=0.7390635013580322
Iteration 88100/200000, loss=0.6799781918525696
Iteration 88200/200000, loss=0.71091049909

Iteration 103300/200000, loss=0.6475465297698975
Iteration 103400/200000, loss=0.6671529412269592
Iteration 103500/200000, loss=0.7530296444892883
Iteration 103600/200000, loss=0.7571340203285217
Iteration 103700/200000, loss=0.6482071876525879
Iteration 103800/200000, loss=0.7641058564186096
Iteration 103900/200000, loss=0.6688756346702576
Iteration 104000/200000, loss=0.6420685648918152
Iteration 104100/200000, loss=0.7452533841133118
Iteration 104200/200000, loss=0.6631569862365723
Iteration 104300/200000, loss=0.7395238280296326
Iteration 104400/200000, loss=0.7063288688659668
Iteration 104500/200000, loss=0.6754046678543091
Iteration 104600/200000, loss=0.7373908758163452
Iteration 104700/200000, loss=0.7568931579589844
Iteration 104800/200000, loss=0.6777749061584473
Iteration 104900/200000, loss=0.7284717559814453
Iteration 105000/200000, loss=0.6597802639007568
Iteration 105100/200000, loss=0.6806949377059937
Iteration 105200/200000, loss=0.7204013466835022
Iteration 105300/200

Iteration 120100/200000, loss=0.6772424578666687
Iteration 120200/200000, loss=0.6844866275787354
Iteration 120300/200000, loss=0.7163321375846863
Iteration 120400/200000, loss=0.7213020324707031
Iteration 120500/200000, loss=0.6744191646575928
Iteration 120600/200000, loss=0.7023179531097412
Iteration 120700/200000, loss=0.6943390369415283
Iteration 120800/200000, loss=0.6771208047866821
Iteration 120900/200000, loss=0.6884981989860535
Iteration 121000/200000, loss=0.6875671148300171
Iteration 121100/200000, loss=0.6917904615402222
Iteration 121200/200000, loss=0.6919592618942261
Iteration 121300/200000, loss=0.6895256042480469
Iteration 121400/200000, loss=0.6757872104644775
Iteration 121500/200000, loss=0.7076008319854736
Iteration 121600/200000, loss=0.7080901265144348
Iteration 121700/200000, loss=0.6492502689361572
Iteration 121800/200000, loss=0.7180042266845703
Iteration 121900/200000, loss=0.7020112872123718
Iteration 122000/200000, loss=0.6341026425361633
Iteration 122100/200

Iteration 136900/200000, loss=0.6610421538352966
Iteration 137000/200000, loss=0.7297428846359253
Iteration 137100/200000, loss=0.7489959001541138
Iteration 137200/200000, loss=0.7097965478897095
Iteration 137300/200000, loss=0.663169264793396
Iteration 137400/200000, loss=0.6377471685409546
Iteration 137500/200000, loss=0.5835635662078857
Iteration 137600/200000, loss=0.6749016046524048
Iteration 137700/200000, loss=0.7295860052108765
Iteration 137800/200000, loss=0.6569421291351318
Iteration 137900/200000, loss=0.731208860874176
Iteration 138000/200000, loss=0.6742280721664429
Iteration 138100/200000, loss=0.7294996380805969
Iteration 138200/200000, loss=0.6530367136001587
Iteration 138300/200000, loss=0.6751075983047485
Iteration 138400/200000, loss=0.7382983565330505
Iteration 138500/200000, loss=0.6802161335945129
Iteration 138600/200000, loss=0.6894387602806091
Iteration 138700/200000, loss=0.7251121401786804
Iteration 138800/200000, loss=0.6687840223312378
Iteration 138900/20000

Iteration 153700/200000, loss=0.6994280815124512
Iteration 153800/200000, loss=0.6857548356056213
Iteration 153900/200000, loss=0.6407324075698853
Iteration 154000/200000, loss=0.716506838798523
Iteration 154100/200000, loss=0.7279421091079712
Iteration 154200/200000, loss=0.7427209615707397
Iteration 154300/200000, loss=0.6812722086906433
Iteration 154400/200000, loss=0.6626780033111572
Iteration 154500/200000, loss=0.6933448314666748
Iteration 154600/200000, loss=0.6478520035743713
Iteration 154700/200000, loss=0.7106404304504395
Iteration 154800/200000, loss=0.713575541973114
Iteration 154900/200000, loss=0.6559085249900818
Iteration 155000/200000, loss=0.6775311231613159
Iteration 155100/200000, loss=0.7160018682479858
Iteration 155200/200000, loss=0.6703318357467651
Iteration 155300/200000, loss=0.6784307360649109
Iteration 155400/200000, loss=0.6723775863647461
Iteration 155500/200000, loss=0.6909247636795044
Iteration 155600/200000, loss=0.6972183585166931
Iteration 155700/20000

Iteration 170500/200000, loss=0.7183527946472168
Iteration 170600/200000, loss=0.706599771976471
Iteration 170700/200000, loss=0.7111468315124512
Iteration 170800/200000, loss=0.7081155180931091
Iteration 170900/200000, loss=0.6652135252952576
Iteration 171000/200000, loss=0.7459118366241455
Iteration 171100/200000, loss=0.6620849370956421
Iteration 171200/200000, loss=0.7035768628120422
Iteration 171300/200000, loss=0.6725472807884216
Iteration 171400/200000, loss=0.7648245096206665
Iteration 171500/200000, loss=0.7112056612968445
Iteration 171600/200000, loss=0.6317949295043945
Iteration 171700/200000, loss=0.7020347118377686
Iteration 171800/200000, loss=0.6496875286102295
Iteration 171900/200000, loss=0.6714088916778564
Iteration 172000/200000, loss=0.6712769269943237
Iteration 172100/200000, loss=0.7355681657791138
Iteration 172200/200000, loss=0.6328980326652527
Iteration 172300/200000, loss=0.7945435047149658
Iteration 172400/200000, loss=0.7320613861083984
Iteration 172500/2000

Iteration 187300/200000, loss=0.6541406512260437
Iteration 187400/200000, loss=0.7873542904853821
Iteration 187500/200000, loss=0.7334285974502563
Iteration 187600/200000, loss=0.5279873609542847
Iteration 187700/200000, loss=0.7084088325500488
Iteration 187800/200000, loss=0.6772411465644836
Iteration 187900/200000, loss=0.7371497750282288
Iteration 188000/200000, loss=0.8274521827697754
Iteration 188100/200000, loss=0.6908352971076965
Iteration 188200/200000, loss=0.7606729865074158
Iteration 188300/200000, loss=0.7443906664848328
Iteration 188400/200000, loss=0.814255952835083
Iteration 188500/200000, loss=0.6987701654434204
Iteration 188600/200000, loss=0.6985394358634949
Iteration 188700/200000, loss=0.6753919720649719
Iteration 188800/200000, loss=0.6835858225822449
Iteration 188900/200000, loss=0.6834619641304016
Iteration 189000/200000, loss=0.6651592254638672
Iteration 189100/200000, loss=0.6604185104370117
Iteration 189200/200000, loss=0.649382472038269
Iteration 189300/20000

In [6]:
# Save the weights of both models under data/. similarity_model is built on the
# same 'embedding' layer as model (see build_model).
model.save_weights('data/model.h5')
similarity_model.save_weights('data/similarity_model.h5')

Получаем матрицу S попарной косинусной близости между фильмами: в статье отмечено, что она вычисляется заранее.

In [None]:
# Pairwise cosine similarity between all movie embeddings.
# Calling the similarity model once per (i, j) pair needs ~n_movies**2 / 2
# separate predict calls; the same quantity is obtained directly from the
# shared embedding weights with a single normalized matrix product.
embeddings = similarity_model.get_layer('embedding').get_weights()[0].astype(np.float64)
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
# Avoid division by zero for an all-zero embedding row.
unit_embeddings = embeddings / np.maximum(norms, 1e-12)
S = unit_embeddings.dot(unit_embeddings.T)
# Self-similarity is exactly 1 by definition (matches the np.eye initialisation used before).
np.fill_diagonal(S, 1.0)

np.save('data/SGNS_dists.npy', S)

# Factorization using learned S

Не успел реализовать: ниже только заготовка класса, шаг SGD не написан.

In [None]:
class MF_SGNS:
    """Skeleton of a biased matrix-factorization model that also holds a movie similarity matrix.

    A rating is predicted as ``m + b_u[i] + b_i[j] + P[i] . Q[j]`` where ``m``
    is the mean of the observed ratings, ``b_u``/``b_i`` are user/movie biases
    and ``P``/``Q`` are latent factor matrices. A zero in the rating matrix
    ``R`` means "not rated".

    The optimisation step (``__sgd``) is not implemented yet, so ``train`` only
    reports the error of the current (randomly initialised) parameters.
    """

    def __init__(self, user_ratings, user_movie_seq, S, n_movies, n_iter=20, d=128, k=5, a=0.5, l=0.1, lr=0.001):
        """Build the rating matrix and initialise the parameters.

        Args:
            user_ratings: Per-user lists of ratings.
            user_movie_seq: Per-user lists of movie indices, aligned with
                ``user_ratings``.
            S: Movie similarity matrix; stored as ``self.S``.
            n_movies: Number of movie indices (columns of ``R``).
            n_iter: Number of iterations performed by ``train``.
            d: Size of the latent vectors.
            k, a, l: Stored on the instance but not read by any method here —
                presumably hyperparameters for the unimplemented SGD step;
                TODO confirm their meaning.
            lr: Stored on the instance; presumably the SGD learning rate.
        """
        self.k = k
        self.a = a
        self.l = l
        self.d = d
        self.lr = lr
        self.n_iter = n_iter
        self.__build_R(user_ratings, user_movie_seq, n_movies)
        n_users = self.R.shape[0]
        self.n_movies = self.R.shape[1]
        self.S = S
        self.P = np.random.normal(scale=1/d, size=(n_users, d))
        self.Q = np.random.normal(scale=1/d, size=(self.n_movies, d))
        self.b_u = np.zeros(n_users)
        self.b_i = np.zeros(self.n_movies)
        # Global mean over observed (non-zero) ratings; 0.0 when nothing is observed.
        observed = self.R[self.R != 0]
        self.m = observed.mean() if observed.size else 0.0

    def train(self):
        """Run ``n_iter`` iterations, printing the MSE on observed ratings after each."""
        observed = self.R != 0
        for i in range(self.n_iter):
            self.__sgd()
            # Only observed cells count: zeros in R mean "not rated", not a rating of 0.
            errors = (self.R - self.__get_R_pred())[observed]
            mse = float(np.mean(errors ** 2)) if errors.size else 0.0
            print("Iteration {}. mse: {}".format(i, mse))

    def predict_r(self, i, j):
        """Return the predicted rating of user ``i`` for movie ``j``."""
        return self.m + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)

    def __build_R(self, user_ratings, user_movie_seq, n_movies):
        """Fill ``self.R`` (users x movies) with the given ratings; other cells stay 0."""
        self.R = np.zeros((len(user_ratings), n_movies))
        for user_id, (ratings, movies) in enumerate(zip(user_ratings, user_movie_seq)):
            if len(movies) == 0:
                continue
            self.R[user_id, movies] = ratings

    def __sgd(self):
        """Placeholder for one optimisation step.

        TODO: not implemented; currently a no-op, so parameters never change.
        """
        pass

    def __get_R_pred(self):
        """Return the full matrix of predicted ratings (users x movies)."""
        return self.m + self.b_u[:, np.newaxis] + self.b_i + self.P.dot(self.Q.T)