In [43]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
import numpy as np
from keras.layers import Embedding, Reshape, Merge, Dropout, Dense
from keras.models import Sequential

In [145]:
# CSV file holding the training ratings.
RATINGS_CSV_FILE = 'train.csv'
# File the best model weights are checkpointed to and later reloaded from.
MODEL_WEIGHTS_FILE = 'ml1m_weights.h5'
# Length of each latent (embedding) vector passed to CFModel.
K_FACTORS = 1000
# Seed for the reproducible shuffle of the ratings.
RNG_SEED = 1

In [146]:
# Read the training ratings, keeping only the listed columns.
ratings = pd.read_csv(
    RATINGS_CSV_FILE,
    encoding='latin-1',
    usecols=['TrainDataID', 'UserID', 'MovieID', 'Rating'],
)

# Largest user / movie ID present in the training data; used later to size
# the embedding tables of the model.
max_userid = ratings['UserID'].max()
max_movieid = ratings['MovieID'].max()

In [147]:
# Shuffle every row once (frac=1.) with a fixed seed so the order is repeatable.
shuffled_ratings = ratings.sample(frac=1., random_state=RNG_SEED)

# Pull the three columns out as plain NumPy arrays for the model.
Users = shuffled_ratings['UserID'].values
Movies = shuffled_ratings['MovieID'].values
Ratings = shuffled_ratings['Rating'].values

# Show a preview and the length of each array.
print('Users:', Users, ', shape =', Users.shape)
print('Movies:', Movies, ', shape =', Movies.shape)
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [2707 1695 4333 ..., 4791 2122 1197] , shape = (899873,)
Movies: [1228 1037 2041 ..., 3614 1097 2115] , shape = (899873,)
Ratings: [4 3 5 ..., 2 2 4] , shape = (899873,)


In [148]:
class CFModel(Sequential):
    """Matrix-factorisation collaborative-filtering model.

    Two embedding branches map a user ID and an item ID to a latent vector of
    length ``k_factors``; the model output is the dot product of the two
    vectors, which is trained to match the rating.

    Args:
        n_users: Number of users. IDs ``0..n_users`` (inclusive) are accepted.
        m_items: Number of items. IDs ``0..m_items`` (inclusive) are accepted.
        k_factors: Length of each latent vector.
        **kwargs: Forwarded unchanged to ``Sequential.__init__``.

    Note:
        Each embedding table holds one row more than ``n_users`` / ``m_items``.
        A Keras ``Embedding`` with ``input_dim=N`` only accepts indices
        ``0..N-1``, so without the spare row the largest 1-based ID would be
        out of range. Weight files saved with the smaller tables have a
        different shape and cannot be loaded into this model.

        Possible variant (not implemented here): merge the branches with
        ``mode='concat'`` and follow with ``Dense(128)`` and ``Dense(1)``.
    """

    def __init__(self, n_users, m_items, k_factors, **kwargs):
        # +1 so that an ID equal to n_users / m_items is still a valid index.
        P = CFModel._latent_branch(n_users + 1, k_factors)
        Q = CFModel._latent_branch(m_items + 1, k_factors)
        super(CFModel, self).__init__(**kwargs)
        self.add(Merge([P, Q], mode='dot', dot_axes=1))

    @staticmethod
    def _latent_branch(n_ids, k_factors):
        """Build one ID -> latent-vector branch with ``n_ids`` embedding rows."""
        branch = Sequential()
        branch.add(Embedding(n_ids, k_factors, input_length=1))
        branch.add(Dropout(0.2))
        # Embedding output is (1, k_factors) per sample; flatten to (k_factors,).
        branch.add(Reshape((k_factors,)))
        return branch

    def rate(self, user_id, item_id):
        """Return the predicted rating for a single (user_id, item_id) pair."""
        return self.predict([np.array([user_id]), np.array([item_id])])[0][0]

In [149]:
# Build the factorisation model, sizing it from the largest training IDs.
# NOTE(review): UserID / MovieID are fed to the network unshifted, and a Keras
# Embedding only accepts indices 0..input_dim-1 — confirm the largest ID is in range.
model = CFModel(max_userid, max_movieid, K_FACTORS)
# Regress on the rating value: mean squared error loss, Adamax optimizer.
model.compile(loss='mse', optimizer='adamax')



In [150]:
# Stop once val_loss has not improved for 2 epochs, and keep only the best
# weights on disk. The very large nb_epoch is just an upper bound; early
# stopping is what actually ends training.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    ModelCheckpoint(MODEL_WEIGHTS_FILE, save_best_only=True),
]
history = model.fit(
    [Users, Movies],
    Ratings,
    nb_epoch=3000000,
    batch_size=10000,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks,
)



Train on 809885 samples, validate on 89988 samples
Epoch 1/3000000
Epoch 2/3000000
Epoch 3/3000000
Epoch 4/3000000
Epoch 5/3000000
Epoch 6/3000000
Epoch 7/3000000
Epoch 8/3000000
Epoch 9/3000000
Epoch 10/3000000
Epoch 11/3000000
Epoch 12/3000000
Epoch 13/3000000
Epoch 14/3000000
Epoch 15/3000000
Epoch 16/3000000
Epoch 17/3000000
Epoch 18/3000000
Epoch 19/3000000
Epoch 20/3000000
Epoch 21/3000000
Epoch 22/3000000
Epoch 23/3000000
Epoch 24/3000000
Epoch 25/3000000
Epoch 26/3000000


In [151]:
# Rebuild the same architecture and load the weights written to
# MODEL_WEIGHTS_FILE by the checkpoint callback during training.
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
trained_model.load_weights(MODEL_WEIGHTS_FILE)



In [152]:
# Test set columns: TestDataID / UserID (1~6040) / MovieID (1~3952); shape (100336, 3).
test_data = pd.read_csv("test.csv")
# Plain NumPy view of the frame so columns can be sliced by position.
test = test_data.values

In [153]:
# Column 1 of `test` is the UserID and column 2 the MovieID.
test_user_ids, test_movie_ids = test[:, 1], test[:, 2]
answer = trained_model.predict([test_user_ids, test_movie_ids])

In [154]:
# Shape check of the prediction array.
answer.shape

(100336, 1)

In [86]:
# Replace predictions that are exactly 0 with 1, the lowest rating in the data.
# A boolean mask covers every row whatever the size of the test set, instead of
# looping over a hard-coded row count.
answer[answer == 0] = 1

In [155]:
# Load the sample submission file.
# NOTE(review): `submi` is not referenced by any other visible cell.
submi = pd.read_csv("SampleSubmisson.csv")

In [156]:
# Wrap the predictions in a DataFrame so they can be written out as CSV.
aa = pd.DataFrame(answer)

In [157]:
# Write the predictions to disk (the DataFrame index is written as the first column).
aa.to_csv("aa.csv")

In [None]:
# Display the prediction array.
answer

In [59]:
# Compare actual and predicted ratings for one sample user.
sample_user_id = 3000

# .loc plus .copy() yields an independent frame, so adding a column below does
# not write into a slice of `ratings` (avoids SettingWithCopyWarning).
user_ratings = ratings.loc[ratings['UserID'] == sample_user_id,
                           ['UserID', 'MovieID', 'Rating']].copy()

# Score each movie with the trained model's own `rate` method; no visible cell
# defines a separate `predict_rating` helper. IDs are passed unshifted, the
# same way they are fed to `fit`.
user_ratings['prediction'] = user_ratings['MovieID'].apply(
    lambda movie_id: trained_model.rate(sample_user_id, movie_id))

# Ten highest-rated movies of this user, with the model's prediction alongside.
user_ratings.sort_values(by='Rating', ascending=False).head(10)

Unnamed: 0,UserID,MovieID,Rating,prediction
109326,3000,3289,5,1.823372
109334,3000,306,5,2.699907
109341,3000,1214,4,4.682123
109327,3000,3481,4,3.941334
109329,3000,3159,4,4.024564
109338,3000,3751,4,0.009479
109337,3000,3745,4,3.522266
109323,3000,3002,4,1.982718
109333,3000,3510,4,2.963198
109322,3000,3793,3,4.123916


In [None]:
# Display the per-user comparison frame.
user_ratings

In [30]:
# Load the raw tables for the classification experiment.
# NOTE: this rebinds `ratings` and `test_data` from the earlier cells.

# TrainDataID/UserID(1~6040)/MovieID(1~3952)/Rating(1~5), shape(899873, 4)
# Ratings: max 5, min 1.
ratings = pd.read_csv("train.csv")

# TestDataID/UserID(1~6040)/MovieID(1~3952), shape(100336, 3)
test_data = pd.read_csv("test.csv")

# '::' is a multi-character separator, which needs the python parser engine.
# UserID::Gender::Age::Occupation::Zip-code, shape(6040, 1)
users = pd.read_csv("users.csv", engine='python', sep='::')
# The module is imported as `pd`; the bare name `pandas` is not in scope.
movies = pd.read_csv('movies.csv', engine='python', sep='::')

# Categorical copies of the ID / demographic columns. Bracket assignment is
# required: `frame.newname = ...` sets a Python attribute on the object and
# does not create a DataFrame column.
users['age'] = users['Age'].astype('category')
users['gender'] = users['Gender'].astype('category')
users['occupation'] = users['Occupation'].astype('category')
ratings['movieid'] = ratings['MovieID'].astype('category')
ratings['userid'] = ratings['UserID'].astype('category')



In [None]:
# NOTE(review): no visible cell assigns `train_data` (its only assignment is
# commented out), so this presumably raises NameError on a fresh kernel —
# confirm or remove.
train_data

In [None]:
# And finally, set up a y variable with the rating,
# as a one-hot encoded matrix.
#
# note the '- 1' for the rating. That's because ratings
# go from 1 to 5, while the matrix columns go from 0 to 4
#
# The column is named 'Rating' in train.csv, so it is addressed by that name;
# there is no lowercase `rating` column on the frame.
rating_index = ratings['Rating'].values - 1

y = np.zeros((ratings.shape[0], 5))
# One 1 per row, in the column matching that row's rating.
y[np.arange(ratings.shape[0]), rating_index] = 1

In [None]:
# Dummy classifier! Just see how well stupid can do.
# NOTE(review): `dummy` and `metrics` are not imported in any visible cell
# (presumably sklearn.dummy / sklearn.metrics) — confirm before running.
#
# Columns are addressed by the capitalised names they have in train.csv.
baseline_features = ratings[['UserID', 'MovieID']]
baseline_target = ratings['Rating']

pred = dummy.DummyClassifier(strategy='prior')
pred.fit(baseline_features, baseline_target)

# Mean absolute error of the baseline on the data it was fitted on.
print(metrics.mean_absolute_error(baseline_target, pred.predict(baseline_features)))

In [None]:
# Now, the deep learning classifier
#
# NOTE(review): `keras`, `kmodels`, `n_movies` and `n_users` are not defined in
# any visible cell — confirm they are set up before this cell runs.

# First, we take the movie and vectorize it.
# The embedding layer is normally used for sequences (think, sequences of words)
# so we need to flatten it out.
# The dropout layer is also important in preventing overfitting
movie_input = keras.layers.Input(shape=[1])
movie_vec = keras.layers.Flatten()(keras.layers.Embedding(n_movies + 1, 32)(movie_input))
movie_vec = keras.layers.Dropout(0.5)(movie_vec)

# Same thing for the users
user_input = keras.layers.Input(shape=[1])
user_vec = keras.layers.Flatten()(keras.layers.Embedding(n_users + 1, 32)(user_input))
user_vec = keras.layers.Dropout(0.5)(user_vec)

# Next, we join them all together and put them
# through a pretty standard deep learning architecture:
# three Dense(128, relu) layers, with dropout and batch normalization
# after the first two.
input_vecs = keras.layers.merge([movie_vec, user_vec], mode='concat')
nn = keras.layers.Dropout(0.5)(keras.layers.Dense(128, activation='relu')(input_vecs))
nn = keras.layers.normalization.BatchNormalization()(nn)
nn = keras.layers.Dropout(0.5)(keras.layers.Dense(128, activation='relu')(nn))
nn = keras.layers.normalization.BatchNormalization()(nn)
nn = keras.layers.Dense(128, activation='relu')(nn)

# Finally, we pull out the result!
# Five softmax outputs, one per rating class, matching the 5-column one-hot target.
result = keras.layers.Dense(5, activation='softmax')(nn)

# And make a model from it that we can actually run.
# NOTE: this rebinds `model`, which an earlier cell uses for the CFModel instance.
model = kmodels.Model([movie_input, user_input], result)
model.compile('adam', 'categorical_crossentropy')

# If we wanted to inspect part of the model, for example, to look
# at the movie vectors, here's how to do it. You don't need to
# compile these models unless you're going to train them.
final_layer = kmodels.Model([movie_input, user_input], nn)
# NOTE: from here on `movie_vec` names a Model, no longer the embedding tensor.
movie_vec = kmodels.Model(movie_input, movie_vec)

In [None]:
# Split the data into train and test sets...
# The a_* names are the training part and the b_* names the held-out part.
# NOTE(review): `cross_validation`, `movieid` and `userid` are not defined in
# any visible cell — confirm they are set up before this cell runs.
a_movieid, b_movieid, a_userid, b_userid, a_y, b_y = cross_validation.train_test_split(movieid, userid, y)

In [None]:
# And of _course_ we need to make sure we're improving, so we find the MAE before
# training at all.
# argmax gives the class index 0..4; + 1 maps it back to a rating 1..5.
untrained_true = np.argmax(b_y, 1) + 1
untrained_pred = np.argmax(model.predict([b_movieid, b_userid]), 1) + 1
metrics.mean_absolute_error(untrained_true, untrained_pred)

In [None]:
# Train the classifier and plot the loss curves. Interrupting the kernel stops
# training early without leaving a traceback in the notebook.
try:
    history = model.fit([a_movieid, a_userid], a_y,
                        nb_epoch=20,
                        validation_data=([b_movieid, b_userid], b_y))
    # Bare `plot` is not defined in any visible cell; pyplot is imported as `plt`.
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
except KeyboardInterrupt:
    # Deliberate: a manual interrupt just ends training.
    pass

In [None]:
# This is the number that matters. It's the held out
# test set score. Note the + 1, because np.argmax will
# go from 0 to 4, while our ratings go 1 to 5.
heldout_true = np.argmax(b_y, 1) + 1
heldout_pred = np.argmax(model.predict([b_movieid, b_userid]), 1) + 1
metrics.mean_absolute_error(heldout_true, heldout_pred)

In [None]:
# For comparison's sake, here's the score on the training set.
# As above, + 1 maps the argmax class index 0..4 back to a rating 1..5.
train_true = np.argmax(a_y, 1) + 1
train_pred = np.argmax(model.predict([a_movieid, a_userid]), 1) + 1
metrics.mean_absolute_error(train_true, train_pred)