In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy.sparse.linalg import svds

In [3]:
dt_watch = pd.read_csv('data/content_watch.csv')
dt_rates = pd.read_csv('data/content_rates.csv')

In [4]:
dt_watch.count()

user_index        743885
item_id           743885
watch_duration    743882
dtype: int64

In [5]:
dt_rates.count()

user_index    3795
item_id       3795
rate          3795
dtype: int64

In [6]:
test = pd.read_csv('data/test_set.csv')
test['watch'] = 1
test.head()

Unnamed: 0,user_index,item_id,watch
0,1412855001,-1861073847,1
1,-231141499,-1003171457,1
2,648878904,1710156343,1
3,648878904,-2032527344,1
4,-490168493,985953043,1


In [7]:
#train.head()

In [8]:
# check how many test users are in train

test_users = set(list(test.user_index.values))
train_users = set(list(dt_rates.user_index.values) + list(dt_watch.user_index.values))
common_users = [u for u in test_users if u in train_users]

print ('test_users count: ',  len(test_users))
print ('train_users count: ',  len(train_users))
print ('common_users count: ',  len(common_users))

('test_users count: ', 3662)
('train_users count: ', 108124)
('common_users count: ', 3662)


In [9]:
# check how many test items are in train
test_items = set(list(test.item_id.values))
train_items = set(list(dt_rates.item_id.values) + list(dt_watch.item_id.values))
common_items = [u for u in test_items if u in train_items]

print ('test_items count: ', len(test_items))
print ('train_items count: ', len(train_items))
print ('common_items count: ', len(common_items))

('test_items count: ', 4131)
('train_items count: ', 8759)
('common_items count: ', 4034)


In [10]:
# check how many test items are in rated train
test_items = set(list(test.item_id.values))
train_items = set(list(dt_rates.item_id.values))
common_items = [u for u in test_items if u in train_items]

print ('test_items count: ', len(test_items))
print ('train_items count: ', len(train_items))
print ('common_items count: ', len(common_items))

('test_items count: ', 4131)
('train_items count: ', 2727)
('common_items count: ', 942)


In [31]:
# merge data from watch and rates into train set
# remove rows without rates and 0 duration
train = dt_watch.merge(dt_rates, how='outer', on=['user_index', 'item_id'])
train = train.groupby(['user_index', 'item_id'])['watch_duration', 'rate'].agg({'watch_duration': 'sum', 'rate': 'mean'}).reset_index()
#train = train[(train.watch_duration > 0) | (train.rate.notnull())]
train.count()

user_index        737517
item_id           737517
watch_duration    735158
rate                3795
dtype: int64

In [32]:
USER_COUNT = train['user_index'].nunique()
ITEM_COUNT = train['item_id'].nunique()
print ('Total of unique users=', USER_COUNT)
print ('Total of unique films=', ITEM_COUNT)

('Total of unique users=', 108124)
('Total of unique films=', 8759)


In [13]:
# prepare user-rate matrix for SVD
ur_matrix = train[['user_index', 'item_id', 'rate']].pivot(index = 'user_index', columns ='item_id', values = 'rate').fillna(0)
#print ur_matrix.head()
ur_matrix.shape

MemoryError: 

# Второй вариант с разреженными матрицами

In [33]:
users_dict = dict()
i = 0
for u in train['user_index'].unique():
    users_dict[u] = i
    i += 1

items_dict = dict()
j = 0
for u in train['item_id'].unique():
    items_dict[u] = j
    j += 1

In [34]:
from scipy.sparse import csr_matrix
import csv

def readUrm():
    urm = np.zeros(shape=(USER_COUNT, ITEM_COUNT), dtype=np.float32)
    with open('data/content_rates.csv', 'rb') as trainFile:
        urmReader = csv.reader(trainFile, delimiter=',')
        next(urmReader, None)  # skip header
        for row in urmReader:
            urm[users_dict[row[0]], items_dict[row[1]]] = float(row[2])

    return csr_matrix(urm, dtype=np.float32)

In [35]:
m = readUrm()

MemoryError: 

In [None]:
# prepare user-rate matrix for SVD
ur_matrix = dt_rates[['user_index', 'item_id', 'rate']].pivot(index = 'user_index', columns ='item_id', values = 'rate').fillna(0)
#print urm.head()
ur_matrix.shape

In [None]:
# R = ur_matrix.as_matrix()
# print R.shape
# mean_rate_by_user = np.mean(R, axis = 1)
# mean_rate_by_user.reshape(-1, 1).shape

In [None]:
print 'users len: %s' % ur_matrix.index.values.shape

In [None]:
# compute SVD from user-rate matrix, k is a parameter
def computeSVD(ur_matrix, k):
    R = ur_matrix.as_matrix()
    mean_rate_by_user = np.mean(R, axis = 1)
    R_demeaned = R - mean_rate_by_user.reshape(-1, 1)
    U, sigma, Vt = svds(R, k = 50)
    sigma = np.diag(sigma)
    prediction_matrix = np.dot(np.dot(U, sigma), Vt) + mean_rate_by_user.reshape(-1, 1)
    
    return prediction_matrix

In [None]:
k = 50
R = ur_matrix.as_matrix()
prediction_matrix = computeSVD(ur_matrix, k)
# convert matrix to DataFrame, use user_index for index
preds_df = pd.DataFrame(prediction_matrix, columns = ur_matrix.columns)
preds_df['user_index'] = ur_matrix.index.values
preds_df = preds_df.set_index('user_index')
preds_df.head()

In [None]:
#preds_df.describe()

In [None]:
# minrate = np.nanmin(preds_df.iloc[:, :].values)
# maxrate = np.nanmax(preds_df.iloc[:, :].values)
# print minrate
# print maxrate

In [None]:
def MSE_error(m1, m2):
    if (m1.shape != m2.shape):
        raise ValueError('Can''t compute MSE for matices of different shape: [%i, %i] and [%i, %i]' % (m1.shape[0],m1.shape[1], m2.shape[0],m2.shape[1]))
    n = sum(m1.shape)
    err = 0
    for i in xrange(m1.shape[0]):
        for j in xrange(m1.shape[1]):
            err += (m1[i, j] - m2[i,j])**2
    return err/n

In [None]:
err = MSE_error(ur_matrix.as_matrix(), prediction_matrix)
print 'Mean square error for SVD, k=%i: %f' % (k, err)

In [None]:
def recommend_movies(predictions_df, user_index, all_content, num_recommendations=5, verbose=True):
    
    # get sorted predictions
    sorted_predictions = predictions_df.loc[user_index].sort_values(ascending=False)
    
    # get content user has already seen/rated
    user_data = all_content[all_content.user_index == (user_index)].sort_values(['rate'], ascending=False)

    # recommend the highest rated content that the user hasn't seen/rated yet
    recommendations = (all_content[~all_content['item_id'].isin(user_data['item_id'])].
         merge(pd.DataFrame(sorted_predictions).reset_index(), how = 'left', on = 'item_id').
         rename(columns = {user_index: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )
    if (verbose):
        print 'User {0} has already seen/rated {1} content items.'.format(user_index, user_data.shape[0])
        print 'Recommending the highest {0} predicted ratings for content not seen/rated.'.format(num_recommendations)
        print recommendations

    return user_data, recommendations.item_id.values

In [None]:
user_id_test = -231141499
items_seen_count = train[(train.user_index == user_id_test) & (train.watch_duration > 0)].shape[0]
has_seen, predictions = recommend_movies(preds_df, -231141499, train, items_seen_count / 2)

In [None]:
predictions

In [None]:
def prediction_error(predict_items, test_items, l):
    intersect_count = len([i for i in predict_items if i in test_items])
    print ('intersect_count:', intersect_count)
    precision = float(intersect_count)/l
    recall = float(intersect_count)/len(test_items)
    #F = 2*precision*recall/(precision + recall)
    return precision, recall#, F

In [None]:
answer = test[test.user_index == user_id_test]['item_id'].values
print answer
print len(answer)

In [None]:
p, r = prediction_error(predictions, answer, 10)

In [None]:
print('precision: ', p)
print('recall: ', r)