In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv('../datamovies/rating.csv.zip')

In [3]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [4]:
data.shape

(20000263, 4)

In [5]:
data['userId'].unique().shape, data['movieId'].unique().shape

((138493,), (26744,))

In [6]:
data['date'] = data['timestamp'].apply(lambda x: re.split(' ', x)[0])

In [7]:
data['date'].unique().shape, data['date'].min(), data['date'].max()

((6911,), '1995-01-09', '2015-03-31')

In [8]:
data.drop('timestamp', axis=1, inplace=True)

# Фильтрация на фильмы

In [9]:
#calc users on movies
us_on_mov = data[['userId', 'movieId']].groupby('movieId').count()

us_on_mov = us_on_mov.reset_index()
us_on_mov = us_on_mov.rename(columns={'userId':'count'})

In [10]:
us_on_mov['count'].describe()

count    26744.000000
mean       747.841123
std       3085.818268
min          1.000000
25%          3.000000
50%         18.000000
75%        205.000000
max      67310.000000
Name: count, dtype: float64

In [11]:
us_on_mov_filt = us_on_mov.loc[us_on_mov['count'] > 500]

In [12]:
us_on_mov_filt.shape, us_on_mov.shape

((4483, 2), (26744, 2))

In [13]:
data = data.merge(us_on_mov_filt['movieId'], on='movieId', how='inner')

In [14]:
data.shape

(18714467, 4)

# Фильтрация на юзеров

In [15]:
mov_on_us = data[['userId', 'movieId']].groupby('userId').count().reset_index()
mov_on_us = mov_on_us.rename(columns={'movieId':'count'})

In [16]:
mov_on_us['count'].describe()

count    138493.000000
mean        135.129335
std         194.030616
min           6.000000
25%          34.000000
50%          66.000000
75%         150.000000
max        3837.000000
Name: count, dtype: float64

In [17]:
mov_on_us_filt = mov_on_us.loc[mov_on_us['count'] > 400]

In [18]:
mov_on_us_filt.shape, mov_on_us.shape

((9678, 2), (138493, 2))

In [19]:
data = data.merge(mov_on_us_filt['userId'], on='userId', how='inner')

In [20]:
data.shape

(6731885, 4)

# Разделение на трейн/тест

In [21]:
data['date'].unique().shape, data['date'].min(), data['date'].max()

((6804,), '1996-02-20', '2015-03-31')

In [22]:
test = data.loc[data.date > '2010-01-01']
train = data.loc[data.date <= '2010-01-01']

train.shape, test.shape 

((5566601, 4), (1165284, 4))

In [23]:
len(set(train['userId']).intersection(set(test['userId']))), train['userId'].unique().shape[0], test['userId'].unique().shape[0]

(2570, 8481, 3767)

In [24]:
test = test.merge(pd.Series(train['userId'].unique(), name='userId'), on='userId', how='inner')
test.shape[0]

376229

In [34]:
test = test.merge(pd.Series(train['movieId'].unique(), name='movieId'), on='movieId', how='inner')
test.shape[0]

281497

In [25]:
test.head()

Unnamed: 0,userId,movieId,rating,date
0,271,337,3.0,2011-10-29
1,271,593,4.0,2011-10-04
2,271,1246,4.0,2013-09-02
3,271,2918,4.0,2014-09-06
4,271,6888,2.0,2011-04-09


In [26]:
#train.to_parquet('test.parquet', index=False)

In [27]:
#train = pd.read_parquet('train.parquet')

In [26]:
num_ids = train['userId'].unique().shape[0]

In [27]:
new_user_id = pd.Series(train['userId'].unique()).rename_axis('new_user_id').reset_index(name='userId')

In [29]:
new_user_id.head(10)

Unnamed: 0,new_user_id,userId
0,0,54
1,1,91
2,2,116
3,3,131
4,4,156
5,5,251
6,6,271
7,7,278
8,8,298
9,9,347


In [30]:
train = train.merge(new_user_id, on='userId', how='left')
test = test.merge(new_user_id, on='userId', how='left')

In [31]:
test.head()

Unnamed: 0,userId,movieId,rating,date,new_user_id
0,271,337,3.0,2011-10-29,6
1,271,593,4.0,2011-10-04,6
2,271,1246,4.0,2013-09-02,6
3,271,2918,4.0,2014-09-06,6
4,271,6888,2.0,2011-04-09,6


In [32]:
new_mov_id = pd.Series(train['movieId'].unique()).rename_axis('new_movie_id').reset_index(name='movieId')

In [33]:
new_mov_id.head()

Unnamed: 0,new_movie_id,movieId
0,0,2
1,1,32
2,2,47
3,3,50
4,4,223


In [35]:
train = train.merge(new_mov_id, on='movieId', how='left')
test = test.merge(new_mov_id, on='movieId', how='left')

In [36]:
train.drop('userId', axis=1, inplace=True)
test.drop('userId', axis=1, inplace=True)

In [37]:
train = train.rename(columns={'new_user_id':'userId'})
test = test.rename(columns={'new_user_id':'userId'})

In [38]:
train.drop('movieId', axis=1, inplace=True)
test.drop('movieId', axis=1, inplace=True)

In [39]:
train = train.rename(columns={'new_movie_id':'movieId'})
test = test.rename(columns={'new_movie_id':'movieId'})

In [41]:
train.shape, test.shape

((5566601, 4), (281497, 4))

In [45]:
train.isna().any()

rating     False
date       False
userId     False
movieId    False
dtype: bool

In [42]:
train.head()

Unnamed: 0,rating,date,userId,movieId
0,3.0,2000-11-22,0,0
1,5.0,2000-11-21,0,1
2,4.0,2000-11-21,0,2
3,4.0,2000-11-21,0,3
4,5.0,2000-11-21,0,4


In [44]:
test.isna().any()

rating     False
date       False
userId     False
movieId    False
dtype: bool

In [43]:
test.head()

Unnamed: 0,rating,date,userId,movieId
0,3.0,2011-10-29,6,8
1,4.0,2010-09-22,60,8
2,4.5,2013-12-24,133,8
3,3.5,2010-10-15,140,8
4,3.5,2013-04-08,153,8


In [46]:
train.to_parquet('../datamovies/train.parquet', index=False)

In [47]:
test.to_parquet('../datamovies/test.parquet', index=False)

# Расчёт среднего для трейна

In [None]:
train = pd.read_parquet('../datamovies/train.parquet')

In [None]:
test = pd.read_parquet('../datamovies/test.parquet')

In [75]:
m_user_rat = train[['userId', 'rating']].groupby('userId')['rating'].mean().reset_index()
m_user_rat = m_user_rat.rename(columns={'rating':'user_bias'})
train = train.merge(m_user_rat, on='userId', how='left')
train['rating'] = train['rating'] - train['user_bias']

In [76]:
train.head()

Unnamed: 0,userId,movieId,rating,date,user_bias
0,54,2,-0.442857,2000-11-22,3.442857
1,54,32,1.557143,2000-11-21,3.442857
2,54,47,0.557143,2000-11-21,3.442857
3,54,50,0.557143,2000-11-21,3.442857
4,54,223,1.557143,2000-11-21,3.442857


In [119]:
del_zero_ratings = train.groupby('userId').agg({'rating':'sum'})
del_zero_ratings = del_zero_ratings[del_zero_ratings['rating'] != 0].reset_index()

In [77]:
test = test.merge(m_user_rat, on='userId', how='left')
test['rating'] = test['rating'] - test['user_bias']

In [125]:
train = train.merge(pd.Series(del_zero_ratings['userId'], name='userId'), on='userId', how='inner')
test = test.merge(pd.Series(del_zero_ratings['userId'], name='userId'), on='userId', how='inner')

# Creating URM

In [231]:
from scipy import sparse
from pandas.api.types import CategoricalDtype
import gc

In [232]:
#Create URM
userId_c = CategoricalDtype(sorted(train.userId.unique()), ordered=True)
movieId_c = CategoricalDtype(sorted(train.movieId.unique()), ordered=True)

row = train.userId.astype(userId_c).cat.codes
col = train.movieId.astype(movieId_c).cat.codes

In [233]:
urm_sparse = sparse.csr_matrix((train["rating"], (row, col)), \
                           shape=(new_id_c.categories.size, movieId_c.categories.size), dtype=np.float32)

In [234]:
sparse.save_npz('urm_sparse.npz', urm_sparse)

# Calc weights

In [235]:
from scipy import sparse
import pandas as pd

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [236]:
urm_sparse = sparse.load_npz('urm_sparse.npz')

In [237]:
urm_sparse

<8389x4237 sparse matrix of type '<class 'numpy.float32'>'
	with 5518624 stored elements in Compressed Sparse Row format>

In [238]:
%%time
w = urm_sparse.dot(urm_sparse.T)

Wall time: 35.9 s


In [239]:
#sparse.save_npz('w.npz', w)

In [240]:
w = sparse.load_npz('w.npz')

In [241]:
vars = np.asarray(urm_sparse.power(2).sum(axis=1))

In [242]:
dispn = np.sqrt(vars.dot(vars.T))

In [243]:
dispn.shape

(8389, 8389)

In [244]:
%%time
w_1 = np.divide(w.toarray(), dispn)

Wall time: 630 ms


# Filt w

In [245]:
np.fill_diagonal(w_1, 0)

In [246]:
s = w_1.shape

ind = np.argsort(w_1)[:,:s[1]-50]
rows = np.arange(s[0])[:,None]

In [247]:
w_1[rows, ind] = 0

In [248]:
w_1_sparse = sparse.csr_matrix(w_1)

In [249]:
w_1_sparse

<8389x8389 sparse matrix of type '<class 'numpy.float32'>'
	with 419450 stored elements in Compressed Sparse Row format>

In [250]:
sparse.save_npz('w_1_sparse.npz', w_1_sparse)

# Make predictions

In [251]:
train.columns

Index(['rating', 'date', 'user_bias', 'userId', 'movieId'], dtype='object')

In [252]:
v_rating = train[['userId', 'movieId', 'rating']].drop_duplicates().sort_values('userId').reset_index(drop=True)

In [253]:
v_rating.head()

Unnamed: 0,userId,movieId,rating
0,0,0,-0.442857
1,0,462,-1.442857
2,0,463,-0.442857
3,0,464,0.557143
4,0,465,0.557143


In [333]:
v_rating_new = v_rating.groupby('movieId').agg({'rating':lambda x: list(x), 'userId': lambda x: list(x)})

In [260]:
test = pd.read_parquet('test.parquet')

In [343]:
test['movieId'] = test['movieId'].astype(int)

In [364]:
%%time

test['pred'] = np.zeros(test.shape[0])
preds = []

for i in range(test.shape[0]):
    usId = test['userId'][i]
    movId = test['movieId'][i]
    
    rat = v_rating_new.iloc[movId]
    
    w_s = np.take(w_1_sparse[usId].toarray()[0], rat['userId'])
    
    preds.append(np.sum(w_s*np.array(rat['rating']))/(np.abs(w_s).sum()))

  if sys.path[0] == '':


Wall time: 3min 4s


In [363]:
276000/500*376/1000

207.552

In [365]:
test['pred'] = preds

In [366]:
test.shape

(276123, 6)

In [370]:
test = test.dropna(subset=['pred'])
test.shape

(239973, 6)

In [378]:
np.mean(np.abs(test['rating']-test['pred']))

0.6833797485025371