В работе опирался на эту статью: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
Предварительно посмотрел лекцию про ALS: https://www.youtube.com/watch?v=NlNLtPqlCK0

In [4]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

import implicit

### Load Data

In [5]:
# Read the four input tables from the local `input/` directory, in the same
# order as the names on the left-hand side.
train, test, user_features, item_features = map(
    pd.read_csv,
    (
        'input/train_2.csv',
        'input/test_2.csv',
        'input/user-features_2.csv',
        'input/item-features_2.csv',
    ),
)

### Preprocessing

In [6]:
# Attach the user features and then the item features to every training
# interaction. Left joins keep each row of `train` even when a feature row is
# missing for its user or item.
train_full = (
    train
    .merge(user_features, on='user_id', how='left')
    .merge(item_features, on='item_id', how='left')
)

In [7]:
# Display the merged frame (interactions plus user and item feature columns).
train_full

Unnamed: 0,user_id,item_id,like,timestamp,0_x,1_x,2_x,3_x,4_x,5_x,...,22_y,23_y,24_y,25_y,26_y,27_y,28_y,29_y,30_y,31_y
0,140,342,0,1490936622,0.001300,-0.002943,-0.002750,0.003901,-0.000931,0.001282,...,-0.002098,-0.000349,0.000561,-0.000540,-0.000996,-0.000996,0.000211,-0.000584,-0.000584,0.000149
1,378,172,1,1490936628,0.003204,-0.007252,-0.006776,0.009613,-0.002295,0.003158,...,-0.013597,-0.002263,0.003634,-0.003498,-0.006457,-0.006457,0.001369,-0.003785,-0.003785,0.000964
2,150,182,0,1490936650,0.002554,-0.005780,-0.005401,0.007662,-0.001829,0.002517,...,-0.010278,-0.001710,0.002747,-0.002644,-0.004881,-0.004881,0.001035,-0.002861,-0.002861,0.000728
3,455,17,0,1490936704,0.004649,-0.010523,-0.009833,0.013950,-0.003330,0.004583,...,-0.013269,-0.002208,0.003546,-0.003414,-0.006301,-0.006301,0.001336,-0.003694,-0.003694,0.000940
4,350,409,0,1490936735,0.000919,-0.002081,-0.001944,0.002759,-0.000658,0.000906,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.000210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8669,161,312,0,1491215519,0.000602,-0.001362,-0.001273,0.001806,-0.000431,0.000593,...,-0.002098,-0.000349,0.000561,-0.000540,-0.000996,-0.000996,0.000211,-0.000584,-0.000584,0.000149
8670,406,208,0,1491215543,0.002357,-0.005335,-0.004985,0.007072,-0.001688,0.002323,...,-0.003634,-0.000605,0.000971,-0.000935,-0.001726,-0.001726,0.000366,-0.001012,-0.001012,0.000258
8671,196,43,0,1491215576,0.001667,-0.003772,-0.003525,0.005001,-0.001194,0.001643,...,-0.008392,-0.001397,0.002243,-0.002159,-0.003985,-0.003985,0.000845,-0.002336,-0.002336,0.000595
8672,84,100,0,1491215579,0.001433,-0.003243,-0.003030,0.004299,-0.001026,0.001412,...,-0.007268,-0.001209,0.001942,-0.001870,-0.003451,-0.003451,0.000732,-0.002023,-0.002023,0.000515


### Feature Engineering

In [4]:
# Parse `timestamp` (Unix epoch seconds) into a datetime column and derive
# calendar features from it. `assign` evaluates its keyword arguments in
# order, so the later lambdas can read the freshly added `created` column.
train_full = train_full.assign(
    created=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
    created_month=lambda df: df['created'].dt.month,
    created_day=lambda df: df['created'].dt.day,
    created_dayofweek=lambda df: df['created'].dt.dayofweek,
    # The timestamp is rounded to the nearest hour before the hour is taken.
    created_hour=lambda df: df['created'].dt.round('H').dt.hour,
)

def f(x):
    """Map an hour value to a part-of-day label.

    Bands are half-open on the left and closed on the right:
    x <= 4 -> 'Late Night', (4, 8] -> 'Early Morning', (8, 12] -> 'Morning',
    (12, 16] -> 'Noon', (16, 20] -> 'Eve', (20, 24] -> 'Night'.

    Returns None when no band matches (values above 24, or NaN, for which
    every comparison is false).
    """
    # Ascending upper bounds: the first bound that x does not exceed decides
    # the label.
    if x <= 4:
        return 'Late Night'
    if x <= 8:
        return 'Early Morning'
    if x <= 12:
        return 'Morning'
    if x <= 16:
        return 'Noon'
    if x <= 20:
        return 'Eve'
    if x <= 24:
        return 'Night'
    return None
    
    
# Label every row with its part of day, based on the rounded hour.
train_full['part_of_day'] = train_full['created_hour'].apply(f)

# One-hot encode the label. `dummy_na=True` adds a `part_of_day_nan`
# indicator column that flags rows whose label is missing.
train_full = pd.get_dummies(train_full, columns=['part_of_day'], dummy_na=True)

# The parsed datetime column is no longer needed once the features exist.
train_full = train_full.drop(['created'], axis=1)

# Drop the NaN indicator only when no row is actually flagged. The previous
# check (`.isnull().sum() == 0`) tested the indicator column itself for nulls,
# which is always true, so the column was removed unconditionally.
if not train_full['part_of_day_nan'].any():
    train_full = train_full.drop(['part_of_day_nan'], axis=1)

In [5]:
# Parse `timestamp` (Unix epoch seconds) into a datetime column.
test = test.assign(created=pd.to_datetime(test['timestamp'], unit='s'))

# Derive calendar features from the parsed datetime. The hour is taken after
# rounding the timestamp to the nearest hour.
test = test.assign(
    **{
        'created_' + attr: getattr(test['created'].dt, attr)
        for attr in ('month', 'day', 'dayofweek')
    },
    created_hour=test['created'].dt.round('H').dt.hour,
)

def f(x):
    """Return the part-of-day label for an hour value.

    Each band covers (low, high]; anything at or below 4 is 'Late Night'.
    Values that fall in no band (above 24, or NaN) yield None.
    """
    # (exclusive lower bound, inclusive upper bound, label)
    bands = (
        (4, 8, 'Early Morning'),
        (8, 12, 'Morning'),
        (12, 16, 'Noon'),
        (16, 20, 'Eve'),
        (20, 24, 'Night'),
    )
    for low, high, label in bands:
        if (x > low) and (x <= high):
            return label
    if x <= 4:
        return 'Late Night'
    return None
    
    
# Label every row with its part of day, based on the rounded hour.
test['part_of_day'] = test['created_hour'].apply(f)

# One-hot encode the label. `dummy_na=True` adds a `part_of_day_nan`
# indicator column that flags rows whose label is missing.
test = pd.get_dummies(test, columns=['part_of_day'], dummy_na=True)

# The parsed datetime column is no longer needed once the features exist.
test = test.drop(['created'], axis=1)

# Drop the NaN indicator only when no row is actually flagged. The previous
# check (`.isnull().sum() == 0`) tested the indicator column itself for nulls,
# which is always true, so the column was removed unconditionally.
if not test['part_of_day_nan'].any():
    test = test.drop(['part_of_day_nan'], axis=1)

In [8]:
# Dense user x item table of `like` values; pairs that never occur in the
# training data are filled with 0.
banner_pivot = pd.pivot(
    train_full, index='user_id', columns='item_id', values='like'
).fillna(0)
banner_pivot.shape

(497, 444)

In [7]:
# Despite the name, this is a dense pandas DataFrame: one row per user, one
# column per item, holding `like` with 0 for pairs absent from the data.
sparse_matrix = (
    train_full
    .pivot(index='user_id', columns='item_id', values='like')
    .fillna(0)
)
sparse_matrix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 0 to 496
Columns: 444 entries, 0 to 443
dtypes: float64(444)
memory usage: 1.7 MB


In [10]:
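# Abandoned experiment, kept for reference: convert the dense pivot to CSR.
# `csr_matrix` is not imported by name in this notebook (hence the NameError
# recorded for this cell); the call has to go through the `sparse` alias.
# sparse_matrix_sparse = sparse.csr_matrix(sparse_matrix.values)
# sparse_matrix_sparse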

NameError: name 'csr_matrix' is not defined

In [8]:
# Two views of the same interactions are needed below: an item-user matrix
# for fitting the model and a user-item matrix for producing recommendations.
# Rows are indexed by raw ids, so the shape is inferred from the largest id.
sparse_item_user = sparse.csr_matrix(
    (
        train_full['like'].astype(float),
        (train_full['item_id'], train_full['user_id']),
    )
)

# The user-item matrix holds exactly the same entries with the axes swapped,
# i.e. it is the transpose, converted back to CSR.
sparse_user_item = sparse_item_user.T.tocsr()

In [9]:
# Show the repr of the user-item matrix (shape, dtype, stored-element count).
sparse_user_item

<497x444 sparse matrix of type '<class 'numpy.float64'>'
	with 8674 stored elements in Compressed Sparse Row format>

### Model

In [13]:
# Scale every stored interaction value by alpha to obtain the confidence
# matrix that the model is trained on.
alpha_val = 15
data_conf = (alpha_val * sparse_item_user).astype('double')

# ALS model with a single latent factor, fitted on the item-user confidences.
model = implicit.als.AlternatingLeastSquares(
    iterations=40,
    regularization=0.01,
    factors=1,
)
model.fit(data_conf)



HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




### Find Similar Items

In [91]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Query item and the number of neighbours to retrieve.
item_id = 400
n_similar = 20

# Ask the fitted model for the items most similar to `item_id`.
similar = model.similar_items(item_id, n_similar)

# Each result is an (item index, score) pair. The matrix was built directly
# from raw item ids, so the index already is the item id; the former lookup
# through `train_full` was an O(n) scan that returned the same value and
# raised IndexError for an index absent from the training data.
for idx, score in similar:
    print(idx)


400
304
382
292
287
308
290
286
396
330
406
393
440
428
441
379
288
327
244
340


### Recommendations

In [14]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

# Maps user_id -> list of recommended item ids, in the order the model
# returns them.
recommendations = {}

# `unique()` keeps first-appearance order, so the resulting mapping has the
# same keys in the same order as iterating every test row, without querying
# the model again for users that appear more than once.
for user_id in test['user_id'].unique().tolist():
    # Old-style implicit API: returns a list of (item index, score) pairs.
    recommended = model.recommend(user_id, sparse_user_item, 20)

    # The user-item matrix is indexed by raw item ids, so the returned index
    # is the item id itself; no lookup through `train_full` is needed.
    recommendations[user_id] = [int(idx) for idx, _ in recommended]

# Build one row per user first and transpose, so `rec_df` keeps its original
# layout (one column per user). Unlike the column-oriented constructor, this
# also works when a user receives fewer than 20 items: the short row is padded
# with NaN instead of raising.
rec_df = pd.DataFrame.from_dict(recommendations, orient='index').T

print(rec_df.T)


      0   1   2   3   4   5   6   7   8   9   10   11   12  13  14   15   16  \
166  76  22  35  72  40  80  37  58  65  66   11   32   21   5  60   87  146   
26   76  22  35  72  40  80  37  58  65  66   11   67   32  21   5   60   87   
41   76  22  35  72  40  80  37  58  65  66   11   67   32  21   5   60   87   
286  76  22  35  72  80  37  58  65  66  11   67   32   21   5  60   87  146   
108  76  22  35  72  40  80  37  58  66  11   67   32   21   5  60   87  146   
..   ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ..  ..  ...  ...   
190  76  35  37  65  66  32  21   5  60  87  146  119   33  36  70   30  147   
181  76  22  35  72  40  80  37  58  65  66   11   67   32  21   5   60  146   
448  76  22  35  72  37  58  65  66  67  32   21   60   87   7  33   17   36   
124  76  22  35  72  40  80  58  65  66  11   67   32   21   5  87  146    7   
167  22  40  80  37  58  66  11  21   5  60   87  146  119  78  33  172  147   

      17   18   19  
166    7   44  119

In [15]:
# Export the predictions to CSV: one row per user, with the user id written
# as the index column.
rec_df.T.to_csv('input/out.csv', index=True)