Refference:

https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

https://medium.com/recombee-blog/machine-learning-for-recommender-systems-part-1-algorithms-evaluation-and-cold-start-6f696683d0ed

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
np.random.seed(0)
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
import gc
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from IPython.display import FileLink

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['productid_attr.csv', 'train.csv', 'test.csv', 'img_features_DN121.csv']


In [2]:
# Evaluaion Function
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    
    actual = list(actual)
    predicted = list(predicted)
    
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
            
    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [3]:
def show_all(df, nrow=None, ncol=None):
    with pd.option_context('display.max_rows', nrow, 'display.max_columns', ncol):
        display(df)

In [4]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [5]:
pa_df = pd.read_csv('../input/productid_attr.csv')
img_feat_df = pd.read_csv('../input/img_features_DN121.csv')

### Collaborative filtering

In [6]:
n_users = train_df['UserId'].nunique()
n_products = train_df['productid'].nunique()

In [7]:
data_matrix_df = pd.DataFrame(np.zeros((n_users, n_products)), index=sorted(train_df['UserId'].unique()), columns=sorted(train_df['productid'].unique()))

In [8]:
data_matrix_df.head()

Unnamed: 0,11139192,11139194,11139524,11139560,11139588,11139650,11141306,11141308,11141318,11141320,11141324,11141326,11141328,11141330,11141338,11141340,11141342,11141346,11141354,11141530,11141538,11141644,11144136,11144260,11145600,11145602,11145612,11145614,11145620,11145624,11145626,11145634,11145640,11145642,11145654,11145664,11145666,11145684,11145702,11145726,...,13773572,13773574,13773576,13773578,13773580,14101712,14104334,14110862,14110864,14110866,14110868,14121822,14121824,14121826,14121828,14121830,14121832,14121834,14121836,14121838,14121840,14121842,14121844,14121846,14121848,14122346,14122348,14122744,14122748,14122750,14122758,14122762,14122766,14122818,14122820,14122822,14122832,14122834,14128359,14129477
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
%%time
for i, line in train_df.iterrows():
    data_matrix_df.loc[line['UserId'], line['productid']] = data_matrix_df.loc[line['UserId'], line['productid']] + line['Quantity']

CPU times: user 51 s, sys: 472 ms, total: 51.4 s
Wall time: 51.4 s


In [10]:
data_matrix_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27778 entries, 0 to 27777
Columns: 3026 entries, 11139192 to 14129477
dtypes: float64(3026)
memory usage: 642.8 MB


In [11]:
%%time
user_similarity = pairwise_distances(data_matrix_df, metric='cosine')

CPU times: user 4min 56s, sys: 8.45 s, total: 5min 4s
Wall time: 1min 24s


In [12]:
user_similarity_df = pd.DataFrame(user_similarity, index=data_matrix_df.index.values, columns=data_matrix_df.index.values)

In [13]:
# we are getting similarity, higher is better
user_similarity_df = 1 - user_similarity_df

In [14]:
user_similarity_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27778 entries, 0 to 27777
Columns: 27778 entries, 0 to 27777
dtypes: float64(27778)
memory usage: 5.7 GB


In [15]:
del user_similarity
gc.collect()

7

In [16]:
%%time
product_similarity = pairwise_distances(data_matrix_df.transpose(), metric='cosine')

CPU times: user 33.9 s, sys: 1.34 s, total: 35.2 s
Wall time: 10.2 s


In [17]:
product_similarity_df = pd.DataFrame(product_similarity, index=data_matrix_df.columns.values, columns=data_matrix_df.columns.values)

In [18]:
# we are getting similarity, higher is better
product_similarity_df = 1 - product_similarity_df

In [19]:
product_similarity_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3026 entries, 11139192 to 14129477
Columns: 3026 entries, 11139192 to 14129477
dtypes: float64(3026)
memory usage: 69.9 MB


In [20]:
del product_similarity
gc.collect()

7

In [21]:
# Collaborative filtering, based on user similarity and product similarity
def predict(user_quantity, similarity, type='user'):
    if type == 'user':
        mean_user_quantity = user_quantity.mean(axis=1)
        #We use np.newaxis so that mean_user_quantity has same format as user_quantity
        quantity_diff = (user_quantity - mean_user_quantity[:, np.newaxis])
        pred = mean_user_quantity[:, np.newaxis] + similarity.dot(quantity_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = user_quantity.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

#### Item similarity based Collaborative filtering

In [22]:
%%time
item_prediction = predict(data_matrix_df.values, product_similarity_df.values, type='item')

CPU times: user 32.5 s, sys: 1.69 s, total: 34.1 s
Wall time: 9.23 s


In [23]:
item_prediction_df = pd.DataFrame(item_prediction, index=data_matrix_df.index.values, columns=data_matrix_df.columns.values)

In [24]:
del item_prediction
gc.collect()

0

In [25]:
item_prediction_df.shape

(27778, 3026)

In [40]:
# prediction
item_prediction_df.head()

Unnamed: 0,11139192,11139194,11139524,11139560,11139588,11139650,11141306,11141308,11141318,11141320,11141324,11141326,11141328,11141330,11141338,11141340,11141342,11141346,11141354,11141530,11141538,11141644,11144136,11144260,11145600,11145602,11145612,11145614,11145620,11145624,11145626,11145634,11145640,11145642,11145654,11145664,11145666,11145684,11145702,11145726,...,13773572,13773574,13773576,13773578,13773580,14101712,14104334,14110862,14110864,14110866,14110868,14121822,14121824,14121826,14121828,14121830,14121832,14121834,14121836,14121838,14121840,14121842,14121844,14121846,14121848,14122346,14122348,14122744,14122748,14122750,14122758,14122762,14122766,14122818,14122820,14122822,14122832,14122834,14128359,14129477
0,0.00045,0.00045,0.00045,0.00045,0.000449,0.00045,0.000449,0.0,0.0,0.000449,0.000449,0.000447,0.000448,0.000449,0.000449,0.0,0.000449,0.00045,0.0,0.000449,0.00045,0.000449,0.00045,0.000446,0.00045,0.00045,0.00045,0.00045,0.00045,0.000449,0.000449,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.0,0.00045,...,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.0,0.000449,0.000443,0.00045,0.00045,0.000449,0.00045,0.00045,0.000449,0.00045,0.000448,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.000443,0.0,0.0,0.00045,0.000457,0.000446,0.000447,0.0,0.00045,0.0,0.000449,0.00045,0.000449,0.00045,0.00045,0.00045,0.00045
1,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000455,0.0,0.0,0.000455,0.000456,0.000453,0.000455,0.000456,0.000455,0.0,0.000456,0.000456,0.0,0.000455,0.000456,0.000456,0.000456,0.000455,0.000456,0.000456,0.000456,0.000456,0.000456,0.000455,0.000455,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.0,0.000456,...,0.000456,0.000454,0.000456,0.000456,0.000456,0.000456,0.0,0.000455,0.000449,0.000456,0.000456,0.000455,0.000456,0.000456,0.000455,0.000456,0.000455,0.000456,0.000456,0.000456,0.000456,0.000456,0.000456,0.00045,0.0,0.0,0.000456,0.000451,0.000452,0.000453,0.0,0.000456,0.0,0.000455,0.000456,0.000451,0.000456,0.000456,0.000456,0.000456
2,0.00039,0.00039,0.00039,0.00039,0.000389,0.00039,0.000389,0.0,0.0,0.000389,0.000389,0.000388,0.000389,0.00039,0.000389,0.0,0.000389,0.00039,0.0,0.000389,0.00039,0.00039,0.00039,0.000386,0.00039,0.00039,0.00039,0.00039,0.00039,0.000454,0.000389,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.0,0.00039,...,0.000393,0.000399,0.00039,0.00039,0.00039,0.00039,0.0,0.000389,0.000384,0.00039,0.00039,0.000389,0.00039,0.00039,0.000389,0.00039,0.000389,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.000384,0.0,0.0,0.00039,0.000386,0.000387,0.000388,0.0,0.00039,0.0,0.000389,0.00039,0.000386,0.00039,0.00039,0.00039,0.00039
3,0.007118,0.007121,0.007121,0.007112,0.007109,0.00712,0.007122,0.050079,0.0,0.007106,0.007109,0.007073,0.007117,0.007122,0.007105,0.0,0.007101,0.007118,0.0,0.007096,0.007128,0.00711,0.007123,0.007053,0.007125,0.007119,0.007123,0.007122,0.007123,0.007097,0.0071,0.00712,0.007119,0.007119,0.007118,0.007141,0.007118,0.007146,0.0,0.007119,...,0.007126,0.007126,0.007116,0.007114,0.007122,0.00712,0.0,0.007108,0.00701,0.007118,0.007116,0.007103,0.007119,0.007126,0.007106,0.007118,0.007094,0.007118,0.007118,0.007119,0.007121,0.007116,0.007121,0.007016,0.0,0.0,0.007123,0.007039,0.007057,0.007074,0.0,0.007121,0.0,0.007098,0.00712,0.007043,0.007117,0.007116,0.007121,0.007123
4,0.00045,0.00045,0.00045,0.000461,0.000449,0.00045,0.000449,0.0,0.0,0.000449,0.000449,0.000447,0.000448,0.000449,0.000449,0.0,0.000448,0.000449,0.0,0.000448,0.00045,0.000449,0.00045,0.000445,0.00045,0.00045,0.00045,0.00045,0.00045,0.000451,0.000448,0.000453,0.00045,0.00045,0.00045,0.000449,0.00045,0.000449,0.0,0.00045,...,0.000449,0.000455,0.00045,0.000449,0.00045,0.000449,0.0,0.000449,0.000443,0.00045,0.00045,0.000449,0.00045,0.00045,0.000458,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.000449,0.00045,0.000449,0.0,0.0,0.00045,0.000445,0.000446,0.000447,0.0,0.00045,0.0,0.000448,0.00045,0.000445,0.000449,0.000449,0.00045,0.00045


#### User similarity based Collaborative filtering

In [26]:
%%time
user_prediction = predict(data_matrix_df.values, user_similarity_df.values, type='user')

CPU times: user 4min 56s, sys: 7.38 s, total: 5min 4s
Wall time: 1min 21s


In [27]:
user_prediction.shape

(27778, 3026)

In [28]:
user_prediction_df = pd.DataFrame(user_prediction, index=data_matrix_df.index.values, columns=data_matrix_df.columns.values)

In [29]:
# prediction
user_prediction_df.head()

Unnamed: 0,11139192,11139194,11139524,11139560,11139588,11139650,11141306,11141308,11141318,11141320,11141324,11141326,11141328,11141330,11141338,11141340,11141342,11141346,11141354,11141530,11141538,11141644,11144136,11144260,11145600,11145602,11145612,11145614,11145620,11145624,11145626,11145634,11145640,11145642,11145654,11145664,11145666,11145684,11145702,11145726,...,13773572,13773574,13773576,13773578,13773580,14101712,14104334,14110862,14110864,14110866,14110868,14121822,14121824,14121826,14121828,14121830,14121832,14121834,14121836,14121838,14121840,14121842,14121844,14121846,14121848,14122346,14122348,14122744,14122748,14122750,14122758,14122762,14122766,14122818,14122820,14122822,14122832,14122834,14128359,14129477
0,-0.005088,-0.003196,0.012564,-0.00887,-0.009501,-0.005088,0.000586,-0.010131,-0.010131,-0.006349,-0.001305,-0.009501,-0.007609,-0.001305,-0.006979,-0.010131,-0.006979,-0.002778,-0.010131,-0.004457,-0.005718,-0.006979,-0.00887,-0.009501,0.003738,-0.009501,-0.000675,-0.00824,-0.009501,-0.00887,-0.009501,-0.00887,-0.00824,-0.009501,-0.00824,-0.002566,-0.006349,-0.007609,-0.010131,-0.00824,...,-0.001746,-0.006789,0.003108,-0.006979,-0.005718,-0.003196,-0.010131,-0.003196,-0.009501,-0.004457,-0.003196,-0.009501,-0.006979,-0.006979,-0.00887,-0.00824,-0.00824,-0.00887,-0.00824,-0.00887,-0.007609,-0.00824,-0.007609,-0.009501,-0.010131,-0.010131,-0.009501,-0.00657,-0.009501,-0.009501,-0.010131,-0.007609,-0.010131,-0.009501,-0.00887,-0.007992,0.014456,-0.000675,-0.00824,-0.00887
1,-0.006672,-0.003926,0.022587,-0.012164,-0.013079,-0.006672,0.001565,-0.013994,-0.013994,-0.008503,-0.001181,-0.013079,-0.010333,-0.001181,-0.009418,-0.013994,-0.006348,-0.002154,-0.013994,-0.005757,-0.007587,-0.009418,-0.012164,-0.009018,0.006141,-0.013079,-0.000266,-0.011248,-0.013079,-0.012164,-0.013079,-0.012164,-0.011248,-0.013079,-0.011248,-0.003011,-0.008503,-0.010333,-0.013994,-0.011248,...,-0.003011,-0.010333,0.005226,-0.009418,-0.007587,-0.003926,-0.013994,-0.003926,-0.013079,-0.005757,-0.003926,-0.013079,-0.009418,-0.009418,-0.012164,-0.011248,-0.011248,-0.012164,-0.011248,-0.012164,-0.010333,-0.011248,-0.010333,-0.013079,-0.013994,-0.013994,-0.013079,-0.012164,-0.013079,-0.013079,-0.013994,-0.010333,-0.013994,-0.013079,-0.012164,-0.013079,0.0217,-0.000266,-0.011248,-0.012164
2,-0.003995,-0.002727,0.007842,-0.006532,-0.006954,-0.003995,-0.00019,-0.007377,-0.007377,-0.004841,-0.001459,-0.006954,-0.005686,-0.001459,-0.005263,-0.007377,-0.005263,-0.002727,-0.007377,-0.003572,-0.004418,-0.005263,-0.006532,-0.006954,0.001923,-0.006954,-0.001036,-0.006109,-0.006954,0.00095,-0.006954,-0.006532,-0.006109,-0.006954,-0.006109,-0.002304,-0.004841,-0.005686,-0.007377,-0.006109,...,0.001118,-0.002264,0.001501,-0.005263,-0.004418,-0.002727,-0.007377,-0.002727,-0.006954,-0.003572,-0.002727,-0.006954,-0.005263,-0.005263,-0.006532,-0.006109,-0.006109,-0.006532,-0.006109,-0.006532,-0.005686,-0.006109,-0.005686,-0.006954,-0.007377,-0.007377,-0.006954,-0.006532,-0.006954,-0.006954,-0.007377,-0.005686,-0.007377,-0.006954,-0.006532,-0.006954,0.00911,-0.001036,-0.006109,-0.006532
3,-0.000947,0.001399,0.020943,-0.005637,-0.006419,-0.000947,0.011446,-0.006087,-0.007201,-0.00251,0.003744,-0.006419,-0.002413,0.004783,-0.003292,-0.007201,-0.003292,0.001887,-0.007201,-0.000165,0.000499,-0.003292,-0.005637,-0.006419,0.011112,-0.006419,0.004526,-0.004856,-0.006419,-0.005637,-0.006419,-0.005637,-0.004856,-0.006419,-0.004856,0.004258,-0.00251,-0.00296,-0.007201,-0.004856,...,0.003138,-0.003117,0.009216,-0.003292,-0.001729,0.00638,-0.007201,0.001399,-0.006419,-0.000165,0.001399,-0.006419,-0.003292,-0.000564,-0.005637,-0.004856,-0.004856,-0.005637,-0.004856,-0.005637,-0.004074,-0.004856,-0.004074,-0.006419,-0.007201,-0.007201,-0.006419,-0.005637,-0.006419,-0.006419,-0.007201,-0.004074,-0.007201,-0.006419,-0.005637,-0.006419,0.023288,0.004526,-0.004856,-0.005637
4,-0.005308,-0.003188,0.014476,-0.001624,-0.010254,-0.005308,0.001051,-0.010961,-0.010961,-0.006721,-0.001069,-0.010254,-0.008135,-0.001069,-0.007428,-0.010961,-0.007428,-0.003188,-0.010961,-0.004602,-0.006015,-0.007428,-0.009548,-0.010254,0.004584,-0.010254,-0.000362,-0.008841,-0.010254,-0.004087,-0.010254,-0.003442,-0.008841,-0.010254,-0.008841,-0.002482,-0.006721,-0.008135,-0.010961,-0.008841,...,-0.002482,0.000479,0.009335,-0.007428,-0.006015,-0.003188,-0.010961,-0.003188,-0.010254,-0.004602,0.003862,-0.010254,-0.007428,-0.007428,-0.001356,-0.008841,-0.008368,-0.009548,-0.008841,-0.009548,-0.008135,-0.008841,-0.008135,-0.006393,-0.010961,-0.010961,-0.010254,-0.009548,-0.010254,-0.010254,-0.010961,-0.008135,-0.010961,-0.010254,-0.009548,-0.010254,0.016596,-0.000362,-0.008841,-0.009548


### Content Based Filtering

In [30]:
prod_feats_df = img_feat_df.merge(pa_df, on='productid', how='left')

In [31]:
prod_feats_df.fillna(0, inplace=True)

In [32]:
prod_feats_df.sort_values(ascending=True, by='productid', inplace=True)

In [33]:
prod_feats_df = prod_feats_df.set_index('productid')

In [34]:
prod_feats_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,attributevalue_194,attributevalue_195,attributevalue_196,attributevalue_197,attributevalue_198,attributevalue_199,attributevalue_200,attributevalue_201,attributevalue_202,attributevalue_203,attributevalue_204,attributevalue_205,attributevalue_206,attributevalue_207,attributevalue_208,attributevalue_209,attributevalue_210,attributevalue_211,attributevalue_212,attributevalue_213,attributevalue_214,attributevalue_215,attributevalue_216,attributevalue_217,attributevalue_218,attributevalue_219,attributevalue_220,attributevalue_221,attributevalue_222,attributevalue_223,attributevalue_224,attributevalue_225,attributevalue_226,attributevalue_227,attributevalue_228,attributevalue_229,attributevalue_230,attributevalue_231,attributevalue_232,attributevalue_233
productid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
11139192,0.002809,0.141478,0.006376,0.007968,0.173497,0.002809,0.009415,0.003392,0.08794,0.023752,0.141605,0.003434,0.008882,0.002139,0.008461,0.028743,0.001273,0.023865,0.002335,0.001389,0.031475,0.079425,0.267076,0.002454,0.105888,0.141864,0.32313,0.001989,0.00467,0.144718,0.006516,0.096374,0.077092,0.066699,0.210829,0.078881,0.02144,0.036647,1.453079,0.033968,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11139194,0.003088,0.129808,0.008812,0.006858,0.192971,0.002551,0.00929,0.003388,0.063547,0.024097,0.119526,0.003048,0.008769,0.002243,0.009709,0.02968,0.001545,0.012371,0.002184,0.001716,0.032679,0.085714,0.291426,0.002575,0.17244,0.060002,0.309105,0.002239,0.005433,0.144477,0.00465,0.095803,0.095482,0.069577,0.178093,0.058798,0.019926,0.039278,1.467373,0.036233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11139524,0.002465,0.150426,0.012537,0.008641,0.193621,0.002874,0.007975,0.002605,0.123589,0.019207,0.130641,0.004288,0.010044,0.00332,0.009822,0.021989,0.00148,0.015524,0.002639,0.001093,0.03435,0.081671,0.253273,0.002739,0.106045,0.098959,0.316819,0.002427,0.006494,0.15924,0.007658,0.101226,0.090361,0.066757,0.221146,0.062729,0.012088,0.032989,1.411619,0.02887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11139560,0.002333,0.064706,0.014644,0.010657,0.081996,0.002827,0.007093,0.003096,0.076784,0.016698,0.150028,0.00364,0.00782,0.00393,0.009372,0.020984,0.001911,0.017014,0.003296,0.000919,0.027199,0.061142,0.227683,0.002693,0.056398,0.100444,0.317477,0.002516,0.007308,0.191347,0.011845,0.090973,0.073453,0.050319,0.237367,0.072556,0.016927,0.038256,1.328586,0.026655,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11139588,0.002665,0.213627,0.008963,0.009901,0.160075,0.002989,0.005288,0.002089,0.075852,0.019718,0.150696,0.00339,0.008626,0.002246,0.011107,0.028415,0.001403,0.014761,0.00256,0.001216,0.032018,0.101339,0.22791,0.002436,0.120356,0.038717,0.316635,0.001547,0.007106,0.136396,0.006629,0.132243,0.076432,0.047606,0.185864,0.099494,0.013652,0.029453,1.4331,0.032901,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
prod_feats_df.shape

(3026, 499)

#### Find User profile product  matrix for all users who are in test set. weighted sum of product features is used.

In [36]:
user_prof_df = pd.DataFrame(index=sorted(test_df['UserId'].unique()), columns=prod_feats_df.columns.values)

In [37]:
for user in sorted(test_df['UserId'].unique()):
    user_prof_df.loc[user] = ((prod_feats_df.values  * data_matrix_df.loc[user].values.reshape(3026,1)).sum(axis=0)) / data_matrix_df.loc[user].sum()

In [41]:
user_prof_df.shape

(2350, 499)

In [38]:
user_prof_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,attributevalue_194,attributevalue_195,attributevalue_196,attributevalue_197,attributevalue_198,attributevalue_199,attributevalue_200,attributevalue_201,attributevalue_202,attributevalue_203,attributevalue_204,attributevalue_205,attributevalue_206,attributevalue_207,attributevalue_208,attributevalue_209,attributevalue_210,attributevalue_211,attributevalue_212,attributevalue_213,attributevalue_214,attributevalue_215,attributevalue_216,attributevalue_217,attributevalue_218,attributevalue_219,attributevalue_220,attributevalue_221,attributevalue_222,attributevalue_223,attributevalue_224,attributevalue_225,attributevalue_226,attributevalue_227,attributevalue_228,attributevalue_229,attributevalue_230,attributevalue_231,attributevalue_232,attributevalue_233
2,0.00240316,0.0570552,0.016046,0.01389,0.112636,0.00248553,0.00902976,0.0037423,0.0688358,0.0217775,0.152798,0.00306676,0.00882331,0.0032352,0.0113432,0.0186459,0.00166386,0.0134963,0.00366296,0.000970252,0.0293301,0.0626684,0.234586,0.00251082,0.208099,0.0941951,0.305567,0.00275978,0.00503578,0.12935,0.00830982,0.0594594,0.112797,0.0681741,0.355862,0.0375611,0.0142055,0.0371733,1.23923,0.0212428,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28,0.00226293,0.0677918,0.0200609,0.0122073,0.0996468,0.00222733,0.00779636,0.0043594,0.0723457,0.0195489,0.114618,0.00292939,0.00844551,0.00320119,0.0098201,0.0235089,0.00172839,0.0172065,0.00292557,0.00109987,0.0283268,0.0564325,0.229732,0.00272123,0.165637,0.168229,0.317399,0.00268555,0.00629415,0.131822,0.0115195,0.056956,0.0987509,0.0533434,0.263957,0.0459274,0.0160782,0.0373872,1.33606,0.0201894,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.111111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36,0.00229675,0.0772629,0.0196574,0.0157015,0.118922,0.00254686,0.0068339,0.00307378,0.0735576,0.0181622,0.112142,0.00312441,0.00756511,0.00287913,0.0119174,0.0186249,0.00192643,0.011676,0.00284197,0.00104553,0.0256678,0.0610224,0.198041,0.00279389,0.120591,0.0486721,0.316422,0.00230069,0.00631624,0.123782,0.010797,0.0828692,0.0743526,0.0546044,0.244236,0.040934,0.0163101,0.0352981,1.31845,0.0227424,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
38,0.00207412,0.124231,0.0165958,0.0147922,0.113071,0.00242978,0.00747137,0.00352825,0.0762359,0.0179274,0.117859,0.00305936,0.00778421,0.00302131,0.00895657,0.0215592,0.00184837,0.01466,0.00278833,0.00118969,0.0282519,0.0715532,0.213578,0.00288567,0.0763704,0.0783012,0.340685,0.002594,0.00615773,0.122209,0.00888564,0.0710897,0.0922297,0.047779,0.238585,0.0401775,0.0178007,0.0345901,1.31957,0.0246873,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41,0.00227038,0.16644,0.0148195,0.0164105,0.0964283,0.00259993,0.00672736,0.00287127,0.094876,0.0180548,0.109315,0.0028799,0.00774829,0.0030107,0.0102697,0.01611,0.00151835,0.0170009,0.00309528,0.00109477,0.026781,0.0811763,0.201564,0.0029855,0.168608,0.0246068,0.318943,0.00284331,0.00541361,0.128175,0.0117229,0.0715103,0.10321,0.0492597,0.307712,0.0522079,0.0174528,0.0336505,1.28109,0.0271702,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
user_prof_prods_similarity = pairwise_distances(user_prof_df, prod_feats_df, metric='cosine')

In [48]:
user_prof_prods_similarity_df = 1 - pd.DataFrame(user_prof_prods_similarity, index=user_prof_df.index.values, columns=prod_feats_df.index.values)

In [49]:
# prediction
user_prof_prods_similarity_df.head()

Unnamed: 0,11139192,11139194,11139524,11139560,11139588,11139650,11141306,11141308,11141318,11141320,11141324,11141326,11141328,11141330,11141338,11141340,11141342,11141346,11141354,11141530,11141538,11141644,11144136,11144260,11145600,11145602,11145612,11145614,11145620,11145624,11145626,11145634,11145640,11145642,11145654,11145664,11145666,11145684,11145702,11145726,...,13773572,13773574,13773576,13773578,13773580,14101712,14104334,14110862,14110864,14110866,14110868,14121822,14121824,14121826,14121828,14121830,14121832,14121834,14121836,14121838,14121840,14121842,14121844,14121846,14121848,14122346,14122348,14122744,14122748,14122750,14122758,14122762,14122766,14122818,14122820,14122822,14122832,14122834,14128359,14129477
2,0.773045,0.781728,0.867489,0.844236,0.733601,0.852248,0.747821,0.725141,0.703944,0.705724,0.724746,0.691491,0.694826,0.670169,0.727237,0.721792,0.677691,0.718812,0.728294,0.759165,0.781923,0.780848,0.767787,0.785269,0.823262,0.819253,0.817612,0.810761,0.816447,0.782337,0.806853,0.750268,0.828003,0.788172,0.799552,0.783797,0.820774,0.774044,0.818047,0.816226,...,0.762018,0.759481,0.801522,0.762784,0.769309,0.806028,0.639576,0.71243,0.73022,0.709571,0.705006,0.779572,0.771631,0.76899,0.781065,0.7569,0.786874,0.783049,0.765268,0.775359,0.783611,0.766466,0.763719,0.767017,0.764989,0.677971,0.672979,0.729602,0.800444,0.795369,0.789794,0.789662,0.792421,0.827565,0.796804,0.849889,0.79303,0.794027,0.679587,0.769251
28,0.843897,0.832445,0.886408,0.876519,0.796577,0.895249,0.780446,0.796414,0.777643,0.772864,0.792429,0.774705,0.755372,0.741166,0.794986,0.78638,0.751389,0.77955,0.796777,0.827386,0.863015,0.855293,0.865311,0.861628,0.848868,0.895194,0.887617,0.88854,0.858608,0.857974,0.902149,0.829023,0.872081,0.833902,0.879967,0.88547,0.895594,0.875695,0.861945,0.870821,...,0.848097,0.858368,0.872338,0.861244,0.859998,0.87977,0.704777,0.784958,0.788909,0.781705,0.779446,0.862362,0.863022,0.866939,0.879931,0.843933,0.877256,0.886303,0.86502,0.863616,0.856808,0.839695,0.853641,0.855745,0.836813,0.751481,0.759929,0.783985,0.86334,0.901059,0.881509,0.8896,0.884421,0.906885,0.892858,0.919505,0.919441,0.899009,0.780642,0.848202
36,0.854691,0.82897,0.899553,0.884751,0.825268,0.883165,0.823234,0.846922,0.826288,0.80824,0.823878,0.846187,0.786627,0.772248,0.832943,0.838415,0.793352,0.808078,0.831105,0.835853,0.863052,0.842702,0.8899,0.850989,0.891769,0.90439,0.87742,0.89518,0.888754,0.905833,0.893384,0.824407,0.888323,0.87757,0.904846,0.912249,0.915423,0.914892,0.886905,0.905968,...,0.910583,0.878151,0.890108,0.891584,0.89353,0.877472,0.734053,0.826704,0.823478,0.818754,0.805397,0.913993,0.890612,0.883388,0.912228,0.88931,0.904419,0.902243,0.899174,0.90587,0.899617,0.900441,0.888323,0.889718,0.89424,0.767059,0.786507,0.806232,0.885015,0.928658,0.91289,0.913139,0.918585,0.892483,0.870617,0.88457,0.912467,0.912618,0.796806,0.884355
38,0.877229,0.857486,0.918777,0.913497,0.836881,0.91365,0.831959,0.853514,0.839205,0.816615,0.834906,0.846325,0.799054,0.785552,0.842955,0.846832,0.804013,0.812522,0.839379,0.853632,0.874845,0.868188,0.899756,0.867663,0.891109,0.920758,0.892016,0.906194,0.893228,0.909356,0.906762,0.844586,0.914456,0.889547,0.9068,0.924314,0.928105,0.911469,0.908602,0.923866,...,0.899719,0.878484,0.896604,0.892162,0.889066,0.881969,0.743103,0.842069,0.846148,0.834715,0.826432,0.909435,0.89268,0.887481,0.909731,0.894256,0.908991,0.904501,0.897283,0.906538,0.905962,0.902172,0.901518,0.887561,0.894415,0.769542,0.78881,0.81055,0.883671,0.932489,0.913019,0.915446,0.920857,0.907243,0.870233,0.915209,0.909347,0.917764,0.794917,0.886163
41,0.830266,0.808754,0.900972,0.898813,0.807747,0.868522,0.814633,0.838072,0.819518,0.783981,0.812453,0.832582,0.769706,0.774487,0.830074,0.842534,0.772166,0.783473,0.815401,0.809269,0.863983,0.824931,0.872322,0.827893,0.890051,0.897123,0.848941,0.868636,0.860495,0.884093,0.856982,0.804808,0.894638,0.87051,0.875609,0.881292,0.900688,0.890656,0.910002,0.90553,...,0.878791,0.82972,0.850019,0.858712,0.860731,0.836044,0.688555,0.819591,0.829975,0.815715,0.787811,0.883561,0.879081,0.86724,0.882985,0.881726,0.883062,0.883031,0.870855,0.888919,0.887419,0.884486,0.880812,0.856772,0.885619,0.713246,0.74201,0.779043,0.83705,0.907789,0.895194,0.89823,0.900051,0.858757,0.818601,0.842505,0.846523,0.90131,0.732272,0.865191


### Get top 10 predictions

get prediction from item_prediction_df

In [54]:
item_pred_sub = pd.DataFrame(columns=['UserId', 'product_list'])
item_pred_sub['UserId'] = test_df['UserId'].values

In [75]:
sub_list = []
for user in item_pred_sub['UserId'].values:
    sub_list.append(item_prediction_df.loc[user].sort_values(ascending=False)[0:10].index.values.tolist())

In [77]:
item_pred_sub['product_list'] = sub_list

In [80]:
item_pred_sub.head()

Unnamed: 0,UserId,product_list
0,2,"[11659418, 12936758, 11407244, 12995900, 12406..."
1,28,"[13650254, 13039772, 12407206, 11659818, 12437..."
2,36,"[13650238, 11360718, 12936666, 11150104, 11714..."
3,38,"[11659790, 11659724, 12423464, 11147300, 11407..."
4,41,"[11360904, 13650186, 11659466, 12995900, 11659..."


In [81]:
item_pred_sub.to_csv('item_pred_sub.csv', index=False)

In [83]:
FileLink('item_pred_sub.csv')

0.0045755938 Public LB score

get prediction from user_prediction_df

In [84]:
user_pred_sub = pd.DataFrame(columns=['UserId', 'product_list'])
user_pred_sub['UserId'] = test_df['UserId'].values

In [85]:
sub_list = []
for user in user_pred_sub['UserId'].values:
    sub_list.append(user_prediction_df.loc[user].sort_values(ascending=False)[0:10].index.values.tolist())

In [86]:
user_pred_sub['product_list'] = sub_list
user_pred_sub.head()

Unnamed: 0,UserId,product_list
0,2,"[11660064, 12407154, 12658512, 12371280, 12973..."
1,28,"[12360404, 11659838, 12658512, 13039778, 12407..."
2,36,"[12360448, 12407648, 12658512, 11460916, 12407..."
3,38,"[12406972, 11659712, 11659798, 12658512, 12407..."
4,41,"[11659236, 12658512, 12407154, 12973004, 12371..."


In [87]:
user_pred_sub.to_csv('user_pred_sub.csv', index=False)
FileLink('user_pred_sub.csv')

0.0232522796 public LB score

get prediction from user_prof_prods_similarity_df

In [88]:
content_pred_sub = pd.DataFrame(columns=['UserId', 'product_list'])
content_pred_sub['UserId'] = test_df['UserId'].values

In [89]:
sub_list = []
for user in content_pred_sub['UserId'].values:
    sub_list.append(user_prof_prods_similarity_df.loc[user].sort_values(ascending=False)[0:10].index.values.tolist())

In [90]:
content_pred_sub['product_list'] = sub_list
content_pred_sub.head()

Unnamed: 0,UserId,product_list
0,2,"[11660064, 13653526, 12407202, 11407206, 12657..."
1,28,"[12407652, 11659838, 11659818, 12360402, 13039..."
2,36,"[12407648, 12360448, 11407416, 11460916, 11360..."
3,38,"[11659798, 11659238, 12658406, 11659832, 12371..."
4,41,"[11659236, 11659238, 11659232, 12407916, 12407..."


In [91]:
content_pred_sub.to_csv('content_pred_sub.csv', index=False)
FileLink('content_pred_sub.csv')

#### 0.0289288529 public LB score | Final model

get predictions of average score of content based and user user similarity based collaborative filtering.

In [96]:
avg_pred_df = user_prediction_df.loc[sorted(test_df['UserId'].values)] + user_prof_prods_similarity_df

In [97]:
avg_cont_user_pred_sub = pd.DataFrame(columns=['UserId', 'product_list'])
avg_cont_user_pred_sub['UserId'] = test_df['UserId'].values

sub_list = []
for user in avg_cont_user_pred_sub['UserId'].values:
    sub_list.append(avg_pred_df.loc[user].sort_values(ascending=False)[0:10].index.values.tolist())

avg_cont_user_pred_sub['product_list'] = sub_list
avg_cont_user_pred_sub.head()

Unnamed: 0,UserId,product_list
0,2,"[11660064, 12371280, 12407202, 12407154, 13653..."
1,28,"[12360404, 11659838, 12658512, 13039778, 12407..."
2,36,"[12360448, 12407648, 11460916, 12658512, 12407..."
3,38,"[12406972, 11659712, 11659798, 12658512, 12407..."
4,41,"[11659236, 12658512, 12407154, 12657360, 12407..."


In [98]:
avg_cont_user_pred_sub.to_csv('avg_content_user_pred_sub.csv', index=False)
FileLink('avg_content_user_pred_sub.csv')

0.024782055611842845. public LB

weighted avg of content and user-user similarity based collaborative filtering

In [105]:
avg_pred_df = (user_prediction_df.loc[sorted(test_df['UserId'].values)] * 0.1) + (user_prof_prods_similarity_df * 0.9)

In [106]:
we1_cont_user_pred_sub = pd.DataFrame(columns=['UserId', 'product_list'])
we1_cont_user_pred_sub['UserId'] = test_df['UserId'].values

sub_list = []
for user in we1_cont_user_pred_sub['UserId'].values:
    sub_list.append(avg_pred_df.loc[user].sort_values(ascending=False)[0:10].index.values.tolist())

we1_cont_user_pred_sub['product_list'] = sub_list
we1_cont_user_pred_sub.head()

Unnamed: 0,UserId,product_list
0,2,"[11660064, 13653526, 12407202, 12371280, 11407..."
1,28,"[11659838, 12360404, 12407652, 13039778, 11659..."
2,36,"[12360448, 12407648, 11460916, 12407730, 11407..."
3,38,"[12406972, 11659798, 11659712, 12658406, 11659..."
4,41,"[11659236, 11659238, 11659232, 12407916, 12407..."


In [107]:
we1_cont_user_pred_sub.to_csv('we1_content_user_pred_sub.csv', index=False)
FileLink('we1_content_user_pred_sub.csv')

0.0274791799567213 public LB

It looks like content based filtering is giving high score

In [110]:
del data_matrix_df, user_similarity_df, product_similarity_df, user_prof_df
gc.collect()

983

### Alternating Least Squares Collaborative filtering

In [113]:
import scipy.sparse as sparse
import implicit
import csv

In [112]:
train_df = train_df.copy()

sparse_item_user = sparse.csr_matrix((train_df['Quantity'].astype(float), (train_df['productid'], train_df['UserId'])))
sparse_user_item = sparse.csr_matrix((train_df['Quantity'].astype(float), (train_df['UserId'], train_df['productid'])))

# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=70, regularization=0.6, iterations=20)

# Calculate the confidence by multiplying it by alpha value.
alpha_val = 10
data_conf = (sparse_item_user * alpha_val).astype('double')

# Fit the model
model.fit(data_conf)

100%|██████████| 20.0/20 [01:38<00:00,  4.16s/it]


In [114]:
# Create recommendations for all users
fields=['UserId','product_list']
filename = 'submission_als.csv'
with open(filename,'a') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    userids = test_df['UserId'].drop_duplicates().values.tolist()
    for user in userids:
        products = []
        scores = []
        results = []
        results.append(user)
        recommended = model.recommend(user, sparse_user_item)

        for item in recommended:
            idx, score = item
            products.append(idx)
            scores.append(score)
        results.append(products)
        #print products
        #print scores
        writer.writerow(results)

In [115]:
os.listdir()

['submission_als.csv',
 '.ipynb_checkpoints',
 'we1_content_user_pred_sub.csv',
 'item_pred_sub.csv',
 'content_pred_sub.csv',
 'user_pred_sub.csv',
 'avg_content_user_pred_sub.csv',
 '__notebook_source__.ipynb']

In [117]:
pd.read_csv('submission_als.csv').head()

Unnamed: 0,UserId,product_list
0,2,"[11360778, 12407154, 11659236, 12658406, 11407..."
1,28,"[11659836, 11659538, 11660064, 12407726, 12360..."
2,36,"[12658482, 12407006, 12360452, 11659632, 12407..."
3,38,"[12360390, 12407734, 12322666, 12658230, 11659..."
4,41,"[12657360, 12407010, 12407436, 11659998, 11659..."


In [118]:
FileLink('submission_als.csv')

0.01 public LB