In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors

df_rates = pd.read_csv('data/rates.csv')
df_bands = pd.read_csv('data/bands.csv')

In [2]:
idx = df_bands.BandID.isin(df_rates.BandID)
df_bands = df_bands.loc[idx]

enc_band = LabelEncoder()
enc_bar = LabelEncoder()
enc_band = enc_band.fit(df_rates.BandID.values)
enc_bar = enc_bar.fit(df_rates.BarID.values)

df_rates.BarID = enc_bar.transform(df_rates.BarID.values)
df_rates.BandID = enc_band.transform(df_rates.BandID.values)
df_bands.BandID = enc_band.transform(df_bands.BandID.values)

In [3]:
R = coo_matrix((df_rates.Consert.values, (df_rates.BarID.values, df_rates.BandID.values))).asfptype()  # BAR-BAND-yep-no
u, s, vt = svds(R, k=5)  # szatoe priznakovoe prostranstvo

k_nn = 5
nn = NearestNeighbors(n_neighbors=k_nn)
nn.fit(vt.T)
_, idn = nn.kneighbors(vt.T, n_neighbors=k_nn)

In [4]:
band_titles = df_bands.sort_values('BandID').loc[:, 'Names'].values
cols = ['band']+['nn_{}'.format(i) for i in range(1, k_nn)]
df_nn = pd.DataFrame(data=band_titles[idn], columns=cols)

In [5]:
df_nn

Unnamed: 0,band,nn_1,nn_2,nn_3,nn_4
0,50 Second to Mars,Ariana Grande,Pentatonix,word 9,some word 77
1,Ariana Grande,50 Second to Mars,Pentatonix,word 9,some word 77
2,Imagine Dragons,50 Second to Mars,Pentatonix,word 9,Ariana Grande
3,Antitela,50 Second to Mars,Pentatonix,word 9,Ariana Grande
4,Pentatonix,word 9,50 Second to Mars,Ariana Grande,Antitela
5,word 9,Pentatonix,50 Second to Mars,some word 77,Ariana Grande
6,some word 77,50 Second to Mars,word 9,Ariana Grande,Pentatonix


# User-similatity

In [61]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine, pdist, squareform
import numpy as np

In [62]:
D = cosine_similarity(R)

def similarity(u, v):
    idx = (u != 0) & (v != 0)
    if np.any(idx):
        sim = -cosine(u[idx], v[idx])+1
        return sim
    else:
        return 0
    
    
res = pdist(R.toarray(), similarity)
squareform(res)

array([[0.        , 0.        , 1.        , 0.        , 1.        ,
        0.24987802, 1.        ],
       [0.        , 0.        , 0.        , 1.        , 1.        ,
        0.        , 0.        ],
       [1.        , 0.        , 0.        , 1.        , 1.        ,
        1.        , 0.        ],
       [0.        , 1.        , 1.        , 0.        , 0.90582163,
        0.        , 0.        ],
       [1.        , 1.        , 1.        , 0.90582163, 0.        ,
        1.        , 1.        ],
       [0.24987802, 0.        , 1.        , 0.        , 1.        ,
        0.        , 1.        ],
       [1.        , 0.        , 0.        , 0.        , 1.        ,
        1.        , 0.        ]])

# Turicreate

In [6]:
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")

data = df_rates

def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['conserts_dummy'] = 1
    return data_dummy

data_dummy = create_data_dummy(data)
data_dummy

Unnamed: 0,BarID,BandID,Consert,conserts_dummy
0,3,1,10,1
1,3,0,4,1
2,3,4,1,1
3,4,6,2,1
4,4,4,2,1
5,4,1,5,1
6,4,0,5,1
7,2,3,1,1
8,2,5,3,1
9,1,0,1,1


In [7]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='Consert', index='BarID', columns='BandID')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index() 
    d.index.names = ['Consert'] 
    data_norm = pd.melt(d, id_vars=['BarID'], value_name='Consert').dropna()
    return data_norm

data_norm = normalize_data(data)
data_norm

Unnamed: 0,BarID,BandID,Consert
1,1,0,0.0
3,3,0,0.75
4,4,0,1.0
10,3,1,1.0
11,4,1,0.0
21,0,3,0.0
23,2,3,0.0
26,5,3,1.0
30,2,4,1.0
31,3,4,0.0


In [14]:
def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [15]:
users_to_recommend = list(set(data['BarID']))
n_rec = 5  # number of items to recommend
n_display = 10  # to display the first few rows in an output dataset

In [16]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, user_id=user_id, item_id=item_id, target=target)
        
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, user_id=user_id, item_id=item_id, target=target, 
                                                      similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, user_id=user_id, item_id=item_id, target=target, 
                                                    similarity_type='pearson')
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [17]:
popularity_model = model(train_data_norm, 'popularity', 'BarID', 'BandID', 'Consert', users_to_recommend, n_rec, n_display)

+-------+--------+--------------------+------+
| BarID | BandID |       score        | rank |
+-------+--------+--------------------+------+
|   0   |   1    |        1.0         |  1   |
|   0   |   0    |        0.75        |  2   |
|   0   |   4    | 0.4444444444444444 |  3   |
|   1   |   1    |        1.0         |  1   |
|   1   |   0    |        0.75        |  2   |
|   1   |   5    |        0.5         |  3   |
|   1   |   4    | 0.4444444444444444 |  4   |
|   1   |   6    |       0.375        |  5   |
|   2   |   1    |        1.0         |  1   |
|   2   |   0    |        0.75        |  2   |
+-------+--------+--------------------+------+
[27 rows x 4 columns]



In [20]:
name = 'cosine'
cos = model(train_data, name, 'BarID', 'BandID', 'Consert', users_to_recommend, n_rec, n_display)

+-------+--------+---------------------+------+
| BarID | BandID |        score        | rank |
+-------+--------+---------------------+------+
|   0   |   0    |  0.4962916374206543 |  1   |
|   0   |   4    |  0.2876780033111572 |  2   |
|   0   |   1    |  0.2876780033111572 |  3   |
|   0   |   2    |         0.0         |  4   |
|   1   |   1    | 0.44854262471199036 |  1   |
|   1   |   4    |  0.1725163757801056 |  2   |
|   1   |   6    | 0.08271527290344238 |  3   |
|   1   |   3    |         0.0         |  4   |
|   1   |   5    |         0.0         |  5   |
|   2   |   0    |  1.3801310062408447 |  1   |
+-------+--------+---------------------+------+
[32 rows x 4 columns]



In [21]:
name = 'pearson'
pear = model(train_data, name, 'BarID', 'BandID', 'Consert', users_to_recommend, n_rec, n_display)

+-------+--------+--------------------+------+
| BarID | BandID |       score        | rank |
+-------+--------+--------------------+------+
|   0   |   2    |        15.0        |  1   |
|   0   |   1    | 7.6550817886988325 |  2   |
|   0   |   0    | 3.3333333333333335 |  3   |
|   0   |   4    | 3.155081788698832  |  4   |
|   1   |   1    | 7.355531556265695  |  1   |
|   1   |   5    |        5.0         |  2   |
|   1   |   3    |        4.0         |  3   |
|   1   |   6    | 3.132966237408774  |  4   |
|   1   |   4    | 2.708210127694266  |  5   |
|   2   |   2    |        15.0        |  1   |
+-------+--------+--------------------+------+
[32 rows x 4 columns]



In [22]:
models_w_counts = [popularity_model, cos, pear]

names_w_counts = ['Popularity Model on Consert Counts', 'Cosine Similarity on Consert Counts', 'Pearson Similarity on Consert Counts']

In [23]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Consert Counts

Precision and recall summary statistics by cutoff
+--------+----------------+-------------+
| cutoff | mean_precision | mean_recall |
+--------+----------------+-------------+
|   1    |      0.0       |     0.0     |
|   2    |      0.0       |     0.0     |
|   3    |      0.0       |     0.0     |
|   4    |      0.0       |     0.0     |
|   5    |      0.0       |     0.0     |
|   6    |      0.0       |     0.0     |
|   7    |      0.0       |     0.0     |
|   8    |      0.0       |     0.0     |
|   9    |      0.0       |     0.0     |
|   10   |      0.0       |     0.0     |
+--------+----------------+-------------+
[10 rows x 3 columns]


Overall RMSE: 1.5209443135559801

Per User RMSE (best)
+-------+--------------------+-------+
| BarID |        rmse        | count |
+-------+--------------------+-------+
|   3   | 0.5555555555555556 |   1   |
+-------+--------------------+-------+
[1 rows x 3 columns]


Per 