# Item-similarity

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors


def similarity_matrix(input_df, k_nn = 5):
    enc = LabelEncoder()
    enc = enc.fit(input_df.iloc[:, 0].values)
    input_df.iloc[:, 0] = enc.transform(input_df.iloc[:, 0].values)

    enc = enc.fit(input_df.iloc[:, 1].values)
    input_df.iloc[:, 1] = enc.transform(input_df.iloc[:, 1].values)

    R = coo_matrix((input_df.iloc[:, 2].values, (input_df.iloc[:, 0].values, input_df.iloc[:, 1].values))).asfptype()
    u, s, vt = svds(R, k=k_nn)
    
    nn = NearestNeighbors(n_neighbors=k_nn)
    nn.fit(vt.T)
    _, idn = nn.kneighbors(vt.T, n_neighbors=k_nn)
    return idn


def table_related_names(df_input, matrix, k_nn=5):
    titles = df_input.sort_values(df_input.columns[0]).iloc[:, 1].values
    cols = ['Name']+['related_{}'.format(i) for i in range(1, k_nn)]
    result = pd.DataFrame(data=titles[matrix], columns=cols)
    return result


In [2]:
df_rates = pd.read_csv('data/rates.csv')
df_bands = pd.read_csv('data/bands.csv')

similarity_matrix(df_rates)
table_related_names(df_bands, similarity_matrix(df_rates))

Unnamed: 0,Name,related_1,related_2,related_3,related_4
0,50 Second to Mars,Ariana Grande,Pentatonix,some 8,word 9
1,Ariana Grande,50 Second to Mars,Pentatonix,some 8,word 9
2,Imagine Dragons,50 Second to Mars,Pentatonix,some 8,Ariana Grande
3,Antitela,50 Second to Mars,Pentatonix,some 8,Ariana Grande
4,Pentatonix,some 8,50 Second to Mars,Ariana Grande,Antitela
5,some 8,Pentatonix,50 Second to Mars,word 9,Ariana Grande
6,word 9,50 Second to Mars,some 8,Ariana Grande,Pentatonix


# User-similatity

In [3]:
from scipy.spatial.distance import cosine, pdist, squareform
import numpy as np


def similarity(u, v):
    idx = (u != 0) & (v != 0)
    if np.any(idx):
        sim = -cosine(u[idx], v[idx])+1
        return sim
    else:
        return 0


def user_similarity(input_df):    
    R = coo_matrix((input_df.iloc[:, 2].values, (input_df.iloc[:, 0].values, input_df.iloc[:, 1].values))).asfptype()
    res = pdist(R.toarray(), similarity)
    return squareform(res)


user_similarity(df_rates)

array([[0.        , 0.        , 1.        , 0.        , 1.        ,
        0.24987802, 1.        ],
       [0.        , 0.        , 0.        , 1.        , 1.        ,
        0.        , 0.        ],
       [1.        , 0.        , 0.        , 1.        , 1.        ,
        1.        , 0.        ],
       [0.        , 1.        , 1.        , 0.        , 0.90582163,
        0.        , 0.        ],
       [1.        , 1.        , 1.        , 0.90582163, 0.        ,
        1.        , 1.        ],
       [0.24987802, 0.        , 1.        , 0.        , 1.        ,
        0.        , 1.        ],
       [1.        , 0.        , 0.        , 0.        , 1.        ,
        1.        , 0.        ]])

# Turicreate models

In [19]:
import turicreate as tc
from sklearn.model_selection import train_test_split
import sys


def normalize_data(data):
    cols = data.columns
    df_matrix = pd.pivot_table(data, values=cols[2], index=cols[0], columns=cols[1])
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    
    d = df_matrix_norm.reset_index() 
    d.index.names = [cols[2]+'-norm']

    data_norm = pd.melt(d, id_vars=[cols[0]], value_name=cols[2]).dropna()
    return data_norm


def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    return train, test


def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, user_id=user_id, item_id=item_id, target=target)
        
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, user_id=user_id, item_id=item_id, target=target, 
                                                      similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, user_id=user_id, item_id=item_id, target=target, 
                                                    similarity_type='pearson')
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model


def modeling (data, method='popularity', n_rec = 5, n_display = 15):
    cols = data.columns
    users_to_recommend = list(set(data[cols[0]]))
    data = tc.SFrame(data)
    m = model(data, method, cols[0], cols[1], cols[2], users_to_recommend, n_rec, n_display)
    return m

In [20]:
data_norm = normalize_data(df_rates)
train_data, test_data = split_data(data_norm)

popularity_model = modeling(train_data)
# cos = modeling(train_data, 'cosine')
# pear = modeling(train_data, 'pearson')

popularity_model.recommend()

+-------+--------+--------------------+------+
| BarID | BandID |       score        | rank |
+-------+--------+--------------------+------+
|   0   |   3    |        1.0         |  1   |
|   0   |   0    | 0.5833333333333334 |  2   |
|   0   |   4    | 0.4444444444444444 |  3   |
|   0   |   1    |        0.0         |  4   |
|   1   |   3    |        1.0         |  1   |
|   1   |   5    |        0.5         |  2   |
|   1   |   4    | 0.4444444444444444 |  3   |
|   1   |   6    |       0.375        |  4   |
|   1   |   1    |        0.0         |  5   |
|   2   |   3    |        1.0         |  1   |
|   2   |   0    | 0.5833333333333334 |  2   |
|   2   |   6    |       0.375        |  3   |
|   2   |   1    |        0.0         |  4   |
|   3   |   3    |        1.0         |  1   |
|   3   |   5    |        0.5         |  2   |
+-------+--------+--------------------+------+
[23 rows x 4 columns]



BarID,BandID,score,rank
0,3,1.0,1
0,0,0.5833333333333334,2
0,4,0.4444444444444444,3
0,1,0.0,4
3,3,1.0,1
3,5,0.5,2
3,6,0.375,3
3,1,0.0,4
4,3,1.0,1
4,5,0.5,2


In [21]:
# models_w_counts = [popularity_model, cos, pear]
# names_w_counts = ['Popularity Model on Consert Counts', 'Cosine Similarity on Consert Counts', 'Pearson Similarity on Consert Counts']

# eval_counts = tc.recommender.util.compare_models(tc.SFrame(test_data), models_w_counts, model_names=names_w_counts)