In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
from helpers.helpers_similarity import *

In [18]:
# Synthetic ratings: R = U^T V + noise, where every entry of U, V and the
# noise matrix is drawn independently from a Gaussian distribution.

# Seed a dedicated generator so that Restart & Run All reproduces the same
# U, V and R (the draws were previously unseeded and changed on every run).
SEED = 42
rng = np.random.default_rng(SEED)

# Mean and standard deviations of the spherical Gaussian distributions
mu = 0.0
sigma_u = 1.0  # std used for U
sigma_v = 1.0  # std used for V
sigma = 1.0    # std used for the additive noise

# Dimensionality and number of users and items
D = 2
N = 1000
M = 500
# Not used in this cell; presumably consumed further down -- TODO confirm.
alpha = [1, 5, 10, 50, 100]

# Generate U (D x N) and V (D x M) from the Gaussian distribution
U = rng.normal(mu, sigma_u, size=(D, N))
V = rng.normal(mu, sigma_v, size=(D, M))

# Generate noise and calculate the (N x M) ratings matrix R
noise = rng.normal(mu, sigma, size=(N, M))
R = U.T @ V + noise

# Print the shape of U
print(U.shape)


(2, 1000)


In [19]:
# Wrap the ratings matrix R in a DataFrame (default integer row/column labels)
# and preview its first five rows.
df = pd.DataFrame(data=R)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.754525,-0.608284,-0.978242,0.789683,2.604163,-2.517925,1.813498,0.720028,-1.664498,1.583161,...,-1.75659,0.934951,1.081316,1.08784,0.026592,0.201692,0.455388,-4.404193,0.612668,-2.97314
1,1.732976,0.866422,-1.35343,-0.267973,-0.627372,-0.431919,-1.226767,0.660556,-1.360145,-2.244928,...,2.286301,-0.47874,0.829299,0.59421,2.601453,-0.147793,-0.034965,-0.96978,-0.447499,0.135791
2,-1.737401,-1.491388,0.671938,-0.881608,0.520194,1.066066,1.983844,0.953293,-1.134261,-2.700296,...,-5.903321,0.272599,-0.362228,1.930308,-6.833919,-2.349123,-3.124359,-3.463335,0.387442,-0.381235
3,-0.109015,-0.495978,-1.359528,1.249195,3.173697,-0.757982,-0.350015,1.015415,-3.156791,-0.67801,...,-5.533012,0.016949,-2.390928,3.228163,-2.772264,-3.110711,0.702685,-3.517137,0.601546,-2.43253
4,-2.036571,-2.780601,0.5927,-0.654696,0.550949,-1.433675,2.483433,0.836676,-0.2431,-1.126684,...,-4.526402,0.914005,-1.558377,0.698868,-3.351611,-1.616076,0.774147,-3.370705,0.92198,-1.655704


#### Discretize the ratings in R

In [20]:
# Pass every entry of `df` through `condition`, element by element, and keep
# the result as `df2`; the preview below shows integer values.
# `condition` is not defined in this notebook -- it presumably arrives via the
# star import from helpers.helpers_similarity and bins a raw rating into a
# discrete score; TODO confirm its thresholds there.
# NOTE(review): newer pandas releases deprecate `applymap` in favour of
# `DataFrame.map` -- revisit when the environment is upgraded.
df2 = df.applymap(condition)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,4,2,2,4,5,1,4,4,2,4,...,2,4,4,4,3,3,3,1,4,1
1,4,4,2,3,2,3,2,4,2,1,...,5,3,4,4,5,3,3,2,3,3
2,2,2,4,2,4,4,4,4,2,1,...,1,3,3,4,1,1,1,1,3,3
3,3,3,2,4,5,2,3,4,1,2,...,1,3,1,5,1,1,4,1,4,1
4,1,1,4,2,4,2,5,4,3,2,...,1,4,2,4,1,2,4,1,4,2


In [21]:
# Compute the cosine similarity of the first user (first row of df2) to every
# user and add it as a new column.
#
# Only the rating columns take part in the computation: if this cell is run
# again, the 'similarity_u1' column added by the previous run must not be fed
# back in as if it were a rating (errors='ignore' makes the first run a no-op).
ratings_only = df2.drop(columns=['similarity_u1'], errors='ignore')

# Passing the first row as X and all rows as Y returns a (1, n_rows) array,
# rather than building the full n_rows x n_rows matrix only to keep row 0.
sim_array = cosine_similarity(ratings_only.iloc[[0]], ratings_only)[0]
df2['similarity_u1'] = sim_array.tolist()

df2.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,similarity_u1
0,4,2,2,4,5,1,4,4,2,4,...,4,4,4,3,3,3,1,4,1,1.0
1,4,4,2,3,2,3,2,4,2,1,...,3,4,4,5,3,3,2,3,3,0.905072
2,2,2,4,2,4,4,4,4,2,1,...,3,3,4,1,1,1,1,3,3,0.848527
3,3,3,2,4,5,2,3,4,1,2,...,3,1,5,1,1,4,1,4,1,0.927756
4,1,1,4,2,4,2,5,4,3,2,...,4,2,4,1,2,4,1,4,2,0.903182


In [23]:
# Reorder the rows so the highest 'similarity_u1' values come first, then
# preview the top of the reordered frame.
df2 = df2.sort_values('similarity_u1', ascending=False)
df2.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,similarity_u1
0,4,2,2,4,5,1,4,4,2,4,...,4,4,4,3,3,3,1,4,1,1.0
776,4,2,4,4,5,1,4,4,1,2,...,4,2,4,3,2,3,1,4,1,0.949317
603,4,1,4,2,4,2,3,4,2,3,...,4,3,5,2,3,4,1,5,1,0.948791
482,3,2,3,4,5,2,4,4,2,2,...,3,3,5,2,4,3,1,2,2,0.947195
609,3,1,3,3,4,1,5,3,2,1,...,2,4,5,2,2,4,1,4,1,0.947162


In [24]:
# Row labels of df2 in their current row order. When df2 has been sorted by
# 'similarity_u1' beforehand, this is the ranking of users by similarity.
# NOTE: despite the name, this is a pandas Index object, not a Python list.
user1_sim_list= df2.index
# Show the first 20 labels.
user1_sim_list[:20]

Int64Index([  0, 776, 603, 482, 609, 474, 579, 532, 834, 380, 805, 341, 191,
            719, 803, 141, 869, 831, 998, 207],
           dtype='int64')

In [25]:
# Kendall's tau between two sequences of the first 20 labels.
# NOTE(review): both arguments are the very same slice, so the result is 1.0
# by construction; presumably a sanity check or placeholder -- TODO compare
# against a second, independently obtained ranking.
# p_value is assigned but not used in this cell.
tau, p_value = stats.kendalltau(user1_sim_list[:20], user1_sim_list[:20])
tau

1.0