In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random generator with a fixed value.
SEED = 2022
np.random.seed(SEED)

In [2]:
# Sample data: a small user-by-item table where 1.0 means a value is present
# and None means it is missing (pandas shows those cells as NaN).
items = ["itemA", "itemB", "itemC", "itemD"]
rows_by_user = {
    "userA": [1.0, None, 1.0, None],
    "userB": [None, 1.0, 1.0, None],
    "userC": [1.0, None, 1.0, 1.0],
}

# Keep the raw rows under `data`, one list per user, in insertion order.
data = list(rows_by_user.values())
df = pd.DataFrame(data, index=list(rows_by_user), columns=items)
df

Unnamed: 0,itemA,itemB,itemC,itemD
userA,1.0,,1.0,
userB,,1.0,1.0,
userC,1.0,,1.0,1.0


In [3]:
# Handle missing values: every NaN is replaced with 0 (cells are filled in,
# not dropped), so the table keeps its shape.
df = df.fillna(value=0)
df

Unnamed: 0,itemA,itemB,itemC,itemD
userA,1.0,0.0,1.0,0.0
userB,0.0,1.0,1.0,0.0
userC,1.0,0.0,1.0,1.0


In [4]:
# Euclidean similarity
from sklearn.metrics.pairwise import euclidean_distances

In [5]:
# Euclidean distance between userA and userB. The double brackets keep each
# selection as a one-row DataFrame rather than a Series.
euclidean_distances(df.loc[["userA"]], df.loc[["userB"]])

array([[1.41421356]])

In [6]:
# With only X given, the result is the distance between every pair of rows
# (users); the diagonal is each user's distance to itself.
euclidean_distances(X=df)

array([[0.        , 1.41421356, 1.        ],
       [1.41421356, 0.        , 1.73205081],
       [1.        , 1.73205081, 0.        ]])

In [7]:
# Turn distances into similarities by taking the reciprocal.
# A small constant is added first so the zero distances on the diagonal do not
# divide by zero; that is why self-similarity shows up as ~1e5 instead of inf.
# NOTE(review): 1 / (1 + distance) is a common bounded alternative - consider it
# if these scores are ever compared against the diagonal.
eps = 1e-5
distance = euclidean_distances(X=df)
shifted = distance + eps
similarity = 1 / shifted
similarity

array([[1.00000000e+05, 7.07101781e-01, 9.99990000e-01],
       [7.07101781e-01, 1.00000000e+05, 5.77346936e-01],
       [9.99990000e-01, 5.77346936e-01, 1.00000000e+05]])

In [8]:
# Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Cosine similarity between userA and userB, each passed as a one-row DataFrame.
cosine_similarity(df.loc[["userA"]], df.loc[["userB"]])

array([[0.5]])

In [10]:
# With only X given, the result is the cosine similarity between every pair of
# rows (users).
cosine_similarity(X=df)

array([[1.        , 0.5       , 0.81649658],
       [0.5       , 1.        , 0.40824829],
       [0.81649658, 0.40824829, 1.        ]])

In [11]:
# Pearson similarity
# Sample data that reflects each user's preferences (explicit ratings)
items = ["itemA", "itemB", "itemC", "itemD"]
ratings_by_user = {
    "userA": [4.0, 5.0, 4.0, 3.0],
    "userB": [3.0, 4.0, 3.0, 2.0],
    "userC": [4.0, 4.0, 5.0, 3.0],
}

# Keep the raw rows under `data`, one list per user, in insertion order.
data = list(ratings_by_user.values())
df = pd.DataFrame(data, index=list(ratings_by_user), columns=items)

In [12]:
# Display the ratings table.
df

Unnamed: 0,itemA,itemB,itemC,itemD
userA,4.0,5.0,4.0,3.0
userB,3.0,4.0,3.0,2.0
userC,4.0,4.0,5.0,3.0


In [13]:
# Each row is treated as one variable (rowvar=True is NumPy's default), so the
# result is the user-by-user Pearson correlation matrix.
np.corrcoef(df, rowvar=True)

array([[1. , 1. , 0.5],
       [1. , 1. , 0.5],
       [0.5, 0.5, 1. ]])

In [14]:
# Cosine similarity on mean-centered ratings.
# Step 1: each user's mean rating across the item columns.
df.mean(axis="columns")

userA    4.0
userB    3.0
userC    4.0
dtype: float64

In [15]:
# Step 2: subtract each user's own mean from every rating in that user's row.
user_mean = df.mean(axis="columns")
df_sub = df.sub(user_mean, axis="index")

In [16]:
# Display the mean-centered ratings.
df_sub

Unnamed: 0,itemA,itemB,itemC,itemD
userA,0.0,1.0,0.0,-1.0
userB,0.0,1.0,0.0,-1.0
userC,0.0,0.0,1.0,-1.0


In [17]:
# Cosine similarity of the mean-centered rows; the displayed matrix matches the
# np.corrcoef result shown earlier for the same ratings.
cosine_similarity(X=df_sub)

array([[1. , 1. , 0.5],
       [1. , 1. , 0.5],
       [0.5, 0.5, 1. ]])

In [18]:
# Jaccard similarity
# Sample data in which each user has rated a different set of items
# (0 stands for "no rating" in this table)
items = ["itemA", "itemB", "itemC", "itemD", "itemE"]
ratings_by_user = {
    "userA": [4.0, 0.0, 4.0, 3.0, 0.0],
    "userB": [3.0, 4.0, 0.0, 2.0, 0.0],
    "userC": [0.0, 0.0, 4.0, 5.0, 3.0],
}

# Keep the raw rows under `data`, one list per user, in insertion order.
data = list(ratings_by_user.values())
df = pd.DataFrame(data, index=list(ratings_by_user), columns=items)
df

Unnamed: 0,itemA,itemB,itemC,itemD,itemE
userA,4.0,0.0,4.0,3.0,0.0
userB,3.0,4.0,0.0,2.0,0.0
userC,0.0,0.0,4.0,5.0,3.0


In [19]:
from sklearn.metrics import jaccard_score

In [20]:
# Binarize the ratings: every positive value becomes 1, all other cells are
# left as they are. `mask` returns a new frame, which is rebound to `df`.
df = df.mask(df > 0, 1.0)
df

Unnamed: 0,itemA,itemB,itemC,itemD,itemE
userA,1.0,0.0,1.0,1.0,0.0
userB,1.0,1.0,0.0,1.0,0.0
userC,0.0,0.0,1.0,1.0,1.0


In [21]:
# Jaccard score between the binarized item vectors of userB and userC.
jaccard_score(y_true=df.loc["userB"], y_pred=df.loc["userC"])

0.2