In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity # 코사인 유사도 사용

In [None]:
# Load the sample ratings CSV from the remote URL; the file's first column
# becomes the row index (index_col=0).
dummy_rating = pd.read_csv("https://grepp-reco-test.s3.ap-northeast-2.amazonaws.com/dummy_rating.csv", index_col=0)

In [None]:
# Preview the first rows of the loaded frame (rows are users, columns are movies).
dummy_rating.head()

Unnamed: 0,scifi1,scifi2,scifi3,comedy1,comedy2,comedy3
user1,4.0,5.0,3.0,,2.0,1.0
user2,5.0,3.0,3.0,2.0,2.0,
user3,1.0,,,4.0,5.0,4.0
user4,,2.0,1.0,4.0,,3.0
user5,1.0,,2.0,3.0,3.0,4.0


In [None]:
# Transpose the frame so that rows become movies and columns become users.
dummy_rating = dummy_rating.T

In [None]:
# Preview the first rows of the frame again to check its current orientation.
dummy_rating.head()

Unnamed: 0,user1,user2,user3,user4,user5
scifi1,4.0,5.0,1.0,,1.0
scifi2,5.0,3.0,,2.0,
scifi3,3.0,3.0,,1.0,2.0
comedy1,,2.0,4.0,4.0,3.0
comedy2,2.0,2.0,5.0,,3.0


In [None]:
# Replace missing ratings with 0, modifying the frame in place.
# NOTE(review): from here on an unrated movie is indistinguishable from a
# rating of 0, so it is counted in every later mean/min/max computed on this
# frame -- confirm that this is intended.
dummy_rating.fillna(0, inplace=True)
dummy_rating

Unnamed: 0,user1,user2,user3,user4,user5
scifi1,4.0,5.0,1.0,0.0,1.0
scifi2,5.0,3.0,0.0,2.0,0.0
scifi3,3.0,3.0,0.0,1.0,2.0
comedy1,0.0,2.0,4.0,4.0,3.0
comedy2,2.0,2.0,5.0,0.0,3.0
comedy3,1.0,0.0,4.0,3.0,4.0


In [None]:
# Normalize ratings: mean-center the values, then scale them by their range.
# NOTE(review): the original note said that cosine similarity on the result
# equals Pearson similarity. That only holds when the centering is done along
# the same vectors that are later compared -- verify against how this function
# is applied and how the similarity is computed.
def standardize(row):
    """Return `row` mean-centered and divided by its value range.

    Computes ``(row - row.mean()) / (row.max() - row.min())``.

    Args:
        row: A one-dimensional numeric pandas Series (for example one column
            handed over by ``DataFrame.apply``).

    Returns:
        A Series with the same index as `row`. When every value in `row` is
        identical the range is 0; in that case the mean-centered values (all
        zeros) are returned instead of dividing, which would produce NaN.
    """
    centered = row - row.mean()
    spread = row.max() - row.min()
    # A constant input has zero range; dividing would give 0/0 = NaN and
    # poison any similarity computed from the result.
    if spread == 0:
        return centered
    return centered / spread

# Apply standardize to the ratings frame. DataFrame.apply is called without an
# axis argument here, and the result keeps the same row/column layout as
# dummy_rating (no transpose happens in this step).
dummy_rating_std = dummy_rating.apply(standardize)
dummy_rating_std.head()

Unnamed: 0,user1,user2,user3,user4,user5
scifi1,0.3,0.5,-0.266667,-0.416667,-0.291667
scifi2,0.5,0.1,-0.466667,0.083333,-0.541667
scifi3,0.1,0.1,-0.466667,-0.166667,-0.041667
comedy1,-0.5,-0.1,0.333333,0.583333,0.208333
comedy2,-0.1,-0.1,0.533333,-0.416667,0.208333


In [None]:
# Build the item-to-item similarity matrix: cosine similarity computed on the
# standardized frame, wrapped in a DataFrame that uses the item index of
# dummy_rating as both its row and column labels.
corrMatrix = pd.DataFrame(cosine_similarity(dummy_rating_std),index=dummy_rating.index,columns=dummy_rating.index)
corrMatrix

Unnamed: 0,scifi1,scifi2,scifi3,comedy1,comedy2,comedy3
scifi1,1.0,0.620156,0.676031,-0.8336,-0.185071,-0.975307
scifi2,0.620156,1.0,0.628122,-0.626058,-0.716583,-0.743085
scifi3,0.676031,0.628122,1.0,-0.715679,-0.55747,-0.682763
comedy1,-0.8336,-0.626058,-0.715679,1.0,0.06077,0.787228
comedy2,-0.185071,-0.716583,-0.55747,0.06077,1.0,0.337811
comedy3,-0.975307,-0.743085,-0.682763,0.787228,0.337811,1.0


In [None]:
def get_similar(movie_name):
    """Return the similarity of every item to `movie_name`, highest first.

    Looks up the `movie_name` column of the module-level `corrMatrix` and
    returns it as a Series sorted in descending order of similarity.
    """
    return corrMatrix[movie_name].sort_values(ascending=False)

In [None]:
movie_i_liked = "scifi1"

# Look up the items most similar to the single movie chosen above and wrap the
# resulting scores in a DataFrame for display.
similar_scores = pd.DataFrame(get_similar(movie_i_liked))
similar_scores
# The rows come back sorted from most to least similar (the movie itself scores 1).
# Items with a negative score should not be recommended; scifi3 looks like the best pick.

Unnamed: 0,scifi1
scifi1,1.0
scifi3,0.676031
scifi2,0.620156
comedy2,-0.185071
comedy1,-0.8336
comedy3,-0.975307
