In [1]:
# Shell escape: show the first lines of the movies CSV to inspect its columns.
!head ml-25m/movies.csv

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action


In [5]:
import csv

movies = {}  # movieId (int) -> title string
genres = {}  # movieId (int) -> set of genre labels for that movie

# newline="" is what the csv module requires so that quoted fields are parsed
# correctly, and the explicit UTF-8 encoding keeps non-ASCII titles readable
# regardless of the platform's default locale encoding.
with open("ml-25m/movies.csv", "r", encoding="utf-8", newline="") as f:
    csvreader = csv.reader(f)
    # Consume (and show) the header row so the loop below only sees data rows.
    print(f"첫번째 줄 스킵 : {next(csvreader)}")

    # `movie_id` rather than `id`, so the builtin id() is not shadowed in the
    # notebook's global namespace after this cell runs.
    for movie_id, title, genre in csvreader:
        movies[int(movie_id)] = title
        # The genres column is a "|"-separated list of labels.
        genres[int(movie_id)] = set(genre.split("|"))

첫번째 줄 스킵 : ['movieId', 'title', 'genres']


In [27]:
def jaccard_similarity(a, b):
    """Return the Jaccard similarity of two sets: |a & b| / |a | b|.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float between 0.0 and 1.0. When both sets are empty the union is
        empty too, and 0.0 is returned instead of dividing by zero.
    """
    intersection_size = len(a & b)
    # |a | b| == |a| + |b| - |a & b|, so the union set never has to be built
    # (the previous version materialised it twice on every call).
    union_size = len(a) + len(b) - intersection_size
    if union_size == 0:
        return 0.0
    return intersection_size / union_size

`genres[1]` : {'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'}

`len(genres[1])` : 5

`len(genres[1] | genres[2])` : 두 집합의 합집합 (또는, OR) 에 속한 원소의 개수를 의미한다.

`len(genres[1] & genres[2])` : 두 집합의 교집합 (그리고, AND) 에 속한 원소의 개수를 의미한다.

In [28]:
def get_topk_jaccard_genres(target_movie, k=20):
    """Rank all movies by how similar their genre sets are to one movie's.

    Args:
        target_movie: Key into the module-level ``genres`` mapping identifying
            the query movie. A missing key raises ``KeyError``.
        k: Maximum number of entries to return.

    Returns:
        A list of ``(score, title)`` tuples for the ``k`` best matches, sorted
        in descending tuple order: highest Jaccard score first, and equal
        scores ordered by title in reverse. The query movie itself is scored
        like any other movie, so it appears in the ranking.
    """
    wanted = genres[target_movie]
    scored = [
        (jaccard_similarity(wanted, genres[candidate_id]), title)
        for candidate_id, title in movies.items()
    ]
    return sorted(scored, reverse=True)[:k]

In [29]:
gravity = 104841  # movieId used as the first query movie
print(movies[gravity])  # show the title stored for that id
# Last expression of the cell: the returned top-k list becomes the cell output.
get_topk_jaccard_genres(gravity)

Gravity (2013)


[(1.0, 'The Amazing Spider-Man 2 (2014)'),
 (1.0, 'Gravity (2013)'),
 (1.0, 'Edge of Tomorrow (2014)'),
 (0.75, 'V for Vendetta (2006)'),
 (0.75, 'Tron: Legacy (2010)'),
 (0.75, 'Transformers: Revenge of the Fallen (2009)'),
 (0.75, 'Transformers (2007)'),
 (0.75, 'The Hunger Games: Catching Fire (2013)'),
 (0.75, 'Tai Chi Zero (2012)'),
 (0.75, 'Superman Returns (2006)'),
 (0.75, 'Star Wars: Episode II - Attack of the Clones (2002)'),
 (0.75, 'Star Trek Into Darkness (2013)'),
 (0.75, 'Star Trek (2009)'),
 (0.75, 'Spider-Man 2 (2004)'),
 (0.75, 'Speed Racer (2008)'),
 (0.75, 'RoboCop (2014)'),
 (0.75, 'Riddick (2013)'),
 (0.75, 'Resident Evil: Retribution (2012)'),
 (0.75, 'Real Steel (2011)'),
 (0.75, 'Prometheus (2012)')]

In [30]:
iron_man = 59315  # movieId used as the second query movie (Iron Man)
print(movies[iron_man])  # show the title stored for that id
# Last expression of the cell: the returned top-k list becomes the cell output.
get_topk_jaccard_genres(iron_man)

Iron Man (2008)


[(1.0, 'X-Men: Days of Future Past (2014)'),
 (1.0, 'X-Men (2000)'),
 (1.0, 'Women of the Prehistoric Planet (1966)'),
 (1.0, 'Wheels of Fire (1985)'),
 (1.0, 'Waterworld (1995)'),
 (1.0, 'Valerian and the City of a Thousand Planets (2017)'),
 (1.0, 'Turbo Kid (2015)'),
 (1.0, 'Tron (1982)'),
 (1.0, 'Transmorphers Fall of Man (2009)'),
 (1.0, 'Transmorphers (2007)'),
 (1.0, 'Transformers: Age of Extinction (2014)'),
 (1.0, 'Timeline (2003)'),
 (1.0, 'Time Trap (2017)'),
 (1.0, 'Time Tracers (1995)'),
 (1.0, 'Time Machine, The (2002)'),
 (1.0, 'Time Machine, The (1960)'),
 (1.0, 'Thor: Ragnarok (2017)'),
 (1.0, 'The War in Space (1977)'),
 (1.0, 'The Veil (2017)'),
 (1.0, 'The Sisterhood (1988)')]

In [31]:
ratings = []  # list of (user_id, movie_id, rating) tuples; timestamp is dropped

# Explicit encoding so reading does not depend on the platform's locale.
with open("ml-25m/ratings.csv", "r", encoding="utf-8") as f:
    # Consume (and show) the header line so the loop only sees data lines.
    print(f"첫번째 줄 스킵 : {f.readline()}")

    for line in f:
        # Skip stray blank lines; without this guard the 4-way unpacking below
        # raises ValueError on a line that has no commas.
        if not line.strip():
            continue
        user_id, movie_id, rating, timestamp = line.split(",")
        ratings.append((int(user_id), int(movie_id), float(rating)))

첫번째 줄 스킵 : userId,movieId,rating,timestamp



In [47]:
from collections import defaultdict

# Maps movie_id -> set of user_ids that rated that movie. Despite the name,
# the keys are movie ids, not user ids.
users = defaultdict(set)

for user_id, movie_id, rating in ratings:
    users[movie_id].add(user_id)  # the rating value itself is not used here

users 에 movie id 값을 넣고, 영화를 본 사용자를 쭉 기록한다.

즉, `defaultdict(set, {영화 아이디 : {영화를 본 유저 아이디들}})` 형식이 된다.

`users[1]` 하게 되면, 1번 영화를 본 유저의 아이디를 모두 볼 수 있다.

In [52]:
from tqdm import tqdm

def get_topk_jaccard_ratings(target_movie, k=20):
    """Rank all movies by audience overlap with one movie.

    Each movie is represented by the set of user ids stored for it in the
    module-level ``users`` mapping; the score is the Jaccard similarity of
    that set with the query movie's set.

    Args:
        target_movie: Movie id of the query movie. An id with no entry in
            ``users`` is treated as having an empty audience.
        k: Maximum number of entries to return.

    Returns:
        A list of ``(score, title)`` tuples for the ``k`` best matches, sorted
        in descending tuple order (score first, then title).
    """
    # Use .get() instead of indexing: `users` is a defaultdict, so
    # users[movie_id] would insert a new empty set for every movie that has
    # no ratings, growing the shared mapping as a side effect of a lookup.
    no_users = frozenset()
    target_users = users.get(target_movie, no_users)

    result = []

    # tqdm only wraps the iterable to display a progress bar.
    for movie_id, movie_title in tqdm(movies.items()):
        search_users = users.get(movie_id, no_users)
        score = jaccard_similarity(target_users, search_users)

        result.append((score, movie_title))

    result.sort(reverse=True)
    return result[:k]

`tqdm` 을 통해 진행도를 수치화하여 확인해볼 수 있다.

A 영화를 본 유저 모음과 B 영화를 본 유저 모음의 유사도를 측정하여 A와 B의 유사도를 확인하는 방법이다.
즉, 별점 값은 사용하지 않고 "봤는지 여부"만 사용하므로 클릭형 (암묵적 피드백) 데이터를 다루는 방식과 같다.

In [53]:
print(gravity)  # echo the query movie id
# Last expression of the cell: the returned top-k list becomes the cell output.
get_topk_jaccard_ratings(gravity)

104841


100%|██████████| 62423/62423 [00:44<00:00, 1393.26it/s]


[(1.0, 'Gravity (2013)'),
 (0.35805736078141415, 'Interstellar (2014)'),
 (0.35123831007096196, 'The Martian (2015)'),
 (0.34912568306010927, 'Edge of Tomorrow (2014)'),
 (0.34473317642538065, 'Wolf of Wall Street, The (2013)'),
 (0.3414977020816437, 'Ex Machina (2015)'),
 (0.3379782569523695, 'Looper (2012)'),
 (0.3335578118524658, 'Mad Max: Fury Road (2015)'),
 (0.32022341168256924, 'Her (2013)'),
 (0.3199327234688436, 'Dark Knight Rises, The (2012)'),
 (0.31529790660225443, 'Guardians of the Galaxy (2014)'),
 (0.3145170941875773, 'Django Unchained (2012)'),
 (0.31443244942878107, 'Gone Girl (2014)'),
 (0.3125883303048657, 'The Hunger Games (2012)'),
 (0.3019681349578257, 'The Imitation Game (2014)'),
 (0.301206675224647, 'Grand Budapest Hotel, The (2014)'),
 (0.2998122261631546, 'Shutter Island (2010)'),
 (0.2976522597536375, 'District 9 (2009)'),
 (0.29670975323149235, 'Prometheus (2012)'),
 (0.29623615380154267, 'Arrival (2016)')]

In [55]:
print(iron_man)  # echo the query movie id
# Last expression of the cell: the returned top-k list becomes the cell output.
get_topk_jaccard_ratings(iron_man)

59315


100%|██████████| 62423/62423 [01:44<00:00, 599.62it/s]


[(1.0, 'Iron Man (2008)'),
 (0.4553055080248652, 'Dark Knight, The (2008)'),
 (0.43536170577643124, 'Avatar (2009)'),
 (0.43502839245244584, 'Batman Begins (2005)'),
 (0.42462278447651053, 'Avengers, The (2012)'),
 (0.4243327426024381, 'WALL·E (2008)'),
 (0.4046754120417087, 'Star Trek (2009)'),
 (0.3885758527248484, 'Inception (2010)'),
 (0.3834631515877771, 'Up (2009)'),
 (0.3798344017094017, 'V for Vendetta (2006)'),
 (0.3785325924598196, 'Bourne Ultimatum, The (2007)'),
 (0.3762459999264354, 'Iron Man 2 (2010)'),
 (0.37312455346511075, 'Casino Royale (2006)'),
 (0.3730232838338504, 'Dark Knight Rises, The (2012)'),
 (0.3658243080625752, 'Guardians of the Galaxy (2014)'),
 (0.3597768479776848, 'Inglourious Basterds (2009)'),
 (0.35823165579477473, 'Sherlock Holmes (2009)'),
 (0.3566166184836333, 'District 9 (2009)'),
 (0.3556231003039514, 'Prestige, The (2006)'),
 (0.35248645687503805, '300 (2007)')]

In [56]:
# Maps movie_id -> {user_id: rating} for every (user, movie, rating) record.
uratings = defaultdict(dict)

for user_id, movie_id, rating in ratings:
    # A later record for the same (movie, user) pair overwrites the earlier one.
    uratings[movie_id][user_id] = rating

uratings 에 movie id 값을 넣고, 영화를 본 사용자와 그 사용자가 준 별점을 쭉 기록한다.

즉, `defaultdict(dict, {영화 아이디 : {영화를 본 유저 아이디 : 별점}})` 형식이 된다.

`uratings[1]` 하게 되면, 1번 영화를 본 유저의 아이디와 별점들을 모두 볼 수 있다.

In [58]:
# Mean-center each movie's ratings in place: compute the movie's average rating
# and subtract it from every user's rating for that movie.
# NOTE(review): this overwrites the values stored in `uratings`, so after this
# cell runs the mapping no longer holds the raw ratings.
for movie_id, rating_set in uratings.items():
    avg = sum(rating_set.values()) / len(rating_set)

    for k in rating_set:
        # Only existing values are reassigned (no keys added or removed), so
        # updating the dict while iterating over its keys is safe.
        rating_set[k] -= avg

- Pearson Correlation
    - uratings 에서 영화 아이디별로 유저들의 별점들을 가져온다.
    - 영화별 평균 별점을 구하고, 각 유저의 별점에서 그 평균 값을 뺀다 (평균 중심화).

In [59]:
def cosine_similarity(a, b):
    """Cosine similarity of two sparse vectors given as ``{key: value}`` dicts.

    Args:
        a: First vector; keys missing from the dict count as zero.
        b: Second vector, same convention.

    Returns:
        The dot product over the keys present in both dicts, divided by the
        square root of the product of the two squared norms (each taken over
        all of that dict's values). Returns 0 when that divisor is zero.
    """
    # Only keys present in both vectors contribute to the dot product.
    shared_keys = a.keys() & b.keys()
    dot = sum(a[key] * b[key] for key in shared_keys)

    squared_norm_a = sum(value * value for value in a.values())
    squared_norm_b = sum(value * value for value in b.values())
    norm_product = (squared_norm_a * squared_norm_b) ** 0.5

    return dot / norm_product if norm_product != 0 else 0

1. 타겟 영화와 검색할 영화를 둘 다 본 유저 k에 대하여, 두 영화에 준 별점을 서로 곱하고 그 곱한 값들을 모두 더한다. (분자)
    1. 두 사전에 모두 존재하는 키, 즉 교집합에 해당하는 k는 둘 다 본 유저의 아이디를 의미한다.
    2. 유저의 별점을 각각 곱한 후 그 곱한 값들을 모두 더한다.
2. 타겟 영화에 존재하는 유저 별점들의 값을 모두 제곱한 후 더하고, 검색할 영화에 존재하는 유저 별점들의 값을 모두 제곱한 후 더한다. 그리고 두 합을 곱한 값에 루트를 씌운다. (분모)
3. 1번의 값을 2번의 값으로 나누면 코사인 유사도 값이 나온다. 분모가 0이면 0을 반환한다.

$$\cos(a, b) = \frac{\sum_{k \in a \cap b} a_k \, b_k}{\sqrt{\left(\sum_{k \in a} a_k^2\right)\left(\sum_{k \in b} b_k^2\right)}}$$

In [60]:
def get_topk_pearson_ratings(target_moive, k=20):
    """Rank all rated movies by cosine similarity of their rating vectors.

    Each movie is represented by its ``{user_id: value}`` dict from the
    module-level ``uratings`` mapping. The values are expected to have been
    mean-centered per movie beforehand; this function uses them as stored.

    Args:
        target_moive: Movie id of the query movie. The misspelled parameter
            name is kept so existing keyword callers keep working. An id with
            no entry in ``uratings`` is treated as an empty rating vector.
        k: Maximum number of entries to return.

    Returns:
        A list of ``(score, title)`` tuples for the ``k`` best matches, sorted
        in descending tuple order (score first, then title).

    Raises:
        KeyError: If a movie id in ``uratings`` has no title in ``movies``.
    """
    # Use .get() instead of indexing: `uratings` is a defaultdict, so indexing
    # an unknown movie id would insert an empty dict into the shared mapping
    # as a side effect of what should be a read-only lookup.
    target_rating_set = uratings.get(target_moive, {})

    result = []

    # tqdm only wraps the iterable to display a progress bar.
    for movie_id, search_rating_set in tqdm(uratings.items()):
        score = cosine_similarity(target_rating_set, search_rating_set)

        result.append((score, movies[movie_id]))

    result.sort(reverse=True)
    return result[:k]

1. 각 영화에 유저들의 별점 모음에서 타겟이 되는 {유저:별점} 사전을 들고 온다.
2. 모든 영화에 유저들의 별점 모음에서 아이디와 {유저:별점} 사전을 추출하고, 타겟 유저들의 별점 모음 사전과 현 영화의 유저들의 별점 모음에 대하여 코사인 유사도를 확인한다.
3. 유사도가 가장 높은 k개를 반환한다.

In [61]:
print(gravity)  # echo the query movie id
# Last expression of the cell: the returned top-k list becomes the cell output.
get_topk_pearson_ratings(gravity)

104841


100%|██████████| 59047/59047 [00:32<00:00, 1814.89it/s]


[(0.9999999999999993, 'Gravity (2013)'),
 (0.1675637835030477, 'The Martian (2015)'),
 (0.15893037944645264, 'Interstellar (2014)'),
 (0.14507434012672565, 'Looper (2012)'),
 (0.1445289584542844, 'Edge of Tomorrow (2014)'),
 (0.14270120617078794, 'Social Network, The (2010)'),
 (0.13265580029724086, 'Arrival (2016)'),
 (0.13180094930828926, 'Ex Machina (2015)'),
 (0.13174966953825917, 'Her (2013)'),
 (0.13084736683121753, 'Mad Max: Fury Road (2015)'),
 (0.1306607659810794, 'Life of Pi (2012)'),
 (0.13052055540487748, 'The Revenant (2015)'),
 (0.12836566088259643, 'Inception (2010)'),
 (0.12808470593620774, 'Avatar (2009)'),
 (0.12752398072279233, 'Prometheus (2012)'),
 (0.12602477214897687, 'Dawn of the Planet of the Apes (2014)'),
 (0.12466787482767948, 'Argo (2012)'),
 (0.12373555110752064, 'Rise of the Planet of the Apes (2011)'),
 (0.12125702995402551, 'Skyfall (2012)'),
 (0.11854312281090199, 'Gone Girl (2014)')]

In [63]:
print(iron_man)  # echo the query movie id
# Last expression of the cell: the returned top-k list becomes the cell output.
get_topk_pearson_ratings(iron_man)

59315


100%|██████████| 59047/59047 [01:05<00:00, 895.78it/s] 


[(0.9999999999999947, 'Iron Man (2008)'),
 (0.3889511828327884, 'Avengers, The (2012)'),
 (0.38880895517115177, 'Iron Man 2 (2010)'),
 (0.3070222090675307, 'Iron Man 3 (2013)'),
 (0.27285387615386025, 'Batman Begins (2005)'),
 (0.26745868438611897, 'Captain America: The First Avenger (2011)'),
 (0.2654314855025813, 'Captain America: The Winter Soldier (2014)'),
 (0.26336453389397585, 'Star Trek (2009)'),
 (0.26291720661571566, 'Avengers: Age of Ultron (2015)'),
 (0.2567399273658855, 'Thor (2011)'),
 (0.24812905033329438, 'X-Men: First Class (2011)'),
 (0.24615682460732846, 'Guardians of the Galaxy (2014)'),
 (0.24400053318419038, 'Sherlock Holmes (2009)'),
 (0.24172358449210124, 'Spider-Man (2002)'),
 (0.2371753939110729, 'Dark Knight, The (2008)'),
 (0.2304455131275874, 'X-Men: Days of Future Past (2014)'),
 (0.22765953417955592, 'Dark Knight Rises, The (2012)'),
 (0.22683461504069075, 'X-Men (2000)'),
 (0.22659851427581307, 'Transformers (2007)'),
 (0.2225713136125041,
  'Pirates of 