In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Load the movies, ratings and links tables (in that order) from the local data folder.
movie_data, rating_data, link_data = map(
    pd.read_csv,
    (
        './tranning-data/movies.csv',
        './tranning-data/ratings.csv',
        './tranning-data/links.csv',
    ),
)

In [10]:
# The timestamp column is not used by the recommender, so drop it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer fails on the already-missing column.
rating_data = rating_data.drop(columns='timestamp', errors='ignore')
rating_data.head(2)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0


In [11]:
# Attach title/genres to every rating row by joining on the shared movieId key.
user_movie_rating = rating_data.merge(movie_data, on='movieId')

In [12]:
# Preview the merged ratings + movie metadata frame.
user_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [13]:
# Rebuild the long-form ratings here instead of reading `user_movie_rating`:
# that name is rebound to the user x title pivot below, so pivoting it directly
# made this cell fail when it was run a second time.
ratings_long = pd.merge(rating_data, movie_data, on='movieId')

# title x user rating matrix (rows are movies) used for item-based similarity.
movie_user_rating = ratings_long.pivot_table(values='rating', index='title', columns='userId')
# user x title rating matrix (rows are users): same ratings, other orientation.
user_movie_rating = ratings_long.pivot_table(values='rating', index='userId', columns='title')

In [14]:
# Movie/user pairs without a rating are NaN after the pivot; replace them with 0
# so every row is a fully numeric vector.
movie_user_rating = movie_user_rating.fillna(0)
movie_user_rating.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,602,603,604,605,606,607,608,609,610,611
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Pairwise cosine similarity between the rows of movie_user_rating (one row per
# title), producing a square title-by-title matrix as a plain NumPy array.
item_based_collabor = cosine_similarity(movie_user_rating)
item_based_collabor

array([[1.        , 0.        , 0.        , ..., 0.32732684, 0.        ,
        0.        ],
       [0.        , 1.        , 0.70710678, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.70710678, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.32732684, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]], shape=(9719, 9719))

In [16]:
# Compare the rating matrix shape with the square similarity matrix shape.
print(movie_user_rating.shape, item_based_collabor.shape, sep='\n')

(9719, 611)
(9719, 9719)


In [44]:
# Label both axes of the similarity matrix with the movie titles so that scores
# can be looked up by title instead of by position.
item_based_collabor = pd.DataFrame(
    item_based_collabor,
    index=movie_user_rating.index,
    columns=movie_user_rating.index,
)

In [47]:
# Confirm that this exact title string exists as a row label of the similarity frame.
print('Toy Story 3 (2010)' in item_based_collabor.index)

True


In [50]:
def get_item_based_collabor(title):
    """Return every movie's similarity score to ``title``, highest first.

    Reads the ``title`` column of the module-level ``item_based_collabor``
    frame and sorts it in descending order. A title that is not a column of
    that frame raises ``KeyError``.
    """
    scores = item_based_collabor.loc[:, title]
    return scores.sort_values(ascending=False)

In [51]:
def recommend_with_series_priority(title, top_n=10):
    """Recommend movies similar to ``title``, listing same-series titles first.

    The series keyword is the part of ``title`` before its first ``(``
    (e.g. ``'Toy Story (1995)'`` -> ``'Toy Story'``). Every movie whose title
    contains that keyword is placed first, followed by all remaining movies;
    each group keeps the order returned by ``get_item_based_collabor``
    (descending similarity). The query movie itself is not removed.

    Args:
        title: Movie title to look up, as it appears in the similarity matrix.
        top_n: Maximum number of rows to return.

    Returns:
        A pandas Series of similarity scores indexed by title, at most
        ``top_n`` entries long.
    """
    sims = get_item_based_collabor(title)

    # 1) Series keyword: the title text before the first "(" (e.g. 'Toy Story').
    base_keyword = title.split('(')[0].strip()

    # 2) Flag series members. An empty keyword (title starting with "(") is a
    #    substring of every title, so in that case nothing is treated as a series.
    is_series = [bool(base_keyword) and base_keyword in movie for movie in sims.index]

    # 3) Series members first, then everything else, each in similarity order.
    series_pos = [pos for pos, hit in enumerate(is_series) if hit]
    other_pos = [pos for pos, hit in enumerate(is_series) if not hit]

    # 4) Select rows by position. The earlier concat(...).drop_duplicates()
    #    compared similarity *values*, which silently discarded any movie whose
    #    score happened to equal that of an earlier row.
    return sims.iloc[(series_pos + other_pos)[:top_n]]


In [61]:
# Case-insensitive search for titles containing 'Spider'; missing titles count as
# no match. (The variable keeps its existing name even though the keyword is 'Spider'.)
toy_movies = movie_data.loc[
    movie_data['title'].str.contains('Spider', case=False, na=False)
]
toy_movies

Unnamed: 0,movieId,title,genres
3027,4051,Horrors of Spider Island (Ein Toter Hing im Ne...,Horror|Sci-Fi
3151,4238,Along Came a Spider (2001),Action|Crime|Mystery|Thriller
3819,5349,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller
3821,5356,"Giant Spider Invasion, The (1975)",Horror|Sci-Fi
4250,6197,Spider (2002),Drama|Mystery
4567,6786,Kiss of the Spider Woman (1985),Drama
5260,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
6470,52722,Spider-Man 3 (2007),Action|Adventure|Sci-Fi|Thriller|IMAX
6685,58105,"Spiderwick Chronicles, The (2008)",Adventure|Children|Drama|Fantasy|IMAX
7927,95510,"Amazing Spider-Man, The (2012)",Action|Adventure|Sci-Fi|IMAX


In [62]:
# Series-first recommendations for Spider-Man, limited to ten rows.
similarities = recommend_with_series_priority('Spider-Man (2002)', top_n=10)
print(similarities)

title
Spider-Man (2002)                                      1.000000
Spider-Man 2 (2004)                                    0.732158
Spider-Man 3 (2007)                                    0.478260
Amazing Spider-Man, The (2012)                         0.300970
The Amazing Spider-Man 2 (2014)                        0.250006
Untitled Spider-Man Reboot (2017)                      0.223174
Star Wars: Episode II - Attack of the Clones (2002)    0.696357
Minority Report (2002)                                 0.661262
X2: X-Men United (2003)                                0.638789
X-Men (2000)                                           0.629953
Name: Spider-Man (2002), dtype: float64


In [63]:
# Series-first recommendations for Toy Story, limited to ten rows.
similarities = recommend_with_series_priority('Toy Story (1995)', top_n=10)
print(similarities)

title
Toy Story (1995)                                     1.000000
Toy Story 2 (1999)                                   0.570540
Toy Story 3 (2010)                                   0.415589
Jurassic Park (1993)                                 0.563601
Independence Day (a.k.a. ID4) (1996)                 0.562231
Star Wars: Episode IV - A New Hope (1977)            0.555382
Forrest Gump (1994)                                  0.545127
Lion King, The (1994)                                0.539198
Star Wars: Episode VI - Return of the Jedi (1983)    0.539142
Mission: Impossible (1996)                           0.536973
Name: Toy Story (1995), dtype: float64


In [38]:
# Same import as in the first cell; repeated so this check can be run on its own.
from sklearn.metrics.pairwise import cosine_similarity

# Sanity check: compute the similarity of one pair of movies directly from their
# rating rows, each reshaped to a single-row 2-D array as cosine_similarity expects.
vec1 = movie_user_rating.loc['Toy Story (1995)'].to_numpy().reshape(1, -1)
vec2 = movie_user_rating.loc['Toy Story 3 (2010)'].to_numpy().reshape(1, -1)
sim = cosine_similarity(vec1, vec2)[0, 0]
print(f"Toy Story 3와의 유사도: {sim}")

Toy Story 3와의 유사도: 0.4155885787654142


In [29]:
# get_item_based_collabor already returns the scores sorted in descending order,
# so the ten most similar titles are simply the first ten rows; no second sort needed.
similarities = get_item_based_collabor('Toy Story 3 (2010)')
print(similarities.head(10))

title
Toy Story 3 (2010)           1.000000
Sherlock Holmes (2009)       0.557957
Up (2009)                    0.547897
X-Men: First Class (2011)    0.541870
Inside Out (2015)            0.514186
Star Trek (2009)             0.509823
Avengers, The (2012)         0.507237
Kung Fu Panda (2008)         0.500488
Avatar (2009)                0.494725
Iron Man (2008)              0.494617
Name: Toy Story 3 (2010), dtype: float64
