# 콘텐츠 기반 필터링

In [1]:
#### - 데이터 읽기
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')
import ast

movies = pd.read_csv('./movies_large.csv')
print(movies.shape)
movies

(58098, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissi梨봱e (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


In [2]:
movies_df = movies[['movieId', 'title', 'genres']]
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissi梨봱e (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


In [3]:
movies_df.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [4]:
movies_df.genres

0        Adventure|Animation|Children|Comedy|Fantasy
1                         Adventure|Children|Fantasy
2                                     Comedy|Romance
3                               Comedy|Drama|Romance
4                                             Comedy
                            ...                     
58093                             (no genres listed)
58094                                         Comedy
58095                                          Drama
58096                  Adventure|Drama|Horror|Sci-Fi
58097                             Action|Crime|Drama
Name: genres, Length: 58098, dtype: object

In [5]:
movies_df['genres']= movies_df['genres'].str.replace(pat=r'[^\w]',repl=r' ',regex=True)

In [6]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),no genres listed
58094,193878,Les tribulations d'une caissi梨봱e (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure Drama Horror Sci Fi


In [7]:
print(movies_df['genres'].str.contains('no'))

0        False
1        False
2        False
3        False
4        False
         ...  
58093     True
58094    False
58095    False
58096    False
58097    False
Name: genres, Length: 58098, dtype: object


In [8]:
movies_df = movies_df[~movies_df['genres'].str.contains("no", na=False, case=False)]

In [9]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58092,193874,Blondie's Big Moment (1947),Comedy
58094,193878,Les tribulations d'une caissi梨봱e (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure Drama Horror Sci Fi


In [10]:
movies_df['genres'] = movies_df['genres'].astype(str)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
count_vector = CountVectorizer(ngram_range=(1, 3))

In [13]:
count_vector

CountVectorizer(ngram_range=(1, 3))

In [14]:
c_vector_genres = count_vector.fit_transform(movies_df['genres'])

In [15]:
c_vector_genres.shape

(53472, 685)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
gerne_c_sim = cosine_similarity(c_vector_genres, c_vector_genres)

In [17]:
gerne_c_sim

array([[1.        , 0.35355339, 0.16666667, ..., 0.        , 0.08333333,
        0.        ],
       [0.35355339, 1.        , 0.        , ..., 0.        , 0.11785113,
        0.        ],
       [0.16666667, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.28867513,
        0.40824829],
       [0.08333333, 0.11785113, 0.        , ..., 0.28867513, 1.        ,
        0.11785113],
       [0.        , 0.        , 0.        , ..., 0.40824829, 0.11785113,
        1.        ]])

In [18]:
gerne_sim_df = pd.DataFrame(data=gerne_c_sim, index=movies_df['title'], columns=movies_df['title'])
gerne_sim_df

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Cocaine Godmother (2017),No somos de piedra (1968),Tales from the Hood 2 (2018),Dos tipos de cuidado (1953),Room Laundering (2018),Blondie's Big Moment (1947),Les tribulations d'une caissi梨봱e (2011),Her Name Was Mumu (2016),Flora (2017),Leal (2018)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.353553,0.166667,0.117851,0.288675,0.0,0.166667,0.333333,0.000000,0.117851,...,0.000000,0.288675,0.000000,0.096225,0.000000,0.288675,0.288675,0.000000,0.083333,0.000000
Jumanji (1995),0.353553,1.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.707107,0.000000,0.166667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.117851,0.000000
Grumpier Old Men (1995),0.166667,0.000000,1.000000,0.471405,0.577350,0.0,1.000000,0.000000,0.000000,0.000000,...,0.000000,0.577350,0.000000,0.384900,0.000000,0.577350,0.577350,0.000000,0.000000,0.000000
Waiting to Exhale (1995),0.117851,0.000000,0.471405,1.000000,0.408248,0.0,0.471405,0.000000,0.000000,0.000000,...,0.235702,0.408248,0.000000,0.816497,0.408248,0.408248,0.408248,0.408248,0.117851,0.166667
Father of the Bride Part II (1995),0.288675,0.000000,0.577350,0.408248,1.000000,0.0,0.577350,0.000000,0.000000,0.000000,...,0.000000,1.000000,0.000000,0.333333,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Blondie's Big Moment (1947),0.288675,0.000000,0.577350,0.408248,1.000000,0.0,0.577350,0.000000,0.000000,0.000000,...,0.000000,1.000000,0.000000,0.333333,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000
Les tribulations d'une caissi梨봱e (2011),0.288675,0.000000,0.577350,0.408248,1.000000,0.0,0.577350,0.000000,0.000000,0.000000,...,0.000000,1.000000,0.000000,0.333333,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000
Her Name Was Mumu (2016),0.000000,0.000000,0.000000,0.408248,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.577350,0.000000,0.000000,0.333333,1.000000,0.000000,0.000000,1.000000,0.288675,0.408248
Flora (2017),0.083333,0.117851,0.000000,0.117851,0.000000,0.0,0.000000,0.166667,0.000000,0.117851,...,0.166667,0.000000,0.288675,0.096225,0.288675,0.000000,0.000000,0.288675,1.000000,0.117851


In [19]:
def find_sim_movie_genre(df, title_name):
    title_movie_sim = df[[title_name]].drop(title_name, axis=0)

    return title_movie_sim.sort_values(title_name, ascending=False)[:]

In [20]:
smlt1 = find_sim_movie_genre(gerne_sim_df, 'Dark Knight, The (2008)')
pd.set_option('display.max.colwidth', -1)

In [21]:
smlt1

title,"Dark Knight, The (2008)"
title,Unnamed: 1_level_1
Need for Speed (2014),1.000000
Leal (2018),0.816497
The Peace Killers (1971),0.816497
New Jack City (1991),0.816497
Silence the Witness (1974),0.816497
...,...
Watch Out for Perestroika (1990),0.000000
Special Features: Handsome (1983),0.000000
Grand Hotel Excelsior (1982),0.000000
Madly in Love (1981),0.000000


In [22]:
smlt1 = smlt1.sort_index()
smlt1.columns = ['score']
smlt1

Unnamed: 0_level_0,score
title,Unnamed: 1_level_1
"""Great Performances"" Cats (1998)",0.000000
#1 Cheerleader Camp (2010),0.192450
#Captured (2017),0.000000
#Horror (2015),0.111111
#SCREAMERS (2016),0.000000
...,...
吏뷨ufre mam泥쁭! (1987),0.000000
吏뷪hree Amigos! (1986),0.000000
吏뷬ivan las Antipodas! (2011),0.000000
罹꿧tarl罹뇃 Gazoz (2016),0.192450


# 협업 필터링

In [24]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')


ratings = pd.read_csv('./ratings.csv')
movies = pd.read_csv('./movies_large.csv')
movies.shape, ratings.shape

((58098, 3), (100836, 4))

In [25]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissi梨봱e (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi


In [26]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [27]:
rating_movies = pd.merge(ratings, movies, on='movieId')
rating_movies

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [28]:
 ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')  # csv 파일에 rating 부분만 볼거다 행은 user id, 
                                                                                        # 열은 영화 이름
ratings_matrix.fillna(0, inplace=True)                                                # 값이 없는 곳은 0으로 채우기 inplace는 원본변경                                        
ratings_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (1964),Zulu (2013),[REC] (2007),[REC]夷 3 G梨븂esis (2012),[REC]吏?(2009),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),吏뷪hree Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
anohana: The Flower We Saw That Day - The Movie (2013),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5


In [30]:
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df

(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (1964),Zulu (2013),[REC] (2007),[REC]夷 3 G梨븂esis (2012),[REC]吏?(2009),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),吏뷪hree Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141653,0.000000,...,0.000000,0.000000,0.342055,0.707107,0.543305,0.0,0.000000,0.139431,0.327327,0.000000
'Hellboy': The Seeds of Creation (2004),0.000000,1.000000,0.707107,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
'Round Midnight (1986),0.000000,0.707107,1.000000,0.000000,0.000000,0.0,0.176777,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
'Salem's Lot (2004),0.000000,0.000000,0.000000,1.000000,0.857493,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
'Til There Was You (1997),0.000000,0.000000,0.000000,0.857493,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
anohana: The Flower We Saw That Day - The Movie (2013),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000
eXistenZ (1999),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.211467,0.216295,0.097935,0.132489,...,0.078689,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.192259,0.000000,0.170341
xXx (2002),0.139431,0.000000,0.000000,0.000000,0.000000,0.0,0.089634,0.000000,0.276512,0.019862,...,0.202902,0.069716,0.305535,0.246482,0.173151,0.0,0.192259,1.000000,0.270034,0.100396
xXx: State of the Union (2005),0.327327,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.156764,0.000000,...,0.000000,0.000000,0.382543,0.231455,0.177838,0.0,0.000000,0.270034,1.000000,0.000000


In [31]:
def find_sim_movie_item(df, title_name):
    title_movie_sim = df[[title_name]].drop(title_name, axis=0)

    return title_movie_sim.sort_values(title_name, ascending=False)[:]

In [32]:
smlt2 = find_sim_movie_item(item_sim_df, 'Dark Knight, The (2008)')

In [33]:
smlt2

title,"Dark Knight, The (2008)"
title,Unnamed: 1_level_1
Inception (2010),0.727263
Iron Man (2008),0.670538
"Dark Knight Rises, The (2012)",0.666054
Batman Begins (2005),0.651282
"Lord of the Rings: The Return of the King, The (2003)",0.620637
...,...
Hideous Kinky (1998),0.000000
Switchback (1997),0.000000
Puppet Master (1989),0.000000
Puppet Master 4 (1993),0.000000


In [34]:
smlt2 = smlt2.sort_index()

In [35]:
smlt2.columns = ['score']
smlt2

Unnamed: 0_level_0,score
title,Unnamed: 1_level_1
'71 (2014),0.085745
'Hellboy': The Seeds of Creation (2004),0.076218
'Round Midnight (1986),0.053894
'Salem's Lot (2004),0.000000
'Til There Was You (1997),0.000000
...,...
anohana: The Flower We Saw That Day - The Movie (2013),0.095273
eXistenZ (1999),0.095307
xXx (2002),0.217525
xXx: State of the Union (2005),0.194388


# 하이브리드 추천 시스템

In [36]:
smlt1 = find_sim_movie_genre(gerne_sim_df, 'Toy Story (1995)')
smlt1 = smlt1.sort_index()
smlt1.columns = ['score']

In [37]:
smlt2 = find_sim_movie_item(item_sim_df, 'Toy Story (1995)')
smlt2 = smlt2.sort_index()
smlt2.columns = ['score']

In [38]:
smlt = pd.merge(smlt1, smlt2, left_index=True, right_index=True, how='right')

In [39]:
for i in range(len(smlt)):
    smlt['index'] = i

In [40]:
for i in range(len(smlt)):
    smlt['index'][i] = i

In [41]:
smlt = smlt.reset_index(drop=False)

In [42]:
smlt = smlt.set_index('index', inplace=False)

In [43]:
for i in range(len(smlt)):
    if smlt['score_x'][i] == 0 and smlt['score_y'][i] == 0:
        smlt = smlt.drop(index=[i], axis=0)

In [None]:
smlt

In [45]:
smlt.reset_index(inplace=True)
del smlt['index']

In [46]:
smlt['sum'] = 0

In [47]:
def similarity(df, top_n=10):
    for i in range(len(df)):
        df['sum'][i] = (df['score_x'][i] * 0.2) + (df['score_y'][i] * 0.8)
        
    return smlt.sort_values('sum', ascending=False)[:top_n]

In [48]:
result = similarity(smlt)

In [49]:
result.columns = ['title', 'CBF', 'CF', 'sum']

In [175]:
result

Unnamed: 0,title,CBF,CF,sum
3988,Inception (2010),0.509175,0.727263,0.683645
772,Batman Begins (2005),0.544331,0.651282,0.629891
1993,"Dark Knight Rises, The (2012)",0.333333,0.666054,0.59951
2786,Fight Club (1999),0.666667,0.572724,0.591512
2149,"Departed, The (2006)",0.408248,0.60092,0.562386
4097,Iron Man (2008),0.111111,0.670538,0.558652
4802,"Lord of the Rings: The Return of the King, The (2003)",0.222222,0.620637,0.540954
4371,Kill Bill: Vol. 1 (2003),0.408248,0.572126,0.539351
1200,"Bourne Ultimatum, The (2007)",0.408248,0.555397,0.525967
1496,Catch Me If You Can (2002),0.57735,0.494258,0.510876


In [50]:
result

Unnamed: 0,title,CBF,CF,sum
8382,Toy Story 2 (1999),1.0,0.572601,0.658081
279,Aladdin (1992),1.0,0.527859,0.622287
5457,"Monsters, Inc. (2001)",1.0,0.50465,0.60372
7237,Shrek (2001),0.894427,0.527977,0.601267
280,Aladdin (1992),0.75,0.527859,0.572287
2839,Finding Nemo (2003),0.866025,0.484297,0.560643
1341,"Bug's Life, A (1998)",0.866025,0.479241,0.556598
8962,Willy Wonka & the Chocolate Factory (1971),0.57735,0.512246,0.525267
4083,"Incredibles, The (2004)",0.75,0.46688,0.523504
4818,"Lion King, The (1994)",0.447214,0.541145,0.522359
