In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
path = '../Data/movielens/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding = 'utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), encoding = 'utf-8', index_col = 'movieId')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding = 'utf-8')

### Genres를 이용한  movie representation

In [4]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


#### 이거 확인!!

In [5]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x : x.split('|'), movies_df['genres'])) for genre in sublist]))

In [6]:
print(f"전체 영화 수: {total_count}")
print(f"장르 : {total_genres}")

전체 영화 수: 9742
장르 : ['Documentary', 'Western', 'Drama', 'Crime', 'Musical', '(no genres listed)', 'Action', 'Animation', 'War', 'Film-Noir', 'IMAX', 'Comedy', 'Romance', 'Children', 'Adventure', 'Thriller', 'Sci-Fi', 'Fantasy', 'Mystery', 'Horror']


In [7]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else :
            genre_count[genre] = genre_count[genre] + 1

In [8]:
genre_count

{'Documentary': 440,
 'Western': 167,
 'Drama': 4361,
 'Crime': 1199,
 'Musical': 334,
 '(no genres listed)': 34,
 'Action': 1828,
 'Animation': 611,
 'War': 382,
 'Film-Noir': 87,
 'IMAX': 158,
 'Comedy': 3756,
 'Romance': 1596,
 'Children': 664,
 'Adventure': 1263,
 'Thriller': 1894,
 'Sci-Fi': 980,
 'Fantasy': 779,
 'Mystery': 573,
 'Horror': 978}

In [9]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])
    
genre_count

{'Documentary': 1.3451954487495636,
 'Western': 1.7659316540881678,
 'Drama': 0.3490620385623247,
 'Crime': 0.9098289421369025,
 'Musical': 1.4649016584241867,
 '(no genres listed)': 2.457169208193496,
 'Action': 0.7266719338379385,
 'Animation': 1.2026069149931968,
 'War': 1.4065847623240424,
 'Film-Noir': 2.0491288726171324,
 'IMAX': 1.7899910382813284,
 'Comedy': 0.4139225416416778,
 'Romance': 0.7856152382210405,
 'Children': 1.1664800458677336,
 'Adventure': 0.8872447746804204,
 'Thriller': 0.7112681505684965,
 'Sci-Fi': 0.9974220495432563,
 'Fantasy': 1.0971106675631865,
 'Mystery': 1.2304935032683613,
 'Horror': 0.9983092704481497}

### DataFrame.update()

In [10]:
genre_representation = pd.DataFrame(columns = sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp ={i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)
    
genre_representation

9742it [00:53, 180.90it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.20261,1.16648,0.413923,,,,1.09711,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.09711,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.20261,,0.413923,,,,1.09711,,,,,,,,,,
193583,,,,1.20261,,0.413923,,,,1.09711,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.20261,,,,,,,,,,,,,,,,


### Tags

In [12]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [15]:
tag_column = list(map(lambda x : x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x : x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['blood', 'wrongful imprisonment', 'meditative', 'societal criticism', 'Quakers', 'evil children', 'busniess', 'amazing', 'Atmospheric', 'characters', 'heroin', 'alan rickman', 'Creature Feature', 'theater', 'psychiatrist', 'haunting', 'freedom', 'large cast', 'Stones of Summer', 'Police', 'bad', 'art', 'black humour', 'Thrilling', 'David Fincher', 'narnia', 'poorly paced', 'drug abuse', 'kung fu', 'thought-provoking', 'doll', 'Howard Hughes', 'film history', 'remaster', 'teacher', 'Roger Avary', 'Up series', 'sexuality', 'Homeless', 'spiders', 'Francis Ford Coppola', 'Beethoven', 'Margot Robbie', 'trains', 'representation of children', 'gunfight', 'opera', 'sisterhood', 'Clousseau', 'Nun', 'E. M. Forster', 'spying', 'independent film', 'Africa', 'challenging', 'women', 'earnest', 'crime scene scrubbing', 'melancholic', 'asylum', 'Mila Kunis', 'Star Wars', 'Kevin Costner', 'NASA', 'needed more autobots', 'amazing artwork', 'beautiful cinematography', 'Stephen Crane', 'crime', 'diner', 

In [16]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [17]:
total_movie_count = len(set(tags_df['movieId']))
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1
            
tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])
    
tag_idf

{'blood': 3.196452541703389,
 'wrongful imprisonment': 3.196452541703389,
 'meditative': 2.895422546039408,
 'societal criticism': 3.196452541703389,
 'Quakers': 3.196452541703389,
 'evil children': 2.7193312869837265,
 'busniess': 3.196452541703389,
 'amazing': 3.196452541703389,
 'Atmospheric': 2.4974825373673704,
 'characters': 2.895422546039408,
 'heroin': 2.895422546039408,
 'alan rickman': 3.196452541703389,
 'Creature Feature': 3.196452541703389,
 'theater': 2.895422546039408,
 'psychiatrist': 2.895422546039408,
 'haunting': 3.196452541703389,
 'freedom': 3.196452541703389,
 'large cast': 3.196452541703389,
 'Stones of Summer': 3.196452541703389,
 'Police': 3.196452541703389,
 'bad': 2.2933625547114453,
 'art': 2.895422546039408,
 'black humour': 3.196452541703389,
 'Thrilling': 3.196452541703389,
 'David Fincher': 3.196452541703389,
 'narnia': 3.196452541703389,
 'poorly paced': 3.196452541703389,
 'drug abuse': 2.895422546039408,
 'kung fu': 3.196452541703389,
 'thought-provok

In [18]:
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))
    
    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index = [group['movieId'].values[0]])
    tag_representation.update(row_to_add)
    
tag_representation = tag_representation.sort_index(0)
tag_representation

100%|██████████████████████████████████████████████████████████████████████████████| 1572/1572 [04:39<00:00,  5.62it/s]


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [20]:
print(genre_representation.shape)
print(tag_representation.shape)

(9742, 20)
(1572, 1589)


### Final Movie Representation

In [22]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
print(movie_representation.describe())

(9742, 1609)
       (no genres listed)       Action    Adventure    Animation     Children  \
count         9742.000000  9742.000000  9742.000000  9742.000000  9742.000000   
mean             0.008576     0.136354     0.115027     0.075425     0.079506   
std              0.144915     0.283726     0.298052     0.291593     0.293989   
min              0.000000     0.000000     0.000000     0.000000     0.000000   
25%              0.000000     0.000000     0.000000     0.000000     0.000000   
50%              0.000000     0.000000     0.000000     0.000000     0.000000   
75%              0.000000     0.000000     0.000000     0.000000     0.000000   
max              2.457169     0.726672     0.887245     1.202607     1.166480   

            Comedy        Crime  Documentary        Drama      Fantasy  ...  \
count  9742.000000  9742.000000  9742.000000  9742.000000  9742.000000  ...   
mean      0.159587     0.111978     0.060756     0.156257     0.087728  ...   
std       0.201476  

### Contents 유사도 평가

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index = [a.index])
    
    return result_df

In [27]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [28]:
print(cs_df[1].sort_values(ascending=False))

2         1.000000
46972     0.322201
126142    0.300850
2043      0.300850
2399      0.300850
            ...   
39449     0.000000
39516     0.000000
39715     0.000000
39869     0.000000
7299      0.000000
Name: 1, Length: 9742, dtype: float64


### 추천시스템의 성능 평가

In [29]:
train_df, test_df = train_test_split(ratings_df, test_size = 0.2, random_state = 1234)

In [30]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [31]:
test_userids = list(set(test_df.userId.values))

In [35]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]
    user_rating_df = user_record_df[['rating']]
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)
    
    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten()/(sim_sum+1)
    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]
    
    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis = 0)

100%|████████████████████████████████████████████████████████████████████████████████| 610/610 [00:11<00:00, 51.04it/s]


In [36]:
result_df.head()

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,1,4.145652,1,4.0,964982703
1,50,3.650755,1,5.0,964982931
2,216,2.670124,1,5.0,964981208
3,223,2.612844,1,3.0,964980985
4,231,4.215284,1,5.0,964981179
