In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [2]:
# 각자 작업 환경에 맞는 경로를 지정해주세요. Google Colab과 Jupyter환경에서 경로가 다를 수 있습니다.

ratings_df = pd.read_csv(os.path.join('ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join('movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join('tags.csv'), encoding='utf-8')

In [6]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [3]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))

In [7]:
print(f"전체 영화 수: {total_count}")
print(f"장르: {total_genres}")

전체 영화 수: 9742
장르: ['Horror', 'Thriller', 'Children', 'Drama', 'Sci-Fi', 'Mystery', 'Action', 'Fantasy', 'War', 'Animation', 'IMAX', 'Musical', 'Film-Noir', '(no genres listed)', 'Adventure', 'Documentary', 'Western', 'Romance', 'Comedy', 'Crime']


In [8]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre]+1

In [9]:
genre_count

{'Horror': 978,
 'Thriller': 1894,
 'Children': 664,
 'Drama': 4361,
 'Sci-Fi': 980,
 'Mystery': 573,
 'Action': 1828,
 'Fantasy': 779,
 'War': 382,
 'Animation': 611,
 'IMAX': 158,
 'Musical': 334,
 'Film-Noir': 87,
 '(no genres listed)': 34,
 'Adventure': 1263,
 'Documentary': 440,
 'Western': 167,
 'Romance': 1596,
 'Comedy': 3756,
 'Crime': 1199}

In [10]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])
  
genre_count

{'Horror': 0.9983092704481497,
 'Thriller': 0.7112681505684965,
 'Children': 1.1664800458677336,
 'Drama': 0.3490620385623247,
 'Sci-Fi': 0.9974220495432563,
 'Mystery': 1.2304935032683613,
 'Action': 0.7266719338379385,
 'Fantasy': 1.0971106675631865,
 'War': 1.4065847623240424,
 'Animation': 1.2026069149931968,
 'IMAX': 1.7899910382813284,
 'Musical': 1.4649016584241867,
 'Film-Noir': 2.0491288726171324,
 '(no genres listed)': 2.457169208193496,
 'Adventure': 0.8872447746804204,
 'Documentary': 1.3451954487495636,
 'Western': 1.7659316540881678,
 'Romance': 0.7856152382210405,
 'Comedy': 0.4139225416416778,
 'Crime': 0.9098289421369025}

In [11]:
# create genre representations
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation

9742it [00:42, 227.91it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.20261,1.16648,0.413923,,,,1.09711,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.09711,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.20261,,0.413923,,,,1.09711,,,,,,,,,,
193583,,,,1.20261,,0.413923,,,,1.09711,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.20261,,,,,,,,,,,,,,,,
