In [1]:
%%capture
!pip install nvtabular==1.3.3

In [2]:
%%capture
!pip install merlin-models polars merlin-core==v0.4.0 dask_cuda

In [3]:
import nvtabular as nvt
from nvtabular import Workflow, ColumnSelector, Dataset
from nvtabular.ops import FillMissing, Categorify, AddTags
from merlin.io.dataset import Dataset as MerlinDataset
from merlin.models.xgb import XGBoost
import polars as pl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error
from dask.distributed import Client
from merlin.schema.tags import Tags

In [4]:
from dask.distributed import Client

# Dask 클라이언트 초기화
#client = Client()
client = Client(n_workers=4, 
                threads_per_worker=2, 
                memory_limit='8GB')

In [5]:
# 애니메이션 정보 데이터
anime = pd.read_csv("/kaggle/input/anime-recommendation-database-2020/anime.csv")
print(len(anime))
anime.head(1)

17562


Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0


In [6]:
# 컬럼 이름 변경
anime.rename(columns={'MAL_ID': 'anime_id'}, inplace=True)

In [7]:
item_data = anime

In [8]:
item_data.columns

Index(['anime_id', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [9]:
item_data = item_data[['anime_id', 'Score', 
                       'Genres', 'Popularity', 
                       'Members', 'Completed', 
                       'On-Hold', 'Dropped']]

item_data.head(2)

Unnamed: 0,anime_id,Score,Genres,Popularity,Members,Completed,On-Hold,Dropped
0,1,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",39,1251960,718161,71513,26678
1,5,8.39,"Action, Drama, Mystery, Sci-Fi, Space",518,273145,208333,1935,770


In [10]:
item_data = item_data.dropna()

In [11]:
# 장르를 리스트로 분리
item_data['Genres_list'] = item_data['Genres'].apply(lambda x: x.split('|'))

In [12]:
# 평점 정보 데이터
interaction_data = pd.read_csv("/kaggle/input/anime-recommendation-database-2020/rating_complete.csv")
print(len(interaction_data))
interaction_data.head(1)

57633278


Unnamed: 0,user_id,anime_id,rating
0,0,430,9


In [13]:
print('데이터셋의 사용자 수', len(interaction_data['user_id'].unique()))

데이터셋의 사용자 수 310059


In [14]:
# user_id가 n의 배수인 행 추출
# 자원 문제로 10의 배수 아이디인 사용자만 한정
interaction_data = interaction_data[interaction_data['user_id'] % 10 == 0]
print(len(interaction_data))

5815169


In [15]:
interaction_data.tail(3)

Unnamed: 0,user_id,anime_id,rating
57633097,353400,15051,7
57633098,353400,32281,10
57633099,353400,32686,8


In [16]:
print('수정된 데이터셋의 사용자 수', len(interaction_data['user_id'].unique()))

수정된 데이터셋의 사용자 수 30936


In [17]:
# 사용자 정보 데이터 정의
# user_id별 item_id 개수 계산 -> 평점 부여 횟수를 활성화 지표로 사용
user_activation_level = interaction_data.groupby('user_id')['anime_id'].nunique().reset_index()

In [18]:
# 컬럼명 변경해줌
user_activation_level.columns = ['user_id', 'activation']
user_data = user_activation_level
print(len(user_data))
user_data.head(1)

30936


Unnamed: 0,user_id,activation
0,0,35


In [19]:
data = pd.merge(interaction_data, user_data, on="user_id", 
                how="left")
data.head(1)

Unnamed: 0,user_id,anime_id,rating,activation
0,0,430,9,35


In [20]:
data = pd.merge(data, item_data, 
                on="anime_id", how="left")
data.head(1)

Unnamed: 0,user_id,anime_id,rating,activation,Score,Genres,Popularity,Members,Completed,On-Hold,Dropped,Genres_list
0,0,430,9,35,7.57,"Military, Comedy, Historical, Drama, Fantasy, ...",506,279946,245283,1253,776,"[Military, Comedy, Historical, Drama, Fantasy,..."


In [21]:
data = data.sort_values(by="user_id")
data.head(2)

Unnamed: 0,user_id,anime_id,rating,activation,Score,Genres,Popularity,Members,Completed,On-Hold,Dropped,Genres_list
0,0,430,9,35,7.57,"Military, Comedy, Historical, Drama, Fantasy, ...",506,279946,245283,1253,776,"[Military, Comedy, Historical, Drama, Fantasy,..."
20,0,415,10,35,7.19,"Slice of Life, Comedy",2796,33789,23117,398,374,"[Slice of Life, Comedy]"


In [22]:
# 사용자별 장르 리스트 추출
user_genres = data.groupby('user_id')['Genres_list'].apply(lambda x: [genre for sublist in x for genre in sublist])

In [23]:
# 지니계수 이용
# 장르가 균등할수록 0에 가까움
# 특정 장르를 많이 보는 사람일수록 1에 가까움

# def calculate_gini_coefficient(genres):
#     genre_counts = pd.Series(genres).value_counts(normalize=True).values
#     genre_counts.sort()
#     n = len(genre_counts)
#     cumulative = np.cumsum(genre_counts)
#     gini_index = (n + 1 - 2 * np.sum(cumulative) / cumulative[-1]) / n
#     return gini_index


# # 계산
# user_diversity_scores = user_genres.apply(calculate_gini_coefficient)

In [24]:
# 장르 다양성 지표 계산 : 샤넌 엔트로피
# 다양한 장르가 균등하게 분포되어 있는지를 평가
# 0에 가까우면 사용자가 소비하는 장르가 매우 편중
def calculate_diversity_score(genres):
    genre_counts = pd.Series(genres).value_counts(normalize=True)
    entropy = -np.sum(genre_counts * np.log2(genre_counts))
    max_entropy = np.log2(len(genre_counts))
    return entropy / max_entropy

# 계산
user_diversity_scores = user_genres.apply(calculate_diversity_score)

In [25]:
# 확인
user_diversity_scores

user_id
0         1.000000
10        1.000000
20        0.973112
30        0.980617
40        0.983091
            ...   
353350    0.978507
353360    0.985158
353370    0.990716
353380    0.997282
353400    0.987741
Name: Genres_list, Length: 30936, dtype: float64

In [26]:
user_diversity_df = pd.DataFrame({
    'user_id': user_diversity_scores.index,
    'diversity_score': user_diversity_scores.values
})

# user_id를 인덱스로 설정
user_diversity_df.set_index('user_id', inplace=True)

In [27]:
data = data.merge(user_diversity_df, left_on='user_id', right_index=True, how='left')
data.head(3)

Unnamed: 0,user_id,anime_id,rating,activation,Score,Genres,Popularity,Members,Completed,On-Hold,Dropped,Genres_list,diversity_score
0,0,430,9,35,7.57,"Military, Comedy, Historical, Drama, Fantasy, ...",506,279946,245283,1253,776,"[Military, Comedy, Historical, Drama, Fantasy,...",1.0
20,0,415,10,35,7.19,"Slice of Life, Comedy",2796,33789,23117,398,374,"[Slice of Life, Comedy]",1.0
21,0,2236,10,35,8.2,"Adventure, Drama, Romance, Sci-Fi",166,619843,468670,3407,1533,"[Adventure, Drama, Romance, Sci-Fi]",1.0


In [28]:
data.to_csv('anime.csv', index = False)

In [29]:
unique_diversity_scores = sorted(data['diversity_score'].unique())

print(len(unique_diversity_scores))  # 유일한 값의 개수

20728


In [30]:
# XGBoost 모델을 사용할 예정이므로 수치형 데이터만 사용해서 실험을 진행
columns_to_drop = ['Genres', 'Genres_list']
data = data.drop(columns=columns_to_drop)

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5815169 entries, 0 to 5815168
Data columns (total 11 columns):
 #   Column           Dtype  
---  ------           -----  
 0   user_id          int64  
 1   anime_id         int64  
 2   rating           int64  
 3   activation       int64  
 4   Score            object 
 5   Popularity       int64  
 6   Members          int64  
 7   Completed        int64  
 8   On-Hold          int64  
 9   Dropped          int64  
 10  diversity_score  float64
dtypes: float64(1), int64(9), object(1)
memory usage: 532.4+ MB


In [32]:
# data에 object인 컬럼 존재
# 'Unknown' 값이 있는지 확인하고 각 컬럼에 몇 개 있는지 출력
unknown_counts = {}
for column in data.columns:
    if data[column].dtype == 'object':
        unknown_counts[column] = (data[column] == 'Unknown').sum()

print("'Unknown' 값의 개수:")
for column, count in unknown_counts.items():
    print(f"{column}: {count}")

'Unknown' 값의 개수:
Score: 10781


In [33]:
# 'Unknown' 값이 포함된 행 삭제
data = data.replace('Unknown', pd.NA).dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5803736 entries, 0 to 5815168
Data columns (total 11 columns):
 #   Column           Dtype  
---  ------           -----  
 0   user_id          int64  
 1   anime_id         int64  
 2   rating           int64  
 3   activation       int64  
 4   Score            object 
 5   Popularity       int64  
 6   Members          int64  
 7   Completed        int64  
 8   On-Hold          int64  
 9   Dropped          int64  
 10  diversity_score  float64
dtypes: float64(1), int64(9), object(1)
memory usage: 531.3+ MB


In [34]:
print('수정된 데이터셋의 길이 :', len(data))

수정된 데이터셋의 길이 : 5803736


In [35]:
# Score(평균평) 컬럼을 float으로 변환
data['Score'] = pd.to_numeric(data['Score'], errors='coerce')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5803736 entries, 0 to 5815168
Data columns (total 11 columns):
 #   Column           Dtype  
---  ------           -----  
 0   user_id          int64  
 1   anime_id         int64  
 2   rating           int64  
 3   activation       int64  
 4   Score            float64
 5   Popularity       int64  
 6   Members          int64  
 7   Completed        int64  
 8   On-Hold          int64  
 9   Dropped          int64  
 10  diversity_score  float64
dtypes: float64(2), int64(9)
memory usage: 531.3 MB


In [36]:
# 확인 # xgboost를 사용할 것이므로 스케일링 굳이 안 
data.head(2)

Unnamed: 0,user_id,anime_id,rating,activation,Score,Popularity,Members,Completed,On-Hold,Dropped,diversity_score
0,0,430,9,35,7.57,506,279946,245283,1253,776,1.0
20,0,415,10,35,7.19,2796,33789,23117,398,374,1.0


In [37]:
# 데이터셋을 NVTabular Dataset으로 변환
dataset = nvt.Dataset(data)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [38]:
# 디렉토리 확인

from nvtabular import ops, ColumnSelector
from merlin.schema.tags import Tags

print(dir(Tags))

['BINARY_CLASSIFICATION', 'CATEGORICAL', 'CONTEXT', 'CONTINUOUS', 'ITEM', 'ITEM_ID', 'LIST', 'MULTI_CLASS_CLASSIFICATION', 'REGRESSION', 'SEQUENCE', 'SESSION', 'SESSION_ID', 'TARGET', 'TEXT', 'TEXT_TOKENIZED', 'TIME', 'USER', 'USER_ID', '__class__', '__doc__', '__members__', '__module__']


In [39]:
# # 사용자 피처 정의
# user_features = [
#     'user_id' >> ops.FillMissing() >> ops.Categorify() >> ops.AddTags([Tags.USER_ID])
# ]

# # 아이템 피처 정의
# item_features = [
#     'anime_id' >> ops.Categorify() >> ops.AddTags([Tags.ITEM_ID]),
#     'Score' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
#     'Members' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
#     'Popularity' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
#     'Completed' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
#     'On-Hold' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
#     'Dropped' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS])
# ]

# # 상호작용 피처 정의
# interaction_features = ['user_id', 'anime_id', 'rating'] >> ops.FillMissing() >> ops.Categorify() >> ops.AddTags([Tags.CONTINUOUS, Tags.TARGET])

In [40]:
# 사용자 피처 정의
user_features = [
    'user_id' >> ops.FillMissing() >> ops.Categorify() >> ops.AddTags([Tags.USER_ID]),
    'activation' >> ops.FillMissing() >> ops.Categorify() >> ops.AddTags([Tags.CONTINUOUS]),
    'diversity_score' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS])
]

# 아이템 피처 정의
item_features = [
    'anime_id' >> ops.Categorify() >> ops.AddTags([Tags.ITEM_ID]),
    'Score' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
    'Members' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
    'Popularity' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
    'Completed' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
    'On-Hold' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS]),
    'Dropped' >> ops.FillMissing() >> ops.AddTags([Tags.CONTINUOUS])
]

# 상호작용 피처 정의
interaction_features = ['user_id', 'anime_id', 'rating'] >> ops.FillMissing() >> ops.Categorify() >> ops.AddTags([Tags.CONTINUOUS, Tags.TARGET])

In [41]:
# 전체 파이프라인
workflow = nvt.Workflow(user_features + item_features + interaction_features)

In [42]:
# 데이터셋 분할
train_data, valid_data = train_test_split(data, 
                                          test_size=0.3, 
                                          random_state=42)
train_dataset = nvt.Dataset(train_data)
valid_dataset = nvt.Dataset(valid_data)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [43]:
# 파이프라인 학습 및 변환
workflow.fit_transform(train_dataset).to_parquet(output_path='processed_data/train/')
workflow.transform(valid_dataset).to_parquet(output_path='processed_data/valid/')

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [44]:
# 디렉토리 확인

import os

print(os.listdir('processed_data/train/'))
print(os.listdir('processed_data/valid/'))

['part_2.parquet', 'part_0.parquet', '_metadata', '_metadata.json', '_file_list.txt', 'schema.pbtxt', 'part_3.parquet', 'part_1.parquet']
['part_2.parquet', 'part_0.parquet', '_metadata', '_metadata.json', '_file_list.txt', 'schema.pbtxt', 'part_3.parquet', 'part_1.parquet']


In [45]:
from merlin.io.dataset import Dataset as MerlinDataset

# Parquet 파일로 MerlinDataset 로드하기
train = MerlinDataset('processed_data/train/part_0.parquet')
valid = MerlinDataset('processed_data/valid/part_0.parquet')

In [46]:
# 데이터셋 스키마 확인
schema = train.schema
print(schema)

[{'name': 'user_id', 'tags': {<Tags.USER_ID: 'user_id'>, <Tags.CATEGORICAL: 'categorical'>, <Tags.TARGET: 'target'>}, 'properties': {'cat_path': './/categories/unique.user_id.parquet', 'num_buckets': None, 'freq_threshold': 0.0, 'embedding_sizes': {'dimension': 512.0, 'cardinality': 30259.0}, 'start_index': 0.0, 'max_size': 0.0, 'domain': {'min': 0, 'max': 30259}}, 'dtype': dtype('int64'), 'is_list': False, 'is_ragged': False}, {'name': 'anime_id', 'tags': {<Tags.CATEGORICAL: 'categorical'>, <Tags.TARGET: 'target'>, <Tags.ITEM_ID: 'item_id'>}, 'properties': {'cat_path': './/categories/unique.anime_id.parquet', 'num_buckets': None, 'embedding_sizes': {'cardinality': 12248.0, 'dimension': 311.0}, 'max_size': 0.0, 'start_index': 0.0, 'freq_threshold': 0.0, 'domain': {'min': 0, 'max': 12248}}, 'dtype': dtype('int64'), 'is_list': False, 'is_ragged': False}, {'name': 'rating', 'tags': {<Tags.TARGET: 'target'>, <Tags.CONTINUOUS: 'continuous'>}, 'properties': {'cat_path': './/categories/unique

In [47]:
# 모델 설정 : 타겟 변수를 평점(rating)으로
model = XGBoost(train.schema, target_columns=['rating'])

In [48]:
model.fit(train)

[08:32:44] task [xgboost.dask-1]:tcp://127.0.0.1:36093 got new rank 0


[0]	train-rmse:1.85655
[1]	train-rmse:1.83423
[2]	train-rmse:1.82277
[3]	train-rmse:1.81653
[4]	train-rmse:1.81289
[5]	train-rmse:1.80977
[6]	train-rmse:1.80732
[7]	train-rmse:1.80557
[8]	train-rmse:1.80412
[9]	train-rmse:1.80206


<xgboost.core.Booster at 0x78f29c75f430>

In [49]:
predictions = model.predict(valid)

In [50]:
# MerlinDataset을 Pandas DataFrame으로 변환
valid_df = valid.to_ddf().compute()

# 'rating' 컬럼 추출
y_true = valid_df['rating'].values

In [51]:
# valid_df

In [52]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = predictions

# RMSE 계산
rmse = mean_squared_error(y_true, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')

# MAE 계산
mae = mean_absolute_error(y_true, y_pred)
print(f'Validation MAE: {mae}')

Validation RMSE: 1.8042669792982708
Validation MAE: 1.4739529741722923


In [53]:
data.columns

Index(['user_id', 'anime_id', 'rating', 'activation', 'Score', 'Popularity',
       'Members', 'Completed', 'On-Hold', 'Dropped', 'diversity_score'],
      dtype='object')

In [54]:
# 특정 사용자 데이터 # 평점 부여 활동을 비교적 많이 한 사용자
len(interaction_data[interaction_data['user_id']==10])

4

In [55]:
import dask.dataframe as dd

# 특정 사용자의 ID
user_id_to_fetch = 10

# 해당 데이터 가져오기
user_features = data[['user_id', 'activation', 'diversity_score']]
item_features = data[['anime_id', 'rating', 'Score', 
                      'Members', 'Popularity', 'Completed',
                      'On-Hold', 'Dropped']]

# Dask 데이터프레임으로 변환
user_features_dd = dd.from_pandas(user_features, npartitions=10)
item_features_dd = dd.from_pandas(item_features, npartitions=10)

# 특정 사용자의 데이터 필터링
specific_user_dd = user_features_dd[user_features_dd['user_id'] == user_id_to_fetch]

# 중복 제거
item_features_dd_unique = item_features_dd.drop_duplicates()

# 사용자와 아이템 조합 생성
user_item_pairs_dd = specific_user_dd.assign(key=1).merge(
    item_features_dd_unique.assign(key=1), on='key').drop('key', axis=1)

# DataFrame을 Pandas로 변환
user_item_pairs_df = user_item_pairs_dd.compute()

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [56]:
# DataFrame을 MerlinDataset으로 변환
user_item_pairs_dataset = MerlinDataset(user_item_pairs_df)
predictions = model.predict(user_item_pairs_dataset)

# 예측 평점을 DataFrame에 추가
user_item_pairs_df['predicted_rating'] = predictions

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [57]:
# 중복된 anime_id 제거
user_specific_df = user_item_pairs_df.drop_duplicates(subset=['anime_id'])

In [58]:
# 예측 평점을 기준으로 내림차순 정렬
top_recommendations = user_specific_df.sort_values(by='predicted_rating', ascending=False)

# 상위 10개 아이템 선택
top_10_recommendations = top_recommendations.head(10)

# 결과 확인
print(top_10_recommendations[['anime_id', 'Score', 'diversity_score',
                              'Members', 'Popularity', 
                               'Completed', 'On-Hold', 'Dropped',
                              'predicted_rating']])

       anime_id  Score  diversity_score  Members  Popularity  Completed  \
7574       3287   1.85              1.0    24550        3305      18359   
3058        413   2.23              1.0    52059        2216      41464   
1699      16608   2.78              1.0    16117        4042      15032   
33051     33975   4.71              1.0     1964        8893        928   
9344      29949   2.35              1.0    10296        4856       8830   
4013      13405   2.01              1.0    13450        4367      12133   
36638     35971   4.47              1.0     1279        9901        566   
34226      5877   2.50              1.0     4426        6805       2498   
2447      19315   3.41              1.0   149948         997      93235   
2857      16436   3.47              1.0    12832        4427      10316   

       On-Hold  Dropped  predicted_rating  
7574       130      426          7.949843  
3058       253      845          7.578083  
1699        32      122          7.388037 

In [59]:
# 예측 점수가 동일한 경우 다른 컬럼을 기준으로 정렬
top_recommendations = top_recommendations.sort_values(by=['predicted_rating', 'Members'], ascending=[False, False])

# 상위 10개 아이템 선택
top_10_recommendations = top_recommendations.head(10)

# 결과 확인
print(top_10_recommendations[['anime_id', 'predicted_rating', 'Members']])

       anime_id  predicted_rating  Members
7574       3287          7.949843    24550
3058        413          7.578083    52059
1699      16608          7.388037    16117
33051     33975          7.347137     1964
4013      13405          7.228794    13450
9344      29949          7.228794    10296
36638     35971          7.149766     1279
34226      5877          7.065217     4426
2447      19315          7.024113   149948
2857      16436          6.942254    12832


In [60]:
# 애니메이션 데이터와 추천 결과 결합
top_10_recommendations_with_names = top_10_recommendations.merge(anime[['anime_id', 'Name', 'Genres']], 
                                                                 on='anime_id', how='left')

In [61]:
top_10_recommendations_with_names[['anime_id', 'Name',
                                   'predicted_rating', 'Genres', 'Score',
                                    'Members', 'Popularity', 
                               'Completed', 'On-Hold', 'Dropped']]

Unnamed: 0,anime_id,Name,predicted_rating,Genres,Score,Members,Popularity,Completed,On-Hold,Dropped
0,3287,Tenkuu Danzai Skelter+Heaven,7.949843,"Sci-Fi, Mecha",1.85,24550,3305,18359,130,426
1,413,Hametsu no Mars,7.578083,"Sci-Fi, Horror",2.23,52059,2216,41464,253,845
2,16608,Shitcom,7.388037,"Comedy, Romance",2.78,16117,4042,15032,32,122
3,33975,3-Nen D-Gumi Glass no Kamen,7.347137,"Comedy, Parody, School",4.71,1964,8893,928,52,335
4,13405,Utsu Musume Sayuri,7.228794,"Comedy, Dementia",2.01,13450,4367,12133,29,158
5,29949,Nami,7.228794,Dementia,2.35,10296,4856,8830,39,142
6,35971,Onyankopon,7.149766,"Music, Comedy, School",4.47,1279,9901,566,48,316
7,5877,Abunai Sisters: Koko & Mika,7.065217,"Action, Comedy, Ecchi",2.5,4426,6805,2498,111,825
8,19315,Pupa,7.024113,"Fantasy, Horror, Psychological",3.41,149948,997,93235,2677,17444
9,16436,Tenshi no Drop,6.942254,"Comedy, Ecchi, Shounen",3.47,12832,4427,10316,59,132


In [62]:
print(top_10_recommendations_with_names[['anime_id', 
                                   'predicted_rating']])

   anime_id  predicted_rating
0      3287          7.949843
1       413          7.578083
2     16608          7.388037
3     33975          7.347137
4     13405          7.228794
5     29949          7.228794
6     35971          7.149766
7      5877          7.065217
8     19315          7.024113
9     16436          6.942254


In [63]:
from collections import Counter

pred_genres = top_10_recommendations_with_names['Genres'].str.split(', ', expand=True).stack()

# 각 장르의 출현 횟수 계산
pred_genre_counts = pred_genres.value_counts()

pred_genre_counts_df = pred_genre_counts.reset_index()

pred_genre_counts_df.columns = ['Genre', 'Count']

pred_genre_counts_df

Unnamed: 0,Genre,Count
0,Comedy,6
1,Sci-Fi,2
2,Horror,2
3,School,2
4,Dementia,2
5,Ecchi,2
6,Mecha,1
7,Romance,1
8,Parody,1
9,Music,1


In [64]:
# 10번 사용자 확인
user_interactions = interaction_data[interaction_data['user_id']==10]
user_interactions

Unnamed: 0,user_id,anime_id,rating
1081,10,3652,8
1082,10,934,9
1083,10,1889,9
1084,10,10491,5


In [65]:
user_interactions_with_anime = user_interactions.merge(anime[['anime_id', 'Name', 'Genres',
                                                             'Score', 'Members', 'Popularity', 
                                                              'Completed',
                                                              'On-Hold', 'Dropped']], 
                                                       on='anime_id', how='left')
user_interactions_with_anime

Unnamed: 0,user_id,anime_id,rating,Name,Genres,Score,Members,Popularity,Completed,On-Hold,Dropped
0,10,3652,8,Higurashi no Naku Koro ni Rei,"Mystery, Comedy, Psychological, Supernatural, ...",7.44,155511,948,110927,2530,1875
1,10,934,9,Higurashi no Naku Koro ni,"Mystery, Dementia, Horror, Psychological, Supe...",7.95,638491,156,363708,33103,29836
2,10,1889,9,Higurashi no Naku Koro ni Kai,"Mystery, Psychological, Supernatural, Thriller",8.23,385728,344,254705,12031,9344
3,10,10491,5,Higurashi no Naku Koro ni Kira,"Mystery, Parody",6.54,72010,1793,48585,1588,2286


In [66]:
pd.set_option('display.max_colwidth', None)

In [67]:
user_interactions_with_anime

Unnamed: 0,user_id,anime_id,rating,Name,Genres,Score,Members,Popularity,Completed,On-Hold,Dropped
0,10,3652,8,Higurashi no Naku Koro ni Rei,"Mystery, Comedy, Psychological, Supernatural, Thriller",7.44,155511,948,110927,2530,1875
1,10,934,9,Higurashi no Naku Koro ni,"Mystery, Dementia, Horror, Psychological, Supernatural, Thriller",7.95,638491,156,363708,33103,29836
2,10,1889,9,Higurashi no Naku Koro ni Kai,"Mystery, Psychological, Supernatural, Thriller",8.23,385728,344,254705,12031,9344
3,10,10491,5,Higurashi no Naku Koro ni Kira,"Mystery, Parody",6.54,72010,1793,48585,1588,2286


In [68]:
from collections import Counter

all_genres = user_interactions_with_anime['Genres'].str.split(', ', expand=True).stack()

# 각 장르의 출현 횟수 계산
genre_counts = all_genres.value_counts()

genre_counts_df = genre_counts.reset_index()

genre_counts_df.columns = ['Genre', 'Count']

genre_counts_df

Unnamed: 0,Genre,Count
0,Mystery,4
1,Psychological,3
2,Supernatural,3
3,Thriller,3
4,Comedy,1
5,Dementia,1
6,Horror,1
7,Parody,1


In [69]:
print(genre_counts_df)
print(pred_genre_counts_df)

           Genre  Count
0        Mystery      4
1  Psychological      3
2   Supernatural      3
3       Thriller      3
4         Comedy      1
5       Dementia      1
6         Horror      1
7         Parody      1
            Genre  Count
0          Comedy      6
1          Sci-Fi      2
2          Horror      2
3          School      2
4        Dementia      2
5           Ecchi      2
6           Mecha      1
7         Romance      1
8          Parody      1
9           Music      1
10         Action      1
11        Fantasy      1
12  Psychological      1
13        Shounen      1


In [70]:
# 추천 모델과 비교
pred_genre_counts_df

Unnamed: 0,Genre,Count
0,Comedy,6
1,Sci-Fi,2
2,Horror,2
3,School,2
4,Dementia,2
5,Ecchi,2
6,Mecha,1
7,Romance,1
8,Parody,1
9,Music,1


In [71]:
data['rating'].describe()

count    5.803736e+06
mean     7.518300e+00
std      1.697103e+00
min      1.000000e+00
25%      7.000000e+00
50%      8.000000e+00
75%      9.000000e+00
max      1.000000e+01
Name: rating, dtype: float64