In [1]:
import numpy as np 
import pandas as pd 
import random

# 랜덤 시드 고정
seed = 42
random.seed(seed)
np.random.seed(seed)

In [2]:
# 애니메이션 정보 데이터

anime = pd.read_csv("/kaggle/input/anime-recommendation-database-2020/anime.csv")
print(len(anime))
anime.head(1)

17562


Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0


In [3]:
# 평점 정보 데이터

user_scores = pd.read_csv("/kaggle/input/anime-recommendation-database-2020/rating_complete.csv")
print(len(user_scores))
user_scores.head(1)

57633278


Unnamed: 0,user_id,anime_id,rating
0,0,430,9


In [4]:
user_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


In [5]:
# user_id가 n의 배수인 행 추출
# 하이브리드 모델의 자원 문제로 10의 배수 아이디인 사용자만 한정
# svd 모델에서도 똑같이 맞춰줌
user_scores = user_scores[user_scores['user_id'] % 10 == 0]
print(len(user_scores))

5815169


In [6]:
# NaN 값이 있는 행
num_nan_rows = user_scores.isna().any(axis=1).sum()
print(num_nan_rows)

0


In [7]:
user_scores = user_scores.dropna()

In [8]:
user_scores.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5815169 entries, 0 to 57633099
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 177.5 MB


In [9]:
merged_df = pd.merge(user_scores, 
                     anime[['MAL_ID', 'Name', 'Genres']],
                     left_on='anime_id', 
                     right_on='MAL_ID', 
                     how='left')

In [10]:
user_scores = merged_df[['user_id', 'anime_id', 'rating', 
                         'Name', 'Genres']]
user_scores.head(2)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,0,430,9,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
1,0,1004,5,Kanojo to Kanojo no Neko,"Drama, Psychological, Romance, Slice of Life"


In [11]:
print('사용자의 수 :', len(user_scores['user_id'].unique()))
print('애니메이션의 수 :', len(user_scores['anime_id'].unique()))

사용자의 수 : 30936
애니메이션의 수 : 15929


In [12]:
# user_id가 n의 배수인 행 추출
user_scores = user_scores[user_scores['user_id'] % 10 == 0]

In [13]:
print('수정된 데이터셋의 길이 :', len(user_scores))

수정된 데이터셋의 길이 : 5815169


In [14]:
from surprise import Dataset, Reader, accuracy

# Surprise 라이브러리용 데이터 준비
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(user_scores[['user_id', 'anime_id', 'rating']], reader)

In [15]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, 
                                     test_size=0.3,
                                    random_state = 42)

In [16]:
from surprise import SVD

# 하이퍼파라미터 기본값으로 설정함 (변경시 성능 더 떨어짐)
# n_factors=100, n_epochs=20, biased=True, lr_all=0.005, reg_all=0.02
svd = SVD(n_factors=100, 
          n_epochs=20, 
          biased=True, 
          lr_all=0.005, 
          reg_all=0.02)

# 훈련 시간 길다
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d2af243cf70>

In [17]:
from surprise import accuracy

# 모델 평가
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.1511
MAE:  0.8591


In [18]:
# 사용자에게 추천
# 아이디가 정수인지 다시 확인
user_scores['user_id'] = user_scores['user_id'].astype(int)

In [19]:
# 사용자 ID에 애니메이션 추천
user_id = 10

In [20]:
# 사용자가 이미 평가한 애니메이션 목록
rated_anime_ids = user_scores[user_scores['user_id'] == user_id]['anime_id'].tolist()
print(rated_anime_ids)

[3652, 934, 1889, 10491]


In [21]:
# 모든 애니메이션 ID
all_anime_ids = anime['MAL_ID'].unique()

# 사용자가 평가하지 않은 애니메이션 ID
unrated_anime_ids = [anime_id for anime_id in all_anime_ids if anime_id not in rated_anime_ids]

In [22]:
# 예측 평점
predictions = [svd.predict(user_id, anime_id) for anime_id in unrated_anime_ids]

In [23]:
# 예측 평점이 높은 순으로 정렬
predictions.sort(key=lambda x: x.est, reverse=True)

In [24]:
# 추천 결과를 저장할 리스트
recommendations_list = []

# 추천 결과에서 상위 10개 항목 추출
top_10_recommendations = predictions[:10]

# 추천 결과 리스트 생성
for pred in top_10_recommendations:
    anime_id = pred.iid
    
    # 애니메이션 데이터에서 제목과 장르 추출
    recommendations_list.append({
        'anime_id': anime_id,  # 애니메이션 ID 추가
        'Predicted Rating': pred.est
    })
    
# 리스트를 데이터프레임으로 변환
recommendations_df = pd.DataFrame(recommendations_list)

In [29]:
print(recommendations_df)

       anime_id  Predicted Rating  \
0           820          9.533816   
949          19          9.524660   
4170      21939          9.518805   
6119      33050          9.513298   
6312       9969          9.487867   
9069        338          9.464964   
9608       2251          9.429065   
15648     24701          9.354969   
17230     33255          9.342817   
20700     28977          9.342192   

                                                    Name  \
0                                   Ginga Eiyuu Densetsu   
949                                              Monster   
4170                                  Mushishi Zoku Shou   
6119   Fate/stay night Movie: Heaven's Feel - III. Sp...   
6312                                            Gintama'   
9069                                  Versailles no Bara   
9608                                            Baccano!   
15648                      Mushishi Zoku Shou 2nd Season   
17230                               Saiki Kusuo no Ψ

In [25]:
# user_scores 데이터프레임과 병합 (병합 기준: 'anime_id')
recommendations_df = recommendations_df.merge(user_scores[['anime_id', 'Name', 'Genres']], 
                                               on='anime_id', how='left')


In [26]:
# 중복된 행 제거
recommendations_df = recommendations_df.drop_duplicates()

# 결과 출력
print("추천 결과 데이터프레임:")
recommendations_df

추천 결과 데이터프레임:


Unnamed: 0,anime_id,Predicted Rating,Name,Genres
0,820,9.533816,Ginga Eiyuu Densetsu,"Military, Sci-Fi, Space, Drama"
949,19,9.52466,Monster,"Drama, Horror, Mystery, Police, Psychological,..."
4170,21939,9.518805,Mushishi Zoku Shou,"Adventure, Slice of Life, Mystery, Historical,..."
6119,33050,9.513298,Fate/stay night Movie: Heaven's Feel - III. Sp...,"Action, Supernatural, Magic, Fantasy"
6312,9969,9.487867,Gintama',"Action, Sci-Fi, Comedy, Historical, Parody, Sa..."
9069,338,9.464964,Versailles no Bara,"Military, Historical, Drama, Romance, Shoujo"
9608,2251,9.429065,Baccano!,"Action, Comedy, Historical, Mystery, Supernatural"
15648,24701,9.354969,Mushishi Zoku Shou 2nd Season,"Adventure, Fantasy, Historical, Mystery, Seine..."
17230,33255,9.342817,Saiki Kusuo no Ψ-nan,"Comedy, School, Shounen, Slice of Life, Supern..."
20700,28977,9.342192,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."


In [27]:
# 10번 사용자 확인
user_interactions = user_scores[user_scores['user_id']==10]
user_interactions

Unnamed: 0,user_id,anime_id,rating,Name,Genres
35,10,3652,8,Higurashi no Naku Koro ni Rei,"Mystery, Comedy, Psychological, Supernatural, ..."
36,10,934,9,Higurashi no Naku Koro ni,"Mystery, Dementia, Horror, Psychological, Supe..."
37,10,1889,9,Higurashi no Naku Koro ni Kai,"Mystery, Psychological, Supernatural, Thriller"
38,10,10491,5,Higurashi no Naku Koro ni Kira,"Mystery, Parody"
