#### 데이터 탐색
1. 데이터가 어떤 식으로 구성되어 있는지 파악하고 분석의 방향을 결정
2. user, item의 특성 파악 및 좋은 feature 도출

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
## 올바른 시각화를 위해서 미리미리 matplotlib 설정을 세팅해 놓자
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
import matplotlib.font_manager as fm
from matplotlib import rc

# Path of the custom (Batang) font file used for every plot in this notebook.
fontpath = '/content/drive/MyDrive/Euron/assignment/Week02-Anime_Rec/archive/copyfonts.com_batang.ttc.ttf'

# `font_manager._rebuild()` is a private helper that is absent from recent
# matplotlib releases, so call it only when it exists. Where the public
# `FontManager.addfont` API is available, use it to register the font file so
# the family name resolved below can actually be found.
if hasattr(fm, '_rebuild'):
  fm._rebuild()
if hasattr(fm.fontManager, 'addfont'):
  fm.fontManager.addfont(fontpath)

# Family name embedded in the font file; this is what rcParams expects.
font = fm.FontProperties(fname = fontpath, size = 7).get_name()

# One assignment is enough: `plt.rc('font', family = ...)` and
# `mpl.pyplot.rc(...)` write to this very same rc setting.
plt.rcParams['font.family'] = font

In [3]:
base_dir = '/content/drive/MyDrive/Euron/assignment/Week02-Anime_Rec/archive'

In [4]:
from glob import glob
csv_dirs = glob(base_dir + '/*.csv')

In [5]:
csv_names = list(map(lambda x: x.split('/')[-1], csv_dirs))

In [6]:
## anime.csv (Item database)
anime_df = pd.read_csv(base_dir + '/anime.csv')
anime_df.head(3)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0


In [7]:
anime_df.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [8]:
## Favorites, Watching, Completed (+)
## On-Hold, Plan to Watch (=)
## Dropped (-)

In [9]:
## Plan: turn each watch-status column name of anime_df into an integer code between 0 and 4.
watch_status = ['Watching', 'Completed', 'On-Hold', 'Dropped', 'Plan to Watch']
# enumerate() pairs every status with its position in the list above.
watch_encoder = {status: code for code, status in enumerate(watch_status)}


In [10]:
## anime_with_synopsis.csv (Item database)

synop_df = pd.read_csv(base_dir + '/anime_with_synopsis.csv')
synop_df.head(3)

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."


In [11]:
len(list(set(anime_df['MAL_ID']))), len(list(set(synop_df['MAL_ID'])))
# 줄거리 정보가 주어져 있는 애니메이션 데이터가 실제 user-item interaction이 존재하는 애니메이션의 개수에 비해 부족하다.
# 그렇다면 애니메이션 자체에 대한 정보보다는 오히려 해당 애니메이션을 선호하는 사용자의 취향을 자세히 파악할 수 있는 게 아닐까?

(17562, 16214)

In [12]:
## animelist.csv (User-Item Interaction)

# 사용자가 애니메이션에 대해서 평점을 매기지 않은 경우 0으로 저장이 되어 있음
# watched_episodes에 관한 정보는 해당 애니메이션이 몇화로 이루어져 있는지 모르는 상황이기 때문에..어떤식의 기준으로 적용할지 생각해 봐야 함
list_df = pd.read_csv(base_dir + '/animelist.csv')
list_df.head(3)

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4
2,0,242,10,1,4


In [13]:
## rating_complete.csv (User-Item interaction)
# 각각의 user가 특정 item에 어떤 rating을 매겼는지, 즉 선호도를 파악할 수 있는 자료이다.
# 일반적으로 collaborative filtering을 할때에 많이 사용하는 rating matrix를 만들 수 있음

rating_df = pd.read_csv(base_dir + '/rating_complete.csv')
rating_df.head(3)
# 0으로 평점을 남긴 경우는 없음

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7


In [14]:
## watching_status.csv

watch_df = pd.read_csv(base_dir + '/watching_status.csv')
watch_df.head()

Unnamed: 0,status,description
0,1,Currently Watching
1,2,Completed
2,3,On Hold
3,4,Dropped
4,6,Plan to Watch


#### Collaborative Filtering
- 다른 사용자와의 유사도를 바탕으로 어떤 애니메이션을 좋아할지 예측
1. 유사한 item 찾기
2. 유사한 user 찾기
3. random user에게 item 추천해주기

In [15]:
## Keep only the MAL_ID, name, genre, type, episodes, rating and members columns.
import pandas as pd

# Output column name -> source column name in anime_df, in output order.
column_map = {
    'anime_id' : 'MAL_ID',
    'name' : 'Name',
    'genre' : 'Genres',
    'type' : 'Type',
    'episodes' : 'Episodes',
    'rating' : 'Rating',
    'members' : 'Members',
}

# Materialise each selected column as a plain list keyed by its new name.
new_df = {target: list(anime_df[source]) for target, source in column_map.items()}

new_anime_df = pd.DataFrame(new_df)

In [16]:
new_anime_df.head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,R - 17+ (violence & profanity),1251960
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,R - 17+ (violence & profanity),273145
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,PG-13 - Teens 13 or older,558913


In [17]:
new_rating_df = list_df[['user_id', 'anime_id', 'rating']]

In [18]:
new_rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [19]:
new_rating_df.shape

(109224747, 3)


- 데이터를 합쳐주려 했으나 메모리 초과로 불가능했다.
```
## merge the dataset
anime_full_df = pd.merge(new_anime_df, new_rating_df, on = 'anime_id', suffixes = ['', '_user'])
anime_full_df = anime_full_df.rename(columns = {'name' : 'anime_title', 'rating_user' : 'user_rating'})
anime_full_df.head()
```



In [20]:
## 1. 비슷한 Animation 찾기
# 1-(1)
n_ratings = new_rating_df['user_id'].value_counts()
new_rating_df = new_rating_df[new_rating_df['user_id'].isin(n_ratings[n_ratings >= 400].index)]

In [23]:
# 1-(2) Rating Data를 0.0-1.0사이의 값으로 정규화 한다.
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler()
new_rating_df['rating'] = mmscaler.fit_transform(np.array(new_rating_df['rating']).reshape(-1,1)).squeeze(-1)

In [29]:
# 1-(3) Remove duplicated rows
# Boolean mask: True for every row that repeats an earlier row.
duplicates = new_rating_df.duplicated()

# Rebuild the frame only when at least one repeated row was flagged.
if duplicates.any():
  new_rating_df = new_rating_df.loc[~duplicates]

In [31]:
## 2. Data PreProcessing
# 2-(1) Encoding Categorical Data
# Distinct raw ids, in order of first appearance.
user_ids = new_rating_df['user_id'].unique().tolist()
anime_ids = new_rating_df['anime_id'].unique().tolist()

# decoder: contiguous index -> raw id; encoder: raw id -> contiguous index.
user_decoder = dict(enumerate(user_ids))
user_encoder = {raw_id: index for index, raw_id in user_decoder.items()}

anime_decoder = dict(enumerate(anime_ids))
anime_encoder = {raw_id: index for index, raw_id in anime_decoder.items()}

# Add the contiguous indices as two extra columns next to the raw ids.
new_rating_df['user'] = new_rating_df['user_id'].map(user_encoder)
new_rating_df['anime'] = new_rating_df['anime_id'].map(anime_encoder)

In [None]:
new_rating_df.to_csv(base_dir + '/new_rate.csv', index = False)

In [32]:
print(f"user_num : {len(user_ids)} anime_num : {len(anime_ids)}")

user_num : 91641 anime_num : 17560


In [6]:
new_rating_df = pd.read_csv(base_dir + '/new_rate.csv')

In [7]:
new_rating_df.head(3)

Unnamed: 0,user_id,anime_id,rating,user,anime
0,77277,1894,0.4,19983,797
1,221403,777,0.9,57395,193
2,270652,36474,0.8,70189,2052


In [9]:
# 2-(2) Shuffle the Data - 섞어 주어야지 train test split할때 골고루 분배됨
new_rating_df = new_rating_df.sample(frac = 1, random_state = 42)
X = new_rating_df[['user', 'anime']].values
y = new_rating_df['rating']

In [11]:
# 2-(3) Split into train and test set
test_set_size = 10000
train_indices = new_rating_df.shape[0] - test_set_size

X_train, X_test, y_train, y_test = (
    X[:train_indices], X[train_indices:],
    y[:train_indices], y[train_indices:]
)

print(f"train_num : {len(X_train)}")

train_num : 71408113


In [None]:
# 2-(4) Data Utils
def load_all(path = None):
  """Load the encoded rating table that was saved as `new_rate.csv`.

  Args:
    path: CSV file to read. When omitted, `base_dir + '/new_rate.csv'` is used.

  Returns:
    The file contents as a pandas DataFrame.
  """
  if path is None:
    path = base_dir + '/new_rate.csv'
  # The frame used to be read into a local and then dropped, so every caller
  # received None; hand it back instead.
  new_rating_df = pd.read_csv(path)
  return new_rating_df

In [20]:
## 3.Building the Dataset
from torch.utils.data import Dataset, DataLoader

class AnimeDataset(Dataset):
  """Map-style dataset over encoded (user, anime) index pairs and their ratings.

  Args:
    X: 2-D array; column 0 is read as the user index, column 1 as the anime index.
    y: ratings aligned row-by-row with X (anything `np.array` accepts).
    mode: stored on the instance; this class does not use it otherwise.
  """
  def __init__(self, X, y, mode = 'train'):
    self.mode = mode
    self.user = X[:,0]
    self.anime = X[:,1]
    self.y = np.array(y)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    """Return one sample as a dict with 'user', 'anime' and 'rate' tensors."""
    # Indexing a 1-D array with an int gives a NumPy scalar, which
    # torch.from_numpy rejects (it only accepts ndarrays); torch.tensor takes it.
    # Indices are int64 so they can feed nn.Embedding; the rating is float32.
    return {
        'user': torch.tensor(self.user[idx], dtype = torch.long),
        'anime' : torch.tensor(self.anime[idx], dtype = torch.long),
        'rate' : torch.tensor(self.y[idx], dtype = torch.float32)
    }


In [21]:
train_dataset = AnimeDataset(X_train, y_train)
test_dataset = AnimeDataset(X_test, y_test)

# 3-(2). Generate the Dataloader

train_dataloader = DataLoader(train_dataset, batch_size = 256, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = 256, shuffle = False)

In [33]:
## 4. Building Model
import torch
import torch.nn as nn

class RecNet(nn.Module):
  """Embedding-based predictor that scores (user, anime) index pairs in (0, 1).

  Both ids are looked up in their own embedding table, the two vectors are
  combined according to `mode`, and a linear layer followed by a sigmoid
  produces one score per pair.

  Args:
    user_num: number of distinct user indices (rows of the user table).
    anime_num: number of distinct anime indices (rows of the anime table).
    emb_size: width of each embedding vector.
    mode: how the two vectors are combined before the linear layer:
      'dot' - element-wise product (emb_size features),
      'cat' - concatenation along the feature axis (2 * emb_size features),
      'mul' - inner product of the two vectors (1 feature).

  Raises:
    ValueError: if `mode` is not one of the three values above.
  """
  def __init__(self, user_num, anime_num, emb_size = 128, mode = 'dot'):
    super(RecNet, self).__init__()
    if mode not in ('dot', 'cat', 'mul'):
      raise ValueError(f"unknown mode: {mode!r} (expected 'dot', 'cat' or 'mul')")
    self.mode = mode

    # num_embeddings is the vocabulary size and embedding_dim the vector width.
    # The two were swapped before, which made any index >= emb_size fail and
    # produced vectors as wide as the whole vocabulary.
    self.user_embedding = nn.Embedding(num_embeddings = user_num, embedding_dim = emb_size, max_norm = 1.0)
    self.anime_embedding = nn.Embedding(num_embeddings = anime_num, embedding_dim = emb_size, max_norm = 1.0)

    # The linear layer must match the width of whatever `forward` hands to it.
    in_features = {'dot': emb_size, 'cat': 2 * emb_size, 'mul': 1}[mode]
    self.linear = nn.Linear(in_features, 1)
    self.sigmoid = nn.Sigmoid()

    self._init_weight_()

  def _init_weight_(self):
    """Initialise the embeddings with small normal noise and zero every linear bias."""
    nn.init.normal_(self.user_embedding.weight, std = 0.01)
    nn.init.normal_(self.anime_embedding.weight, std = 0.01)
    nn.init.kaiming_uniform_(self.linear.weight, a = 1, nonlinearity = 'sigmoid')

    for m in self.modules():
      if isinstance(m, nn.Linear) and m.bias is not None:
        m.bias.data.zero_()

  def forward(self, user, anime):
    """Return a flat tensor holding one sigmoid score per (user, anime) pair."""
    user_embedding = self.user_embedding(user)
    anime_embedding = self.anime_embedding(anime)
    if self.mode == 'mul':
      # Per-pair inner product. torch.matmul on two (batch, emb) tensors is
      # not a valid matrix product, and the result was bound to a misspelled name.
      added = (user_embedding * anime_embedding).sum(dim = -1, keepdim = True)
    elif self.mode == 'cat':
      # Concatenate features, not samples: dim = 0 would stack the batch axis.
      added = torch.cat([user_embedding, anime_embedding], dim = -1)
    else:
      added = user_embedding * anime_embedding

    output = self.sigmoid(self.linear(added))

    return output.view(-1)



In [36]:
## 5. CREATE MODEL
def create_model(user_num, anime_num, device = None):
  """Build the network together with its loss function and optimizer.

  Args:
    user_num: number of distinct user indices passed on to RecNet.
    anime_num: number of distinct anime indices passed on to RecNet.
    device: where to place the model. When omitted, CUDA is used if it is
      available and the CPU otherwise.

  Returns:
    A `(net, criterion, optimizer)` tuple.
  """
  if device is None:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  net = RecNet(user_num, anime_num).to(device)
  # RecNet ends in a sigmoid and returns one value per pair, and the ratings
  # were min-max scaled into [0, 1]; BCELoss fits that pairing, whereas
  # CrossEntropyLoss expects unnormalised per-class logits.
  criterion = nn.BCELoss()
  optimizer = torch.optim.Adam(net.parameters(), lr = 0.001)

  return net, criterion, optimizer
user_num = 91641 
anime_num = 17560
net, criterion, optimizer = create_model(user_num, anime_num)

In [None]:
## 6. EVALUATE
def metrics(model, test_loader, top_k = 10):
  """Collect, for every batch, the anime ids with the highest predicted scores.

  Args:
    model: callable taking `(user, anime)` tensors and returning one score per pair.
    test_loader: iterable of batches, either dicts with 'user' and 'anime'
      entries or sequences whose first two items are the user and anime tensors.
    top_k: how many ids to keep per batch; capped at the batch size.

  Returns:
    A list with one entry per batch, each a list of anime ids ordered from the
    highest predicted score downwards.
  """
  # Run on whatever device the model's parameters live on; a model without
  # parameters is evaluated on the CPU.
  parameters = getattr(model, 'parameters', None)
  first_param = next(iter(parameters()), None) if callable(parameters) else None
  device = first_param.device if first_param is not None else torch.device('cpu')

  recommendations = []
  # Inference only: no gradients are needed.
  with torch.no_grad():
    for batch in test_loader:
      if isinstance(batch, dict):
        user, anime = batch['user'], batch['anime']
      else:
        user, anime = batch[0], batch[1]
      user = user.to(device)
      anime = anime.to(device)

      pred = model(user, anime)
      # Pick the k highest scores; topk raises when k exceeds the batch size.
      k = min(top_k, pred.numel())
      _, indices = torch.topk(pred, k)
      recommendations.append(torch.take(anime, indices).cpu().tolist())

  # TODO(review): a hit-rate / NDCG style score needs a ground-truth item per
  # batch; nothing in this loader layout marks one, so only the picks are returned.
  return recommendations
    

In [None]:
## 7.TRAINING
import time
from tqdm import tqdm

if __name__ == "__main__":
  best_loss = np.inf
  # Follow the device the model already lives on instead of assuming CUDA.
  device = next(net.parameters()).device

  for epoch in range(20):
    net.train()
    start_time = time.time()

    # Wrap the loader itself so the progress bar advances once per batch;
    # tqdm(len(...)) only wrapped an integer and never moved.
    train_loop = tqdm(train_dataloader)
    for batch in train_loop:
      # AnimeDataset returns dicts, so each collated batch is a dict of tensors.
      user = batch['user'].to(device)
      anime = batch['anime'].to(device)
      label = batch['rate'].float().to(device)

      # reset the gradients
      net.zero_grad()
      pred = net(user, anime)
      loss = criterion(pred, label)
      loss.backward()
      optimizer.step()
      train_loop.set_postfix({'train loss' : loss.item()})

    net.eval()
    # Sum of the per-batch test losses for this epoch; it has to start from zero.
    running_loss = 0.0
    test_loop = tqdm(test_dataloader)
    # Evaluation needs no gradients.
    with torch.no_grad():
      for batch in test_loop:
        user = batch['user'].to(device)
        anime = batch['anime'].to(device)
        pred = net(user, anime)
        label = batch['rate'].float().to(device)
        loss = criterion(pred, label)
        test_loop.set_postfix({'test loss' : loss.item()})

        # Accumulate a plain float rather than a tensor.
        running_loss += loss.item()

    print(f"epoch {epoch}: test loss {running_loss:.4f} ({time.time() - start_time:.1f}s)")

    # Keep a checkpoint whenever the summed test loss improves.
    if running_loss < best_loss:
      best_loss = running_loss
      torch.save(
          net, "{}-{}.pth".format(base_dir, epoch)
      )



#### Using More Meta Data
- 평점 데이터 이외에도 줄거리, 점수, 이름, 재생 타입, 장르 등 다양한 정보를 사용해서 유사도를 측정할 수 있다.

In [None]:
## merge the dataset
anime_full_df = new_anime_df.merge(new_rating_df, on = 'anime_id', suffixes = ['', '_user'])
anime_full_df = anime_full_df.rename(columns = {'name' : 'anime_title', 'rating_user' : 'user_rating'})
anime_full_df.dropna(axis = 0, how = 'any', inplace = True)
anime_full_df.head()

In [None]:
counts = anime_full_df['user_id'].value_counts()
# user-item interaction, 즉 평점을 남기는 상호작용을 한 개수가 user당 
# 적어도 400개는 되어야 한다.

anime_full_df = anime_full_df[anime_full_df['user_id'].isin(counts[counts >= 400].index)] 
anime_pivot = anime_full_df.pivot_table(index = 'anime', columns = 'user', values = 'user_rating')
anime_pivot.head()

In [None]:
## 1. Cosine Similarity using KNN
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# pivot_table leaves NaN wherever a user has no rating for an anime. Store
# those cells as 0 so the matrix is finite and the missing entries are not
# kept as explicit values in the sparse matrix.
anime_matrix = csr_matrix(anime_pivot.fillna(0).values)

# The keyword is `algorithm`; the misspelled `algrorithm` is rejected as an
# unexpected argument.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(anime_matrix)
# Computing cosine similarity with sklearn and looking up the K nearest
# neighbours in this way is the approach used here to fill in the sparse
# rating matrix.