<a href="https://colab.research.google.com/github/Chei-YuanChi/Recommender/blob/main/Recommender_HW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from tqdm.notebook import tqdm

from google.colab import drive
# Mount Google Drive at /content/gdrive; later cells read the ratings CSV from it and write the recommendations back to it.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Read the ratings table from the mounted Google Drive folder.
ratings_csv_path = '/content/gdrive/My Drive/AI/ratings_small.csv'
df = pd.read_csv(ratings_csv_path)
df  # last expression of the cell: renders the DataFrame as the cell output

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [3]:
class TrainDataset(Dataset):
    """Implicit-feedback training set built from observed (user, movie) pairs.

    Every distinct ``(userId, movieId)`` pair found in ``df`` becomes one
    positive sample (label 1) and is followed by ``num_neg`` randomly drawn
    movies that the same user has no pair with in ``df`` (label 0).

    Args:
        df: DataFrame with ``userId`` and ``movieId`` columns.
        all_movies: Array/sequence of candidate movie ids negatives are drawn from.
        num_neg: Number of negative samples generated per positive pair.
    """

    def __init__(self, df, all_movies, num_neg=4):
        self.users, self.items, self.labels = self.get_dataset(df, all_movies, num_neg)

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` triple stored at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, df, all_movies, num_neg=4):
        """Build the user, item and label tensors for ``df``.

        Args:
            df: DataFrame with ``userId`` and ``movieId`` columns.
            all_movies: Array/sequence of candidate movie ids.
            num_neg: Negative samples to draw for each positive pair.

        Returns:
            Tuple ``(users, items, labels)`` of equally long 1-D tensors.
        """
        users, items, labels = [], [], []
        # Build the positives from the frame that was passed in. Reading a
        # notebook-level global here instead would silently ignore `df`.
        user_item_set = set(zip(df['userId'], df['movieId']))

        # A user who already interacted with every candidate movie has no
        # valid negative, and rejection sampling would spin forever for them.
        catalogue = set(np.asarray(all_movies).tolist())
        seen_by_user = {}
        for u, i in user_item_set:
            seen_by_user.setdefault(u, set()).add(i)
        can_sample = {u: not catalogue <= seen for u, seen in seen_by_user.items()}

        for (u, i) in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            if not can_sample[u]:
                continue
            for _ in range(num_neg):
                # Rejection sampling: redraw until the movie is unseen by this user.
                neg_item = int(np.random.choice(all_movies))
                while (u, neg_item) in user_item_set:
                    neg_item = int(np.random.choice(all_movies))
                users.append(u)
                items.append(neg_item)
                labels.append(0)
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [4]:
class NCF(pl.LightningModule):
    """Neural collaborative filtering model.

    User and item ids are looked up in two 8-dimensional embedding tables,
    the embeddings are concatenated and passed through two ReLU layers
    (16 -> 64 -> 32) and a final sigmoid unit that scores the pair.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        df: Ratings frame handed to ``TrainDataset`` when the dataloader is built.
        all_movies: Candidate movie ids handed to ``TrainDataset``.
    """

    def __init__(self, num_users, num_items, df, all_movies):
        super().__init__()
        # Layers are created in a fixed order so random initialisation is
        # reproducible under a given seed.
        self.user_embedding = nn.Embedding(num_users, 8)
        self.item_embedding = nn.Embedding(num_items, 8)
        self.fc1 = nn.Linear(16, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.df = df
        self.all_movies = all_movies

    def forward(self, user_input, item_input):
        """Score each (user, item) pair; returns the sigmoid of the output layer."""
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)

        # Concatenate along dim 1 so each row holds [user embedding | item embedding].
        hidden = torch.cat([user_vec, item_vec], dim=1)
        hidden = torch.relu(self.fc1(hidden))
        hidden = torch.relu(self.fc2(hidden))

        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Return the binary cross-entropy loss for one ``(users, items, labels)`` batch."""
        user_input, item_input, labels = batch
        scores = self(user_input, item_input)
        # Labels are reshaped to a float column so they line up with the model output.
        targets = labels.view(-1, 1).float()
        criterion = nn.BCELoss()
        return criterion(scores, targets)

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters over all model parameters."""
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

    def train_dataloader(self):
        """Build a new ``TrainDataset`` from the stored frame and wrap it in a DataLoader."""
        dataset = TrainDataset(self.df, self.all_movies)
        return DataLoader(dataset, batch_size=512, num_workers=4)

    

In [5]:
# The embedding tables are indexed directly by the raw ids, so each needs (max id + 1) rows.
num_users = df['userId'].max() + 1
num_items = df['movieId'].max() + 1

all_movies = df['movieId'].unique()

# Leave-one-out split: rank every user's ratings from newest (rank 1) to oldest.
df['rank_latest'] = df.groupby(['userId'])['timestamp'].rank(method = 'first', ascending = False)

# .copy() makes both splits independent frames, so the assignment below
# does not write into a slice of `df` (avoids SettingWithCopyWarning).
split_columns = ['userId', 'movieId', 'rating']
train_data = df.loc[df['rank_latest'] != 1, split_columns].copy()
test_data = df.loc[df['rank_latest'] == 1, split_columns].copy()

# Implicit feedback: every observed rating counts as a positive interaction.
train_data.loc[:, 'rating'] = 1

# Use the GPU when one is present and fall back to CPU otherwise.
# Dataloaders are rebuilt every epoch (reload_dataloaders_every_n_epochs = 1).
trainer = pl.Trainer(max_epochs = 5, gpus = int(torch.cuda.is_available()), reload_dataloaders_every_n_epochs = 1, progress_bar_refresh_rate = 50, logger = False, checkpoint_callback = False)
model = NCF(num_users, num_items, train_data, all_movies)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 5.4 K 
1 | item_embedding | Embedding | 1.3 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.281     Total estimated model params size (MB)
  cpuset_checked))


Training: -1it [00:00, ?it/s]

In [6]:
# Hit rate @ 10: for each held-out (user, movie) pair, rank the held-out movie
# against up to 99 movies the user never interacted with and check whether it
# lands in the ten highest-scored items.
test_user_item_set = set(zip(test_data['userId'], test_data['movieId']))

user_interacted_items = df.groupby('userId')['movieId'].apply(list).to_dict()

# Loop-invariant, so it is built once instead of once per test pair.
all_movie_set = set(all_movies)
num_eval_negatives = 99

hits = []
for (u, i) in tqdm(test_user_item_set):
  not_interacted_items = list(all_movie_set - set(user_interacted_items[u]))
  # replace = False keeps the negatives distinct; np.random.choice samples
  # with replacement by default, which can put the same movie in the list twice.
  n_sampled = min(num_eval_negatives, len(not_interacted_items))
  selected_not_interacted = np.random.choice(not_interacted_items, n_sampled, replace = False).tolist()
  test_items = selected_not_interacted + [i]

  # Inference only: no gradients are needed.
  with torch.no_grad():
    predicted_labels = model(torch.tensor([u] * len(test_items)), torch.tensor(test_items)).detach().reshape(-1).numpy()

  # `pos` indexes into test_items; it is kept distinct from the held-out movie id `i`.
  top_10_items = [test_items[pos] for pos in np.argsort(predicted_labels)[::-1][0:10].tolist()]

  hits.append(1 if i in top_10_items else 0)

print('Hit rate of top 10 is {:.2f}'.format(np.average(hits)))

  0%|          | 0/671 [00:00<?, ?it/s]

Hit rate of top 10 is 0.54


In [7]:
# Hand the complete ratings table (held-out rows included) to a new NCF instance.
train_data_all = df.loc[:, ['userId', 'movieId', 'rating']]
model = NCF(num_users = num_users, num_items = num_items, df = train_data_all, all_movies = all_movies)
# NOTE(review): this reuses the `trainer` created in the earlier training cell, which has
# already run a fit; confirm the installed pytorch_lightning version trains again from epoch 0.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 5.4 K 
1 | item_embedding | Embedding | 1.3 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.281     Total estimated model params size (MB)


Training: 970it [00:00, ?it/s]

  cpuset_checked))


In [12]:
from collections import defaultdict
uid = input('input user ID for recommending movies : ')
uid = int(uid)
# Fail before the scoring loop instead of with a KeyError after it.
if uid not in user_interacted_items:
  raise ValueError('user ID {} does not appear in the ratings data'.format(uid))

# Loop-invariant, so it is built once instead of once per user.
all_movie_set = set(all_movies)
num_candidates = 99
num_recommend = 2

# Recommendations are computed for every user because a later cell exports the whole table.
top_2_items = {}
for u in df.userId.unique():
  not_interacted_items = list(all_movie_set - set(user_interacted_items[u]))
  # replace = False keeps candidates distinct, so the two recommendations
  # can never be the same movie.
  n_sampled = min(num_candidates, len(not_interacted_items))
  test_items = np.random.choice(not_interacted_items, n_sampled, replace = False).tolist()
  if test_items:
    # Inference only: no gradients are needed.
    with torch.no_grad():
      predicted_labels = model(torch.tensor([int(u)] * len(test_items)), torch.tensor(test_items)).detach().reshape(-1).numpy()
    best = [test_items[pos] for pos in np.argsort(predicted_labels)[::-1][0:num_recommend].tolist()]
  else:
    best = []
  # Pad to a fixed length so every user contributes an equally long column to the export.
  top_2_items[u] = best + [None] * (num_recommend - len(best))
print('Recommend movies for user {} :'.format(uid))
for i in top_2_items[uid]:
  if i is not None:
    print('Movie ID :', i)

input user ID for recommending movies : 4
Recommend movies for user 4 :
Movie ID : 760
Movie ID : 4381


In [15]:
# One column per user id, one row per recommendation rank; written to Google Drive.
recommendations = pd.DataFrame(top_2_items)
recommendations.to_csv("/content/gdrive/My Drive/AI/recommend.csv")