In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from GetRecallMovie import *
from torchvision import datasets, transforms
from torchvision.models import resnet50

In [3]:
def find_top_view_movies(ratings_path='data_raw/ratings.csv', top_n=100):
    """Return the ids of the most frequently rated movies.

    Args:
        ratings_path: CSV file with a ``movieId`` column, one row per rating.
            Defaults to the full raw ratings file.
        top_n: Maximum number of movie ids to return.

    Returns:
        numpy array of at most ``top_n`` movie ids, ordered from the most to the
        least frequent ``movieId`` (``value_counts`` sorts by descending count).
    """
    # Only the movieId column is counted, so the other columns are not parsed.
    ratings = pd.read_csv(ratings_path, usecols=['movieId'])
    return ratings['movieId'].value_counts().index.values[:top_n]

In [None]:
# Load the feature table (its userId/movieId columns are used by the functions below)
# and materialise the most frequent movie ids as a list. recall_movies() reads the
# module-level `top_view_movies` as its padding pool, so it must be bound first.
df = pd.read_csv("data/dataset_feature.csv")
top_view_movies = list(find_top_view_movies())
def recall_movies(df, n_recall=100, covisitation=None, fallback_movies=None):
    """Build a fixed-size list of candidate movie ids from one user's history.

    Candidates are taken from the co-visitation lists of the movies in ``df`` and
    ranked by how often they occur across those lists. Movies already present in
    ``df`` are dropped, and the list is padded with fallback ids until it holds
    ``n_recall`` entries.

    Args:
        df: DataFrame with a ``movieId`` column (the user's history).
        n_recall: Number of candidate ids to return.
        covisitation: Optional mapping ``movieId -> iterable of related movie ids``.
            When omitted it is loaded from ``data/Movie2Movie.csv`` on every call;
            pass a preloaded mapping to avoid re-reading the file for each user.
        fallback_movies: Optional sequence of ids used for padding. Defaults to
            the module-level ``top_view_movies``.

    Returns:
        List of ``n_recall`` movie ids, co-visitation candidates first.

    Raises:
        IndexError: if the fallback pool is exhausted before ``n_recall`` ids
            have been collected.
    """
    # Imported locally so the function does not rely on a star-import re-exporting them.
    import itertools
    from collections import Counter

    if covisitation is None:
        covisitation = df_covisitation_to_dict(pd.read_csv('data/Movie2Movie.csv'))

    # dict.fromkeys de-duplicates while keeping first-seen order; that order also
    # decides how the Counter below breaks ties between equally frequent candidates.
    watched = list(dict.fromkeys(df.movieId.tolist()))
    watched_set = set(watched)

    # generate from movie2movie
    related = itertools.chain.from_iterable(
        covisitation[movie_id] for movie_id in watched if movie_id in covisitation
    )
    # NOTE(review): watched movies are removed *after* the top n_recall are taken,
    # so fewer than n_recall co-visitation candidates may survive. Kept as-is to
    # leave the generated dataset unchanged; confirm this is intended.
    recall_id = [
        movie_id
        for movie_id, _count in Counter(related).most_common(n_recall)
        if movie_id not in watched_set
    ]

    # add top_view_movies up to n_recall
    if len(recall_id) < n_recall:
        pool = top_view_movies if fallback_movies is None else fallback_movies
        chosen = set(recall_id)
        for movie_id in pool:
            if len(recall_id) >= n_recall:
                break
            if movie_id not in chosen:
                recall_id.append(movie_id)
                chosen.add(movie_id)
        if len(recall_id) < n_recall:
            raise IndexError(
                f"fallback movies exhausted: collected {len(recall_id)} of {n_recall} recall ids"
            )
    return recall_id

def get_result(df, history_size=30):
    """Label one user's recalled candidates against their held-out movies.

    The first ``history_size`` rows of ``df`` are passed to ``recall_movies`` as
    the user's history; the remaining rows are the held-out part.

    Args:
        df: One user's rows, with a ``movieId`` column.
        history_size: Number of leading rows used as history (default 30).

    Returns:
        DataFrame with one row per recalled candidate: ``movieId`` is the
        candidate id and ``Lable`` is 1 if that id occurs among the held-out
        rows, otherwise 0.
    """
    history = df.iloc[:history_size]
    recall_id = recall_movies(history)
    # A set gives O(1) membership tests instead of scanning the held-out list
    # once per candidate.
    held_out = set(df.iloc[history_size:]["movieId"])
    binary_list = [1 if movie_id in held_out else 0 for movie_id in recall_id]
    # The 'Lable' spelling is kept on purpose: it is the column name of the result.
    return pd.DataFrame({'movieId': recall_id, 'Lable': binary_list})

def get_dataset_id(df):
    """Apply ``get_result`` to every ``userId`` group of ``df``.

    Returns whatever ``groupby('userId').apply`` assembles from the per-user
    frames produced by ``get_result``.
    """
    per_user = df.groupby('userId')
    return per_user.apply(get_result)

# Build the candidate/label table for every user: get_dataset_id() groups `df`
# by userId and applies get_result() to each group.
df_set = get_dataset_id(df)

In [70]:
# Persist the per-user candidate/label table. to_csv keeps its default
# index=True, so the index of `df_set` is written alongside the columns.
df_set.to_csv("./data/dataset.csv")

In [5]:
# Split by user (not by row) so that all rows of a user land in exactly one of
# the two output files.
SPLIT_SEED = 42  # fixed so that re-running this cell reproduces the same split
df = pd.read_csv("./data/dataset_feature.csv")
train_ratio = 0.8

# A local, seeded generator keeps the shuffle reproducible without touching
# NumPy's global random state.
split_rng = np.random.default_rng(SPLIT_SEED)
unique_user = split_rng.permutation(df["userId"].unique())
n = len(unique_user)
n_train_users = int(n * train_ratio)
train_user = unique_user[:n_train_users]
test_user = unique_user[n_train_users:]

train_df = df[df['userId'].isin(train_user)]
test_df = df[df['userId'].isin(test_user)]

# NOTE: the default index column is written deliberately unchanged. MovieDataset
# addresses columns of these files by position, so dropping it would shift them.
train_df.to_csv("./data/train_set.csv")
test_df.to_csv("./data/test_set.csv")

In [6]:
import torchvision.transforms as transforms
def split_train_test(X, y, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into train and test parts via ``train_test_split``.

    Args:
        X: Feature container forwarded to ``train_test_split``.
        y: Target container forwarded to ``train_test_split``.
        test_size: Forwarded as ``test_size`` (default 0.2).
        random_state: Forwarded as ``random_state`` (default 42).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # NOTE(review): `train_test_split` is not imported in the visible cells; it is
    # presumably sklearn.model_selection.train_test_split provided by the
    # GetRecallMovie star-import — confirm, otherwise this raises NameError.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


class MovieDataset(Dataset):
    """Serve a flat CSV as consecutive, fixed-size groups of rows.

    Sample ``i`` covers rows ``[i * num_rows, (i + 1) * num_rows)`` of the file.
    The column at position ``num_column`` supplies the labels and every column
    after it supplies the features. Trailing rows that do not fill a complete
    group are ignored.

    Args:
        csv_file: Path of the CSV file; it is read fully into memory.
        num_rows: Number of rows per sample.
        num_column: 0-based position of the label column.
        transform: Stored on the instance; ``__getitem__`` does not apply it.
    """

    def __init__(self, csv_file, num_rows=100, num_column=4, transform=None):
        self.data = pd.read_csv(csv_file)
        self.num_rows = num_rows
        self.num_column = num_column
        self.transform = transform

    def __len__(self):
        """Return the number of complete ``num_rows``-sized groups in the file."""
        return len(self.data) // self.num_rows

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for sample ``idx`` as float32 tensors.

        ``features`` has shape ``(num_rows, n_feature_columns)`` and ``labels``
        has shape ``(num_rows,)``. Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is out of range. This is also what lets plain
                iteration (``for sample in dataset``) terminate.
        """
        n_samples = len(self)
        if idx < 0:
            # Rebind instead of `+=` so a tensor/array index is not mutated in place.
            idx = idx + n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for dataset of {n_samples} samples")

        start_idx = idx * self.num_rows
        rows = self.data.iloc[start_idx:start_idx + self.num_rows]
        data = torch.tensor(rows.iloc[:, self.num_column + 1:].values, dtype=torch.float32)
        label = torch.tensor(rows.iloc[:, self.num_column].values, dtype=torch.float32)
        return data, label

# --- Datasets and loaders ----------------------------------------------------
batchsize = 1

# The ToTensor transform is handed to MovieDataset, which only stores it.
trainset = MovieDataset('./data/train_set.csv', transform=transforms.ToTensor())
testset = MovieDataset('./data/test_set.csv', transform=transforms.ToTensor())

trainloader = DataLoader(trainset, batch_size=batchsize, shuffle=True)
testloader = DataLoader(testset, batch_size=batchsize, shuffle=True)

# --- Model input/output dimensions -------------------------------------------
input_channels = 1  # in_channels of the replacement first conv layer
# presumably the (rows, feature columns) shape of one sample — TODO confirm
custom_input_height, custom_input_width = 100, 97
num_classes = 100   # width of the model's final linear layer


In [12]:
def _precision_at_k(scores, targets, k):
    """Return the fraction of the k highest-scored slots whose target equals 1.

    ``scores`` and ``targets`` are treated as ``(batch, n_slots)``; a 1-D input is
    promoted to a batch of one. The result is averaged over all samples in the
    batch, so for a batch of one it equals ``hits / k``.
    """
    scores = scores.detach()
    scores = scores.reshape(-1, scores.shape[-1])
    targets = targets.reshape(-1, targets.shape[-1])
    _, top_idx = scores.topk(k, dim=1)
    hits = targets.gather(1, top_idx) == 1
    # Integer hit count divided in Python: exact, no float32 rounding of the mean.
    return hits.sum().item() / hits.numel()


# Pretrained ResNet-50 with the first conv replaced to accept `input_channels`
# channels and the classifier replaced by a `num_classes`-wide sigmoid head.
model = resnet50(pretrained=True)
model.conv1 = nn.Conv2d(input_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, num_classes),
    nn.Sigmoid()
)

top_n = 10  # k used for the reported top-k hit rate ('acc')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# BCELoss expects probabilities, which the Sigmoid at the end of model.fc provides.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
num_epochs = 3
train_loss_history = []
train_acc_history = []
val_loss_history = []
val_acc_history = []

for epoch in range(num_epochs):
    # (A commented-out experiment that unfroze layer4/layer3 and lowered the Adam
    # learning rate at fixed epochs used to sit here; it was dead code.)

    # ---- training pass ----
    model.train()
    running_loss_train = 0.0
    running_acc_train = 0.0
    with tqdm(total=len(trainloader), desc="Train", unit="batch") as pbar:
        for n_batch, (images, labels) in enumerate(trainloader):
            # Add the channel dimension the conv stem expects: (B, H, W) -> (B, 1, H, W).
            images = torch.unsqueeze(images, 1)
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss_train += loss.item()
            # Works for any batch size; the previous squeeze()/argsort code was
            # only valid when each batch held exactly one sample.
            running_acc_train += _precision_at_k(outputs, labels, top_n)
            pbar.set_postfix({'loss': loss.item(), 'acc': 100. * running_acc_train / (n_batch + 1), 'epoch': epoch})
            pbar.update()
        train_loss_history.append(running_loss_train / len(trainloader))
        train_acc_history.append(100. * running_acc_train / len(trainloader))

    # ---- evaluation pass (no gradients, eval-mode layers) ----
    with torch.no_grad():
        model.eval()
        running_loss_val = 0.0
        running_acc_val = 0.0
        with tqdm(total=len(testloader), desc="Test", unit="batch") as pbar:
            for n1_batch, (images, labels) in enumerate(testloader):
                images = torch.unsqueeze(images, 1)
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)

                running_loss_val += loss.item()
                running_acc_val += _precision_at_k(outputs, labels, top_n)
                pbar.set_postfix({'loss': loss.item(), 'acc': 100. * running_acc_val / (n1_batch + 1), 'epoch': epoch})
                pbar.update()
            val_loss_history.append(running_loss_val / len(testloader))
            val_acc_history.append(100. * running_acc_val / len(testloader))


Train: 100%|███████████████████████████████████████| 330/330 [00:25<00:00, 12.69batch/s, loss=0.513, acc=98.4, epoch=0]
Test: 100%|██████████████████████████████████████████| 83/83 [00:01<00:00, 57.96batch/s, loss=0.499, acc=98.2, epoch=0]
Train: 100%|███████████████████████████████████████| 330/330 [00:25<00:00, 12.89batch/s, loss=0.378, acc=99.1, epoch=1]
Test: 100%|██████████████████████████████████████████| 83/83 [00:01<00:00, 52.60batch/s, loss=0.467, acc=98.9, epoch=1]
Train: 100%|███████████████████████████████████████| 330/330 [00:25<00:00, 12.81batch/s, loss=0.294, acc=99.1, epoch=2]
Test: 100%|██████████████████████████████████████████| 83/83 [00:01<00:00, 54.49batch/s, loss=0.401, acc=98.9, epoch=2]
