## Load Data & Preprocessing

- MovieLens 데이터를 불러온 후 전처리하여 pickle 파일로 저장

In [1]:
# https://github.com/Parasgr7/Movie-Recommendation-System
# AutoEncoders
import numpy as np
import pandas as pd
import random

In [2]:
# MovieLens 1M file layouts ('::'-separated, no header row):
#   users.dat   -> UserID::Gender::Age::Occupation::Zip-code
#   movies.dat  -> MovieID::Title::Genres
#   ratings.dat -> UserID::MovieID::Rating::Timestamp (5-star scale)

# Parser options shared by all three files. With header=None the columns keep
# their positional integer labels (0, 1, 2, ...), which later cells index by.
dat_options = dict(sep='::', header=None, engine='python', encoding='latin-1')

movies = pd.read_csv('./ml-1m/movies.dat', **dat_options)
users = pd.read_csv('./ml-1m/users.dat', **dat_options)
ratings = pd.read_csv('./ml-1m/ratings.dat', **dat_options)

In [3]:
# user와 item의 수
num_users = len(users)
num_movies= len(movies)

# train, valid, test로 분리
train_lst = []
val_lst   = []
test_lst  = []
# 데이터를 userid별로 train, valid, test로 나누어줌
for uid in range(num_users):
    watches = ratings.loc[ratings[0] == uid] # ratings을 보고 userid와 연결시켜줌
    
    train_lst.append(watches.iloc[:int(len(watches)*0.7)])
    val_lst.append(watches.iloc[int(len(watches)*0.7):int(len(watches)*0.8)])
    test_lst.append(watches.iloc[int(len(watches)*0.8):])

In [4]:
# 나누어서 저장함
train = pd.concat(train_lst)
val   = pd.concat(val_lst)
test  = pd.concat(test_lst)
train.to_pickle('./data/ml/train.pkl')
val.to_pickle('./data/ml/val.pkl')
test.to_pickle('./data/ml/test.pkl')

In [5]:
# 전체 데이터에서 user와 movie의 수
num_users, num_movies

(6040, 3883)

In [6]:
# train, valid, test의 데이터를 합친 수가 전체와 같은지 확인
num_users  = int(max(max(train.values[:,0]), max(val.values[:,0]), max(test.values[:,0]))) + 1
num_movies = int(max(max(train.values[:,1]), max(val.values[:,1]), max(test.values[:,1]))) + 1
num_users, num_movies

(6040, 3953)

## Hit ratio / NDCG

In [7]:
train_lst = []
val_lst   = []
test_lst  = []
neg_lst  = []

In [8]:
for uid in range(1, num_users+1):
    watches = ratings.loc[ratings[0] == uid]
    
    watched = watches[1].values.tolist()
    unwatch = set(range(1, num_movies+1)) - set(watched)
    
    ns_list = random.sample(unwatch, 100)
    
    train_lst.append(watches.iloc[:-2])
    val_lst.append(watches.iloc[-2])
    test_lst.append(watches.iloc[-1])
    neg_lst.append(list(ns_list))

In [9]:
# Assemble and persist the leave-one-out splits.
# train_lst holds one DataFrame per user, so a plain row-wise concat is enough.
train = pd.concat(train_lst)
# val_lst / test_lst hold a single row (a Series) per user: join them as columns
# and transpose to get one row per user. `axis` must be passed by keyword;
# the positional form is deprecated and rejected by pandas 2.x.
val   = pd.concat(val_lst, axis=1).T
test  = pd.concat(test_lst, axis=1).T

train.to_pickle('./data/ml/train_score.pkl')
val.to_pickle('./data/ml/val_score.pkl')
test.to_pickle('./data/ml/test_score.pkl')
# Sampled negative movie IDs, one entry of neg_lst per user.
np.save('./data/ml/neg_score.npy', neg_lst)

## END