In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
df = pd.read_csv('train.csv', names=['userID', 'itemID', 'rating'], low_memory=False)
print(f'shape: {df.shape}')
df[:5]
df.drop(0, inplace=True)

shape: (1254442, 3)


In [3]:
df[:5]

Unnamed: 0,userID,itemID,rating
1,114341,9124,5.0
2,114341,32109,4.0
3,114341,44195,5.0
4,114341,24427,5.0
5,114341,10994,5.0


In [4]:
df.describe()

Unnamed: 0,userID,itemID,rating
count,1254441,1254441,1254441.0
unique,192403,62989,5.0
top,19883,11643,5.0
freq,345,3688,749623.0


In [5]:
def split_data(data, n_sample = 1, replace = False, seed = None):
    """
    train / valid data로 split

    Args:
        data
    Returns:
        train data / valid data
    """
    users = defaultdict(list)
    ratings = defaultdict(list)
    user_train = []
    user_valid = []
    for d in data:
        users[int(d[0])].append(d[1])
        ratings[int(d[0])].append(d[2])

    random = (np.random.RandomState(seed) if seed is not None else np.random)
    for user in users:
        item, rating = users[user], ratings[user]
        n_valid_sample = max(int(np.round(len(item) * n_sample)) if isinstance(n_sample, float) else min(n_sample, len(item)), 1)
        n_valid_sample = (n_valid_sample - 1) if (len(item) == n_valid_sample) else n_valid_sample
        valid_indices = random.choice(len(item), size = n_valid_sample, replace = replace).tolist()
        valid = [item[i] for i in valid_indices]
        train = list(set(item) - set(valid))
        train_indices = [idx for idx, i in enumerate(item) if i in train]

        for i, r in zip(train, [rating[i] for i in train_indices]):
            user_train.append([user, i, r])

        for i, r in zip(valid, [rating[i] for i in valid_indices]):
            user_valid.append([user, i, r])
    user_train = np.array(user_train, dtype = data.dtype)
    user_valid = np.array(user_valid, dtype = data.dtype)
    return user_train, user_valid

In [6]:
user_train, user_valid = split_data(df.to_numpy())

In [7]:
df_train = pd.DataFrame(user_train)
df_valid = pd.DataFrame(user_valid)

In [8]:
pd.unique(df_train[0]).shape, pd.unique(df_valid[0]).shape, 

((192403,), (192403,))

In [9]:
pd.unique(df_train[1]).shape, pd.unique(df_valid[1]).shape, 

((62907,), (46045,))

In [10]:
df_train.describe()

Unnamed: 0,0,1,2
count,1062038,1062038,1062038.0
unique,192403,62907,5.0
top,19883,11643,5.0
freq,344,3130,634552.0


In [11]:
df_valid.describe()

Unnamed: 0,0,1,2
count,192403,192403,192403.0
unique,192403,46045,5.0
top,114341,11643,5.0
freq,1,558,115071.0


In [13]:
df_train.head()

Unnamed: 0,0,1,2
0,114341,44195,5.0
1,114341,9124,4.0
2,114341,10994,5.0
3,114341,32109,5.0
4,88818,44195,3.0


In [18]:
df_train.columns = ['userID', 'itemID', 'rating']
df_valid.columns = ['userID', 'itemID', 'rating']

In [19]:
df_train['x_label'] = 0

In [24]:
df_valid['x_label'] = 1

In [26]:
import copy

In [27]:
df_test = copy.deepcopy(df_valid)
df_test['x_label'] = 2

In [29]:
df_test.head()

Unnamed: 0,userID,itemID,rating,x_label
0,114341,24427,5.0,2
1,88818,14862,1.0,2
2,85622,44383,5.0,2
3,130855,47833,5.0,2
4,40088,24087,1.0,2


In [31]:
inha = pd.concat([df_train, df_valid, df_test])

In [32]:
inha.to_csv('new_inha.inter', sep='\t', index=False)