In [None]:
import pandas as pd

# Read the two tab-separated input files: the interaction log and the item table.
inter, item = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('ml-1m.inter', 'ml-1m.item')
)

In [None]:
# Display the interaction table read from 'ml-1m.inter'.
inter

In [None]:
# Display the item table read from 'ml-1m.item'.
item

In [None]:
# Left-join the item table onto every interaction, then drop any row that has
# a missing value in one of the listed columns (a left join leaves the item
# columns empty when an item_id has no match in `item`).
df = (
    inter
    .merge(item, how='left', on='item_id:token')
    .dropna(subset=[
        'user_id:token', 'item_id:token', 'rating:float',
        'movie_title:token_seq', 'release_year:token', 'genre:token_seq',
    ])
)
df

In [None]:
import numpy as np

# `rating` is a pair (sorted distinct rating values, number of rows per value).
rating = np.unique(df['rating:float'], return_counts=True)
rating

In [None]:
# Share of interactions rated below 4 (the threshold the later cells use to
# separate negatives from positives). The counts are selected by rating value
# rather than by position, so the result stays correct even when the data does
# not contain exactly three distinct rating levels below 4.
np.sum(rating[1][rating[0] < 4]) / np.sum(rating[1])

In [None]:
# Display the current contents of the merged frame `df`.
df

In [None]:
import pandas as pd

def get_count(tp, id):
    """Count the rows of ``tp`` for each distinct value of column ``id``.

    Args:
        tp: DataFrame to summarise.
        id: Name of the column to group by.

    Returns:
        DataFrame with two columns: the grouping column (named ``id``) and
        ``count``, the number of rows in each group.
    """
    rows_per_key = tp.groupby(id).size()
    return rows_per_key.reset_index(name='count')

def filter_triplets(tp, min_uc=5, min_sc=0):
    """Remove interactions of infrequent items and then of infrequent users.

    This is a single pass: items are filtered first and users second, with the
    user counts computed on the already item-filtered frame. Because removing
    users can lower item counts again, one call does not guarantee that every
    remaining item still has ``min_sc`` rows.

    Args:
        tp: DataFrame with 'user_id:token' and 'item_id:token' columns.
        min_uc: Minimum number of rows a user needs to be kept; values <= 0
            skip the user filter.
        min_sc: Minimum number of rows an item needs to be kept; values <= 0
            skip the item filter.

    Returns:
        Tuple ``(filtered_tp, usercount, itemcount)`` where the two count
        frames are recomputed on the filtered data via ``get_count``.
    """
    if min_sc > 0:
        per_item = get_count(tp, 'item_id:token')
        kept_items = per_item.loc[per_item['count'] >= min_sc, 'item_id:token']
        tp = tp[tp['item_id:token'].isin(kept_items)]

    if min_uc > 0:
        per_user = get_count(tp, 'user_id:token')
        kept_users = per_user.loc[per_user['count'] >= min_uc, 'user_id:token']
        tp = tp[tp['user_id:token'].isin(kept_users)]

    # Counts are taken after both filters so they describe the returned frame.
    usercount = get_count(tp, 'user_id:token')
    itemcount = get_count(tp, 'item_id:token')
    return tp, usercount, itemcount

In [None]:
# Repeatedly filter until the row count of `df` stops changing.
# Each round: (1) split `df` into ratings >= 4 and ratings < 4, (2) apply the
# 5-user / 5-item minimum to the >= 4 part alone, (3) re-attach the < 4 part
# and apply the same minimum to the combined frame.
prev_shape = -1
current_shape = df.shape[0]

while current_shape != prev_shape:
    df_pos = df[df['rating:float'] >= 4]
    df_neg = df[df['rating:float'] < 4]
    # Remember the size at the start of this round; the loop ends when a full
    # round removes no rows.
    prev_shape = current_shape
    df_pos, user_activity, item_popularity = filter_triplets(df_pos, 5, 5)
    df = pd.concat([df_pos, df_neg])
    # These counts overwrite the ones from the positive-only call above, so
    # after the loop they describe the combined frame `df`.
    df, user_activity, item_popularity = filter_triplets(df, 5, 5)
    current_shape = df.shape[0]

# `df_pos` and `df_neg` keep their values from the last round and are reused by
# later cells.
# NOTE(review): the value below is rows / (users * items), i.e. the fraction of
# filled user-item cells (a density), even though the message calls it "sparsity".
sparsity = 1. * df.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])
print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
    (df.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

In [None]:
# Recompute the rating histogram on the filtered frame and report the share of
# interactions rated below 4. Counts are picked by rating value instead of by
# position, so the figure stays correct if a rating level is absent from `df`.
rating = np.unique(df['rating:float'].values, return_counts=True)
np.sum(rating[1][rating[0] < 4]) / np.sum(rating[1])

In [None]:
# Sorted distinct user and item ids that occur in the positive frame `df_pos`.
exist_user = np.unique(df_pos['user_id:token'])
exist_item = np.unique(df_pos['item_id:token'])

exist_user, len(exist_user), exist_item, len(exist_item)

In [None]:
# Display the rating < 4 rows (`df_neg`) as left by the filtering loop.
df_neg

In [None]:
# Keep a negative row only if both its user and its item are among the ids
# collected in `exist_user` / `exist_item`.
df_neg = df_neg[
    df_neg['user_id:token'].isin(exist_user)
    & df_neg['item_id:token'].isin(exist_item)
]
df_neg

In [None]:
# Display the rating >= 4 rows (`df_pos`) as left by the filtering loop.
df_pos

In [None]:
import sys

def split_train_val_test(data, val_prop=0.1, test_prop=0.2, min_items=5, seed=2024):
    """Randomly split each user's rows into train / validation / test parts.

    Rows are grouped by 'user_id:token'. For a user with at least
    ``min_items`` rows, ``int(test_prop * n)`` randomly chosen rows go to test,
    ``int(val_prop * n)`` further rows go to validation and the rest go to
    train. Because the sizes are truncated, small groups may contribute no
    validation (or test) rows. Users with fewer than ``min_items`` rows are
    placed entirely in train. Within each part, a user's rows keep their
    original relative order.

    Args:
        data: DataFrame with a 'user_id:token' column.
        val_prop: Fraction of each eligible user's rows used for validation.
        test_prop: Fraction of each eligible user's rows used for test.
        min_items: Minimum number of rows a user needs before being split.
        seed: Value passed to ``np.random.seed``. Note that this reseeds the
            global NumPy random state on every call.

    Returns:
        Tuple ``(data_tr, data_val, data_te)`` of DataFrames. A part that
        received no rows at all is returned as an empty frame with the same
        columns as ``data`` instead of raising.
    """
    np.random.seed(seed)

    tr_list, val_list, te_list = [], [], []

    for i, (_, group) in enumerate(data.groupby('user_id:token')):
        n_items_u = len(group)

        if n_items_u >= min_items:
            order = np.random.permutation(n_items_u)
            test_size = int(test_prop * n_items_u)
            val_size = int(val_prop * n_items_u)

            # Boolean masks (rather than positional takes) keep the rows of
            # each part in their original order.
            test_mask = np.zeros(n_items_u, dtype=bool)
            test_mask[order[:test_size]] = True
            val_mask = np.zeros(n_items_u, dtype=bool)
            val_mask[order[test_size:test_size + val_size]] = True
            # Whatever is neither test nor validation is training data.
            train_mask = ~(test_mask | val_mask)

            tr_list.append(group[train_mask])
            val_list.append(group[val_mask])
            te_list.append(group[test_mask])
        else:
            tr_list.append(group)

        if i % 100 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    print('split done!')

    def _concat_or_empty(parts):
        # pd.concat raises ValueError on an empty list, which happens when no
        # user reached `min_items` (or `data` itself has no rows).
        return pd.concat(parts) if parts else data.iloc[0:0]

    data_tr = _concat_or_empty(tr_list)
    data_val = _concat_or_empty(val_list)
    data_te = _concat_or_empty(te_list)

    return data_tr, data_val, data_te


# Split only the positive (rating >= 4) interactions; the validation and test
# proportions are left at the function's defaults.
tr, va, te = split_train_val_test(df_pos)

In [None]:
# Display the training part of the split.
tr

In [None]:
# Display the validation part of the split.
va

In [None]:
# Display the test part of the split.
te

In [None]:
# Fraction of the split rows that ended up in train, validation and test.
tuple(len(part) / (len(tr) + len(va) + len(te)) for part in (tr, va, te))

In [None]:
# Number of distinct users and distinct items in the training part.
np.unique(tr['user_id:token']).size, np.unique(tr['item_id:token']).size

In [None]:
# Number of distinct users and distinct items in the validation part.
np.unique(va['user_id:token']).size, np.unique(va['item_id:token']).size

In [None]:
# Number of distinct users and distinct items in the test part.
np.unique(te['user_id:token']).size, np.unique(te['item_id:token']).size

In [None]:
# Validation rows whose item never occurs in the training part.
# Items that are absent from `tr` must not appear in `va` or `te`, so any row
# shown here is a problem.
va.loc[~va['item_id:token'].isin(tr['item_id:token'])]

In [None]:
# Test rows whose item never occurs in the training part.
# Items that are absent from `tr` must not appear in `va` or `te`, so any row
# shown here is a problem.
te.loc[~te['item_id:token'].isin(tr['item_id:token'])]

In [None]:
# Display the training part again before the negatives are appended.
tr

In [None]:
# Display the negative (rating < 4) rows that will be appended to training.
df_neg

In [None]:
# Share of negative rows in the combined (train positives + negatives) set.
df_neg.shape[0] / (tr.shape[0] + df_neg.shape[0])

In [None]:
# Stack the training positives on top of the negative rows and renumber the
# index from 0.
tr_plus_neg = pd.concat(objs=[tr, df_neg], axis=0, ignore_index=True)
tr_plus_neg

In [None]:
# Write the training set (positives plus negatives) to CSV without the index column.
tr_plus_neg.to_csv('ML-1M_train_original.csv', index=False)

In [None]:
# Write the validation part to CSV without the index column.
va.to_csv('ML-1M_validation.csv', index=False)

In [None]:
# Write the test part to CSV without the index column.
te.to_csv('ML-1M_test.csv', index=False)

In [None]:
# Read the three exported CSV files back in.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ML-1M_train_original.csv',
        'ML-1M_validation.csv',
        'ML-1M_test.csv',
    )
)

In [None]:
# Rebind `df` to the three reloaded parts stacked together. The index is not
# reset, so each part keeps the row labels it had when read from its own CSV.
df = pd.concat((train, valid, test), axis=0)
df

In [None]:
# u: distinct users, i: distinct items, e: total rows in the recombined frame.
u = np.unique(df['user_id:token']).size
i = np.unique(df['item_id:token']).size
e = df.shape[0]

u, i, e

In [None]:
# Rows divided by (users * items): the fraction of user-item pairs that have a row.
print('%.4f' % (e / (u * i)))

In [None]:
# Split the recombined frame at rating 4 and count each side
# (p: rows rated >= 4, n: rows rated < 4).
neg = df[df['rating:float'] < 4]
pos = df[df['rating:float'] >= 4]

p, n = len(pos), len(neg)

p, n

In [None]:
# Positive share and negative share of all rows, printed as "pos:neg".
print('%.2f:%.2f' %(p/(p+n), n/(p+n)))

In [None]:
# Number of negative rows per positive row, printed as "1:x".
print('1:%.2f' %(n / p))