In [1]:
from sklearn.utils import shuffle

def read_dat(file_path):
    """Read a tab-separated text file into a list of rows.

    Args:
        file_path: Path of the file to read (opened in text mode).

    Returns:
        A list with one entry per line of the file; each entry is the list
        of tab-separated fields of that line, with newline characters removed.
    """
    # The context manager guarantees the handle is closed, even if reading
    # fails part-way through (the bare open() in a for-loop leaked it).
    with open(file_path, 'r') as dat_file:
        return [raw.replace('\n', '').split('\t') for raw in dat_file]

In [2]:
def save_txt(data, file_name):
    """Write (user, item) pairs as one adjacency line per user.

    The output file is ``lightgcn/data/<file_name>.txt``. Each line holds a
    user id followed by every item id paired with that user, separated by
    single spaces. Users appear in first-seen order and items in the order
    they occur in ``data``.

    Args:
        data: Iterable of 2-element (user, item) rows. Ids are converted with
            ``str()`` before writing, so strings and integers are both accepted.
        file_name: Output name without extension, relative to
            ``lightgcn/data/``; may contain sub-directories (e.g. "yelp/train").
    """
    # Local import: the notebook has no top-level `os` import to rely on.
    import os

    items_by_user = {}
    for u, i in data:
        items_by_user.setdefault(u, []).append(i)

    out_path = "lightgcn/data/{}.txt".format(file_name)
    # Nested targets such as "yelp/train" need their directory to exist first.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # `with` closes the file even when a write raises.
    with open(out_path, "w") as textfile:
        for user, items in items_by_user.items():
            textfile.write(' '.join(map(str, [user, *items])) + "\n")

## MovieLens

In [29]:
import numpy as np

# Load the MovieLens interactions and keep only the first two columns of each
# row. The first 80000 rows (in file order, no shuffling) become the training
# split; everything after that is the test split.
data = np.array(read_dat('../data/Movielens/user_movie.dat'))
training_data = data[:80000, :2]
testing_data = data[80000:, :2]
print(training_data.shape, testing_data.shape)

(80000, 2) (20000, 2)


In [31]:
# Persist both MovieLens splits under the names "train" and "test".
for split_name, split_rows in (("train", training_data), ("test", testing_data)):
    save_txt(split_rows, split_name)

## Yelp

In [61]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the Yelp interactions; each row is loaded as (u, i, rating).
data = read_dat('Yelp/user_business.dat')
data = np.array(data)
df = pd.DataFrame(columns=['u', 'i', 'rating'], data=data)
print('data count: {}'.format(len(df)))

# Drop every user that has 3 or fewer interactions.
# A boolean mask is used instead of `df['u'].replace(..., inplace=True)`:
# that form mutates a temporary under pandas Copy-on-Write and would leave
# `df` unfiltered.
value_counts = df['u'].value_counts()
to_remove = value_counts[value_counts <= 3].index
df = df[~df['u'].isin(to_remove)].copy()
print('data count: {}'.format(len(df)))

# Re-index users and items to contiguous integer ids starting at 0.
userencoder = LabelEncoder()
itemencoder = LabelEncoder()
df['u'] = userencoder.fit_transform(df['u'])
df['i'] = itemencoder.fit_transform(df['i'])
print("max/min user number: {}/{}\nmax/min item number: {}/{}".format(df['u'].max(), df['u'].min(), df['i'].max(), df['i'].min()))
print("distint user number: {}\ndistint item number: {}".format(df['u'].nunique(),  df['i'].nunique()))
# save_txt joins ids as text, so convert them back to strings.
df['u'] = df['u'].astype(str)
df['i'] = df['i'].astype(str)

# Shuffle with a fixed seed so the 80/20 split is reproducible across runs,
# then keep only the (u, i) columns of each split.
data = df.to_numpy()
data = shuffle(data, random_state=42)
cut = int(len(data) * 0.8)
training_data, testing_data = data[:cut, :2], data[cut:, :2]
print(training_data.shape, testing_data.shape)

data count: 198397
data count: 184835
max/min user number: 7325/0
max/min item number: 14126/0
distint user number: 7326
distint item number: 14127
(147868, 2) (36967, 2)


In [62]:
# Report the id range and the number of distinct users/items that actually
# occur in the training split.
# dict.fromkeys de-duplicates in first-seen order in linear time; the previous
# `x not in list` membership test was quadratic in the number of distinct ids.
dis_user_list = list(dict.fromkeys(int(u) for u, _ in training_data))
dis_item_list = list(dict.fromkeys(int(i) for _, i in training_data))

# Take min/max from the observed ids instead of the sentinels 0 and 999, which
# misreport whenever the smallest id exceeds 999 (or every id is negative).
# An empty split yields None rather than a made-up bound.
max_user, min_user = max(dis_user_list, default=None), min(dis_user_list, default=None)
max_item, min_item = max(dis_item_list, default=None), min(dis_item_list, default=None)

print("training max/min user number: {}/{}\ntraining max/min item number: {}/{}".format(max_user, min_user, max_item, min_item))
print("training distint user number: {}\ntraining distint item number: {}".format(len(dis_user_list), len(dis_item_list)))

training max/min user number: 7325/0
training max/min item number: 14126/0
training distint user number: 7326
training distint item number: 13552


In [63]:
# Persist both Yelp splits under "yelp/train" and "yelp/test".
for split_name, split_rows in (("yelp/train", training_data), ("yelp/test", testing_data)):
    save_txt(split_rows, split_name)

## Douban

In [58]:
import numpy as np

# Load the Douban interactions; each row is loaded as (u, i, rating).
data = read_dat('Douban/user_book.dat')
data = np.array(data)
df = pd.DataFrame(columns=['u', 'i', 'rating'], data=data)
print('data count: {}'.format(len(df)))

# Drop every user that has 3 or fewer interactions.
# A boolean mask is used instead of `df['u'].replace(..., inplace=True)`:
# that form mutates a temporary under pandas Copy-on-Write and would leave
# `df` unfiltered.
value_counts = df['u'].value_counts()
to_remove = value_counts[value_counts <= 3].index
df = df[~df['u'].isin(to_remove)].copy()
print('data count: {}'.format(len(df)))

# Re-index users and items to contiguous integer ids starting at 0.
userencoder = LabelEncoder()
itemencoder = LabelEncoder()
df['u'] = userencoder.fit_transform(df['u'])
df['i'] = itemencoder.fit_transform(df['i'])
print("max/min user number: {}/{}\nmax/min item number: {}/{}".format(df['u'].max(), df['u'].min(), df['i'].max(), df['i'].min()))
print("distint user number: {}\ndistint item number: {}".format(df['u'].nunique(),  df['i'].nunique()))
# save_txt joins ids as text, so convert them back to strings.
df['u'] = df['u'].astype(str)
df['i'] = df['i'].astype(str)

# Shuffle with a fixed seed so the 80/20 split is reproducible across runs,
# then keep only the (u, i) columns of each split.
data = df.to_numpy()
data = shuffle(data, random_state=42)
cut = int(len(data) * 0.8)
training_data, testing_data = data[:cut, :2], data[cut:, :2]
print(training_data.shape, testing_data.shape)

data count: 792062
data count: 788898
max/min user number: 11265/0
max/min item number: 22346/0
distint user number: 11266
distint item number: 22347
(631118, 2) (157780, 2)


In [60]:
# Report the id range and the number of distinct users/items that actually
# occur in the training split.
# dict.fromkeys de-duplicates in first-seen order in linear time; the previous
# `x not in list` membership test was quadratic in the number of distinct ids.
dis_user_list = list(dict.fromkeys(int(u) for u, _ in training_data))
dis_item_list = list(dict.fromkeys(int(i) for _, i in training_data))

# Take min/max from the observed ids instead of the sentinels 0 and 999, which
# misreport whenever the smallest id exceeds 999 (or every id is negative).
# An empty split yields None rather than a made-up bound.
max_user, min_user = max(dis_user_list, default=None), min(dis_user_list, default=None)
max_item, min_item = max(dis_item_list, default=None), min(dis_item_list, default=None)

print("training max/min user number: {}/{}\ntraining max/min item number: {}/{}".format(max_user, min_user, max_item, min_item))
print("training distint user number: {}\ntraining distint item number: {}".format(len(dis_user_list), len(dis_item_list)))

training max/min user number: 11265/0
training max/min item number: 22346/0
training distint user number: 11266
training distint item number: 22347


In [59]:
# Persist both Douban splits under "douban/train" and "douban/test".
for split_name, split_rows in (("douban/train", training_data), ("douban/test", testing_data)):
    save_txt(split_rows, split_name)