## Preprocessing Steps

* Build vocabulary files for the user and item fields; each text line has the format: field  value  idx
* Filter the rating data down to its multicore (n-core) subset
* Hold out train/validation/test splits of the filtered data

In [1]:
import os
import sys
import inspect

# Put the parent of the notebook directory on sys.path so that the project
# modules imported in later cells (`utils`, `data.preprocess`) can be resolved.
#
# The kernel's working directory is used as the notebook directory.
# `inspect.getfile(inspect.currentframe())` is not reliable inside a notebook:
# recent ipykernel versions compile cells under a temporary path
# (e.g. /tmp/ipykernel_<pid>/...), which would put the wrong parent on sys.path.
# On older kernels the pseudo file name resolved against the working directory
# anyway, so the result there is unchanged.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

# Roots of the raw datasets and of the processed outputs.
# Both end with "/" because later cells build paths by string concatenation.
DATA_ROOT = "/home/sl1471/public/"
PROCESSED_DATA_ROOT = "/home/sl1471/workspace/experiments/"
data_path = DATA_ROOT + "ml-1m/"
target_path = PROCESSED_DATA_ROOT + "ml-1m/"

In [2]:
import pandas as pd

# The .dat files use the two-character separator "::". pandas' C parser only
# handles single-character separators, so the Python engine is requested
# explicitly instead of relying on the implicit fallback, which emits a
# ParserWarning on every call.
# Zip codes are pinned to `str` because a later cell slices their first two
# characters; this keeps that step independent of dtype inference.
user_meta = pd.read_table(data_path + "users.dat", sep="::", engine="python",
                          names=["UserID","Gender","Age","Occupation","Zip-code"],
                          dtype={"Zip-code": str})
item_meta = pd.read_table(data_path + "movies.dat", sep="::", engine="python",
                          names=["ItemID","Title","Genre"])
rating_data = pd.read_table(data_path + "ratings.dat", sep="::", engine="python",
                            names=["UserID","ItemID","Response","Timestamp"])

  return func(*args, **kwargs)


### Item Features

In [3]:
# Preview the first three rows of the raw item metadata.
item_meta.head(3)

Unnamed: 0,ItemID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
# Release year: characters -5..-2 of the title, e.g. "Toy Story (1995)" -> "1995".
year = item_meta["Title"].str[-5:-1].tolist()
# Genre labels: strip "'s" and "-", and use "," instead of "|" as the separator.
genre = (
    item_meta["Genre"]
    .str.replace("'s", "", regex=False)
    .str.replace("-", "", regex=False)
    .str.replace("|", ",", regex=False)
    .tolist()
)
# Replace Title/Genre with the derived columns; final order is ItemID, Year, Genre.
item_meta = item_meta.drop(columns=["Title", "Genre"])
item_meta.insert(1, "Year", year)
item_meta.insert(2, "Genre", genre)
item_meta.head(3)

Unnamed: 0,ItemID,Year,Genre
0,1,1995,"Animation,Children,Comedy"
1,2,1995,"Adventure,Children,Fantasy"
2,3,1995,"Comedy,Romance"


In [5]:
# Check whether item ID 92 occurs among the items that received ratings.
rated_item_ids = rating_data["ItemID"].unique()
S = list(rated_item_ids)
92 in S

True

In [6]:
from utils import setup_path

# Write the processed item metadata as a tab-separated file without the index.
save_path = f"{target_path}meta_data/item.meta"
setup_path(save_path, is_dir=False)
item_meta.to_csv(save_path, index=False, sep="\t")

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/ml-1m" existed
dir "/home/sl1471/workspace/experiments/ml-1m/meta_data" existed


In [7]:
from data.preprocess import build_vocab
from utils import setup_path

# Build the vocabulary file covering every item field.
item_vocab_fields = ["ItemID", "Year", "Genre"]
save_path = f"{target_path}meta_data/item_fields.vocab"
setup_path(save_path, is_dir=False)
build_vocab(item_meta, save_path, item_vocab_fields)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/ml-1m" existed
dir "/home/sl1471/workspace/experiments/ml-1m/meta_data" existed
Vocab file saved to: /home/sl1471/workspace/experiments/ml-1m/meta_data/item_fields.vocab


In [8]:
import pandas as pd

# One record per item field: (field_name, field_type, value_type, field_enc).
item_field_specs = [
    ("ItemID", "nominal", "int", "v2id"),
    ("Year", "ordinal", "int", "v2id"),
    ("Genre", "nominal", "str", "v2onehot"),
]
item_fields_meta = pd.DataFrame(
    item_field_specs,
    columns=["field_name", "field_type", "value_type", "field_enc"],
)
# Stored next to the item vocabulary as a tab-separated file without the index.
item_fields_meta.to_csv(f"{target_path}meta_data/item_fields.meta",
                        index=False, sep="\t")

### User Features

In [9]:
# Preview the first three rows of the raw user metadata.
user_meta.head(3)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


In [10]:
# Coarsen each zip code to its first two characters.
area = user_meta["Zip-code"].str[:2].tolist()
# Swap the full zip code for the coarse "ZipArea" column, placed at position 4.
user_meta = user_meta.drop(columns=["Zip-code"])
user_meta.insert(4, "ZipArea", area)
user_meta.head(3)

Unnamed: 0,UserID,Gender,Age,Occupation,ZipArea
0,1,F,1,10,48
1,2,M,56,16,70
2,3,M,25,15,55


In [11]:
# Number of users in the metadata vs. number of distinct users in the ratings.
len(user_meta), rating_data["UserID"].nunique(dropna=False)

(6040, 6040)

In [12]:
from utils import setup_path

# Write the processed user metadata as a tab-separated file without the index.
save_path = f"{target_path}meta_data/user.meta"
setup_path(save_path, is_dir=False)
user_meta.to_csv(save_path, index=False, sep="\t")

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/ml-1m" existed
dir "/home/sl1471/workspace/experiments/ml-1m/meta_data" existed


In [13]:
from data.preprocess import build_vocab
from utils import setup_path

# Build the vocabulary file covering every user field.
user_vocab_fields = ["UserID", "Gender", "Age", "Occupation", "ZipArea"]
save_path = f"{target_path}meta_data/user_fields.vocab"
setup_path(save_path, is_dir=False)
build_vocab(user_meta, save_path, user_vocab_fields)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/ml-1m" existed
dir "/home/sl1471/workspace/experiments/ml-1m/meta_data" existed
Vocab file saved to: /home/sl1471/workspace/experiments/ml-1m/meta_data/user_fields.vocab


In [14]:
import pandas as pd

# One record per user field: (field_name, field_type, value_type, field_enc).
user_field_specs = [
    ("UserID", "nominal", "int", "v2id"),
    ("Gender", "nominal", "str", "v2id"),
    ("Age", "nominal", "int", "v2id"),
    ("Occupation", "nominal", "int", "v2id"),
    ("ZipArea", "nominal", "int", "v2id"),
]
user_fields_meta = pd.DataFrame(
    user_field_specs,
    columns=["field_name", "field_type", "value_type", "field_enc"],
)
# Stored next to the user vocabulary as a tab-separated file without the index.
user_fields_meta.to_csv(f"{target_path}meta_data/user_fields.meta", index=False, sep="\t")

### Rating Data

In [15]:
# Order the ratings by user, then by timestamp within each user.
sort_keys = ["UserID", "Timestamp"]
df = rating_data.sort_values(by=sort_keys)

In [16]:
# Basic statistics of the rating matrix: distinct users, distinct items, and
# the share of user-item pairs that have no rating.
n_users = len(df["UserID"].unique())
n_items = len(df["ItemID"].unique())
sparsity = 1.0 - len(df) / (n_users * n_items)
print(f"#user: {n_users}")
print(f"#item: {n_items}")
print(f"sparsity: {sparsity}")

#user: 6040
#item: 3706
sparsity: 0.9553163743776871


In [17]:
from data.preprocess import run_multicore

# Apply the project's multicore filter to the sorted ratings with n_core = 10.
n_core_threshold = 10
multicore_data = run_multicore(df, n_core=n_core_threshold)

Filter 10-core data.
n_core = 10
N-core is set to [5,100]
Filtering 10-core data
Iteration 1


100%|██████████| 1000209/1000209 [00:01<00:00, 618643.32it/s]


Number of removed record: 1670
Iteration 2


100%|██████████| 998539/998539 [00:01<00:00, 792820.57it/s]


Number of removed record: 0
Size change: 1000209 --> 998539


In [18]:
from utils import set_random_seed
from data.preprocess import holdout_data_sequential

# Fix the random seed before splitting so the holdout is repeatable.
set_random_seed(9)
# Ratios are passed in the same order as the returned splits (train, val, test).
split_ratio = [0.8, 0.1, 0.1]
trainset, valset, testset = holdout_data_sequential(
    multicore_data,
    holdout_type="warm",
    ratio=split_ratio,
)

Build user history


998539it [00:02, 426544.51it/s]


Holdout user histories


100%|██████████| 6040/6040 [00:00<00:00, 8564.77it/s] 


In [19]:
from utils import setup_path

# Write each split to <target>/tsv_data/<split>.tsv, tab-separated, without the index.
save_path = f"{target_path}tsv_data/"
setup_path(save_path, is_dir=True)
for split_name, split_frame in (("train", trainset), ("val", valset), ("test", testset)):
    split_frame.to_csv(f"{save_path}{split_name}.tsv", sep="\t", index=False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/ml-1m" existed
dir "/home/sl1471/workspace/experiments/ml-1m/tsv_data" existed
dir "/home/sl1471/workspace/experiments/ml-1m/tsv_data/" existed
