# In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import argparse


def preprocess(data_name):
  """Parse a comma-separated event file into an edge table and edge features.

  The first line of the file is treated as a header and skipped. Every
  remaining non-blank line is expected to have the layout
  ``user_id,item_id,timestamp,label,feat_0,feat_1,...``. Features must be
  numeric, so categorical variables have to be one-hot encoded beforehand.

  Blank (or whitespace-only) lines are ignored, and an empty file yields an
  empty result instead of raising ``StopIteration``.

  Args:
    data_name: Path of the CSV file to read.

  Returns:
    A tuple ``(df, feat)``. ``df`` is a DataFrame with the columns ``u``
    (user id), ``i`` (item id), ``ts`` (timestamp), ``label`` and ``idx``
    (0-based counter of the parsed rows). ``feat`` is a numpy array built
    from one float feature vector per parsed row, in the same order as
    ``df``.

  Raises:
    ValueError: If a field cannot be converted to a number.
    IndexError: If a non-blank line has fewer than four columns.
  """
  u_list, i_list, ts_list, label_list = [], [], [], []
  feat_l = []

  with open(data_name) as f:
    # Skip the header row; the default keeps an empty file from raising.
    next(f, None)
    for line in f:
      line = line.strip()
      if not line:
        # Tolerate blank lines (e.g. extra trailing newlines).
        continue
      e = line.split(',')

      u_list.append(int(e[0]))        # user id
      i_list.append(int(e[1]))        # item id
      ts_list.append(float(e[2]))     # timestamp
      label_list.append(float(e[3]))  # state label, kept as float
      # Everything from column 4 onwards is the numeric edge feature vector.
      feat_l.append(np.array([float(x) for x in e[4:]]))

  # idx counts parsed rows so it always matches the row position in feat.
  idx_list = list(range(len(u_list)))

  return pd.DataFrame({'u': u_list,
                       'i': i_list,
                       'ts': ts_list,
                       'label': label_list,
                       'idx': idx_list}), np.array(feat_l)


def reindex(df, bipartite=True):
  """Return a copy of ``df`` with ``u``, ``i`` and ``idx`` shifted to start at 1.

  When ``bipartite`` is true the item ids are first offset by
  ``df.u.max() + 1`` so that they do not overlap with the user ids; this
  path asserts that both the user ids and the item ids form a contiguous
  range. The input frame is left unmodified.

  Args:
    df: DataFrame with at least the columns ``u``, ``i`` and ``idx``.
    bipartite: Whether to move item ids past the user id range.

  Returns:
    A new DataFrame with the re-indexed columns.
  """
  shifted = df.copy()

  if bipartite:
    # Ids must be gap-free: (max - min + 1) has to equal the distinct count.
    assert (df.u.max() - df.u.min() + 1 == len(df.u.unique()))
    assert (df.i.max() - df.i.min() + 1 == len(df.i.unique()))

    # Push item ids beyond the largest user id.
    item_offset = df.u.max() + 1
    shifted.i = df.i + item_offset

  # Both modes move every id / index one up, leaving 0 unused.
  for column in ('u', 'i', 'idx'):
    shifted[column] += 1

  return shifted


def run(data_name, bipartite=True):
    """Preprocess ``./data/<data_name>.csv`` and write the derived files.

    The raw CSV is parsed with ``preprocess`` and re-indexed with
    ``reindex``. Three files are then written into ``./data/`` (the
    directory is created if it does not exist):

    * ``ml_<data_name>.csv``      - the re-indexed, time-stamped edge list
    * ``ml_<data_name>.npy``      - edge features with a zero row prepended
    * ``ml_<data_name>_node.npy`` - all-zero node feature matrix

    Args:
        data_name: Dataset name, used to build the input and output paths.
        bipartite: Forwarded to ``reindex``.
    """
    Path("data/").mkdir(parents=True, exist_ok=True)
    PATH = './data/{}.csv'.format(data_name)
    OUT_DF = './data/ml_{}.csv'.format(data_name)
    OUT_FEAT = './data/ml_{}.npy'.format(data_name)
    OUT_NODE_FEAT = './data/ml_{}_node.npy'.format(data_name)

    df, feat = preprocess(PATH)
    new_df = reindex(df, bipartite)

    # reindex() makes idx 1-based, so prepend an all-zero row to keep the
    # edge feature row numbers in step with idx.
    feat_dim = feat.shape[1]
    feat = np.vstack([np.zeros((1, feat_dim)), feat])

    # One row per node id from 0 up to the largest id in use. The node
    # feature width follows the edge feature width. These are placeholder
    # zero vectors and have to be populated with real node features
    # elsewhere if they are needed.
    max_idx = max(new_df.u.max(), new_df.i.max())
    node_feat = np.zeros((max_idx + 1, feat_dim))

    # Time-stamped edge list as CSV.
    new_df.to_csv(OUT_DF)
    # Edge features.
    np.save(OUT_FEAT, feat)
    # Node features.
    np.save(OUT_NODE_FEAT, node_feat)

# parser = argparse.ArgumentParser('Interface for TGN data preprocessing')
# parser.add_argument('--data', type=str, help='Dataset name (eg. wikipedia or reddit)',
#                         default='wikipedia')
# parser.add_argument('--bipartite', action='store_true', help='Whether the graph is bipartite')

# args = parser.parse_args()

# run(args.data, bipartite=args.bipartite)