In [1]:
import os
import numpy as np
import pandas as pd
import tensorly as tl
from tensorly.decomposition import parafac
from tensorly.decomposition import robust_pca
import datetime
import csv
import glob
import tensortools as tt
from tensortools.operations import unfold as tt_unfold, khatri_rao

Using numpy backend.


In [3]:
'''
Step 1 - Set user_id
'''

# Collect every text file in the data folder. glob returns paths in an
# arbitrary, OS-dependent order, so sort them to make the user_id assigned
# to each file reproducible from run to run.
files = sorted(glob.glob("data/*.txt"))
if not files:
    raise FileNotFoundError("No input files found matching data/*.txt")

# Read each file, tag its rows with a 1-based user id, and remember the
# (file name, user id) pairing so ids can be mapped back to files later.
users_dict = []
frames = []
for idx, file in enumerate(files):
    user_id = idx + 1
    # First six characters of the file's base name (same text as file[5:11]
    # for paths of the form "data/<name>").
    file_name = os.path.basename(file)[:6]
    users_dict.append([file_name, user_id])
    df_file = pd.read_csv(file, sep='\t', header=None)
    df_file["userID"] = user_id
    frames.append(df_file)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the whole frame on every iteration when used inside a loop.
df = pd.concat(frames, ignore_index=True)
df.columns = ['s_vertex', 'e_vertex', 'is_weekend', 'time', 'freq', 'user_id']

# Save the users_dict
with open('users_dict.csv', 'w', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerows(users_dict)

df.head(5)

Unnamed: 0,s_vertex,e_vertex,is_weekend,time,freq,user_id
0,203492,217680,0,0,1,1
1,217680,217681,0,0,1,1
2,217681,217682,0,0,1,1
3,217682,196445,0,0,1,1
4,196445,196446,0,0,1,1


In [4]:
'''
Step 2 - Set a time_id for each ['is_weekend', 'time'] pair
'''

# Lookup table with one row per distinct (is_weekend, time) combination,
# kept in order of first appearance in df.
time_dict = (
    df[['is_weekend', 'time']]
    .drop_duplicates(subset=['is_weekend', 'time'], keep="first")
    .reset_index(drop=True)
)

# time_id is the 1-based position of the pair in the lookup table
indices = list(range(1, len(time_dict) + 1))
time_dict['time_id'] = indices

# Plain list-of-rows form of the lookup table, for the csv writer
time_dict_w = time_dict.values.tolist()

# Save the time lookup table
with open('time_dict.csv', 'w', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerows(time_dict_w)

# Attach time_id to every row of df and keep only the id-based columns
# (selecting the columns also discards 'is_weekend' and 'time')
kept_columns = ['user_id', 's_vertex', 'e_vertex', 'time_id', 'freq']
df = df.merge(time_dict, on=['is_weekend', 'time'], how='inner')[kept_columns]

df.head(5)

Unnamed: 0,user_id,s_vertex,e_vertex,time_id,freq
0,1,203492,217680,1,1
1,1,217680,217681,1,1
2,1,217681,217682,1,1
3,1,217682,196445,1,1
4,1,196445,196446,1,1


In [5]:
'''
Step 3 - Set a segment_id for each ['s_vertex', 'e_vertex'] pair
'''

# Lookup table with one row per distinct (s_vertex, e_vertex) combination,
# kept in order of first appearance in df.
segment_dict = (
    df[['s_vertex', 'e_vertex']]
    .drop_duplicates(subset=['s_vertex', 'e_vertex'], keep="first")
    .reset_index(drop=True)
)

# segment_id is the 1-based position of the pair in the lookup table
indices = list(range(1, len(segment_dict) + 1))
segment_dict['segment_id'] = indices

# Plain list-of-rows form of the lookup table, for the csv writer
segment_dict_w = segment_dict.values.tolist()

# Save the segment lookup table
with open('segment_dict.csv', 'w', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerows(segment_dict_w)

# Attach segment_id to every row of df and keep only the id-based columns
# (selecting the columns also discards 's_vertex' and 'e_vertex')
kept_columns = ['user_id', 'segment_id', 'time_id', 'freq']
df = df.merge(segment_dict, on=['s_vertex', 'e_vertex'], how='inner')[kept_columns]
df.head(5)

Unnamed: 0,user_id,segment_id,time_id,freq
0,1,1,1,1
1,24,1,1,1
2,20,1,3,1
3,17,1,4,1
4,27,1,4,1


In [39]:
# Tensor dimensions: one slot per input file (user), per distinct segment
# pair, and per distinct (is_weekend, time) pair.
num_users, num_segments, num_time = len(files), len(segment_dict), len(time_dict)

In [40]:
# Allocate the user x segment x time tensor, initially all zeros
tensor_shape = (num_users, num_segments, num_time)
tensor = np.zeros(tensor_shape)

In [50]:
# Fill the tensor with the observed frequencies.
# The ids stored in df are 1-based, while the tensor axes are 0-based.
#
# Done as one vectorized assignment instead of a row-by-row iterrows loop:
# it is far faster on large frames and avoids iterrows coercing every row to
# a single dtype (which would turn the ids into floats if freq were float).
#
# NumPy gives no ordering guarantee when the same cell is assigned more than
# once in a single indexed assignment, so keep only the last row for each
# (user, segment, time) cell first; this preserves the "last write wins"
# result of writing the rows one by one.
cells = df.drop_duplicates(subset=["user_id", "segment_id", "time_id"], keep="last")
x = cells["user_id"].values - 1
y = cells["segment_id"].values - 1
z = cells["time_id"].values - 1
tensor[x, y, z] = cells["freq"].values

In [8]:
# Create a mask with the same user x segment x time shape as the data tensor.
# The time dimension is `num_time` (the name defined with the other tensor
# sizes); `num_timeInterval` is not defined anywhere in this notebook.
mask = np.zeros((num_users, num_segments, num_time))

In [9]:
# Mark every (user, segment, time) cell that has an observation in df.
# The ids stored in df are 1-based, while the mask axes are 0-based.
# A single vectorized assignment replaces the row-by-row loop; repeated cells
# are harmless here because every write stores the same value.
x = df["user_id"].values - 1
y = df["segment_id"].values - 1
z = df["time_id"].values - 1
mask[x, y, z] = 1

In [10]:
# Using tensor decomposition lib
# Everything between `start` and `end` is included in the reported run time.
start = datetime.datetime.now()

rank = 2

X = tl.tensor(tensor, dtype=np.int32)
# Perturb the data with small Gaussian noise and clip at zero so the input
# stays non-negative. The noise shape is taken from X itself so it always
# matches the tensor (the previous `num_timeInterval` name is undefined).
# NOTE(review): no random seed is set, so Xn and the fit differ between runs.
Xn = np.maximum(0, X + .1*np.random.randn(*X.shape))

# Use the builtin `bool`: the `np.bool` alias was removed in NumPy 1.24.
M = tl.tensor(mask, dtype=bool)

U = tt.mncp_hals(Xn, rank=rank, mask=M, verbose=False, max_iter =30)
factors = U.factors.factors

end = datetime.datetime.now()

In [11]:
# Get the prediction tensor by recombining the fitted factors
# (`factors` is taken from the decomposition result, U.factors.factors).
# NOTE(review): despite its name, pred_matrix is indexed with three indices
# later on, i.e. it is used as a 3-way tensor rather than a 2-D matrix.
pred_matrix = tl.kruskal_to_tensor(factors)

In [12]:
# Calculate the pred_error and other param
# pred_err is the mean absolute difference between the noisy input Xn and the
# reconstruction pred_matrix, taken over the cells listed in df (one term per
# row of df). sqrt(square(d)) is just |d|, so use np.abs directly, and read
# all cells in one vectorized lookup instead of a row-by-row loop.
if len(df.index) == 0:
    raise ValueError("df has no rows; cannot compute a prediction error")

x = df["user_id"].values - 1      # ids in df are 1-based, arrays are 0-based
y = df["segment_id"].values - 1
z = df["time_id"].values - 1
abs_err = np.abs(Xn[x, y, z] - pred_matrix[x, y, z])

pred_err = abs_err.sum() / len(df.index)
# Wall-clock duration of the decomposition cell, in seconds
time = (end - start).total_seconds()

In [13]:
# Store this run's summary as one row, in this order:
# tensor sizes (users, segments, time slots), the rank used for the
# decomposition, the prediction error, and the elapsed time in seconds.
result = [num_users, num_segments, num_time, rank, pred_err, time]

In [14]:
# Column names for the output file, one per entry of `result`
# (num_users, num_segments, num_time, rank, pred_err, time).
# `result` carries no iteration count, so the header must not list an
# "iteration" column; with it, the error and time values were written under
# the wrong column names.
header = ["user", "segment", "interval", "rank", "error", "time"]

# Header row first, then this run's result row
results = [header, result]

In [15]:
# Write the experiment results into the output file.
# The file is opened in append mode so results from earlier runs are kept.
# The header row (results[0]) is written only when the file is new or empty;
# otherwise every run would append another copy of it between the data rows.
output_path = 'output.csv'
write_header = (not os.path.exists(output_path)) or os.path.getsize(output_path) == 0

# newline='' lets the csv module control line endings (as the other CSV
# writers in this notebook do); without it, blank lines appear between rows
# on Windows.
with open(output_path, 'a', newline='') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
    wr.writerows(results if write_header else results[1:])