# PURE

Periodic User Representation Extraction

In [1]:
from pure import *

# Data Preparation

## Load Data & Info

In [21]:
# Both yearly exports are read with identical options: no index column and
# the two timestamp columns parsed as datetimes.
_read_options = dict(index_col=False, parse_dates=['exerciseDate', 'startTime'])
df_2017 = pd.read_csv('info/active_users.2017.csv', **_read_options)
df_2018 = pd.read_csv('info/active_users.2018.csv', **_read_options)

In [22]:
# Stack the two yearly frames into one. ignore_index=True renumbers the rows
# 0..n-1; without it the row labels of both inputs are kept as-is and the
# combined frame ends up with duplicated index labels.
df = pd.concat([df_2017, df_2018], ignore_index=True)

In [23]:
# Summarise the combined dataset: total rows and number of distinct user ids.
n_samples = len(df)
n_users = len(df['userId'].unique())
print('{} samples'.format(n_samples))
print('{} distinct users'.format(n_users))

385846 samples
1160 distinct users


In [24]:
# Load the selection table used below: it is indexed by the user id as a
# string, and each entry is tested for membership of a 'YYYY-MM'-style prefix.
select_context_path = 'info/samples.health_state_prediction.test.json'
with open(select_context_path) as context_file:
    select_context = json.load(context_file)

## Select Data

In [25]:
# A row is valid when its user appears in select_context and the first seven
# characters of its exercise date (the year-month prefix) are listed for
# that user.
df['valid'] = [
    str(user) in select_context and str(day)[:7] in select_context[str(user)]
    for user, day in zip(df['userId'], df['exerciseDate'])
]

In [26]:
# Show how many rows were flagged valid vs. not valid by the selection above.
df['valid'].value_counts()

False    381445
True       4401
Name: valid, dtype: int64

In [27]:
# Keep only the rows flagged valid. 'valid' already holds booleans, so it is
# used directly as the mask. .copy() gives an independent frame, so the column
# assignments done later (prepare_data) do not operate on a slice of the
# original and do not raise SettingWithCopyWarning.
df = df[df['valid']].copy()

## Refine Data

In [28]:
# Count missing values per column in the selected rows.
df.isnull().sum()

exerciseID         0
userId             0
exerciseClassId    0
velocity           3
duration           0
distance           0
calorie            0
startTime          0
exerciseDate       0
valid              0
dtype: int64

In [29]:
# prepare_data derives its columns from the timestamps, so drop any row that
# lacks either of them, then report how many rows remain.
required_columns = ['startTime', 'exerciseDate']
df = df.dropna(subset=required_columns)
print('{} samples'.format(len(df)))

4401 samples


In [30]:
def prepare_data(df):
    """Derive calendar and time-of-day columns for a frame of exercise records.

    Works on a copy, so the frame passed in is left untouched (the original
    implementation wrote into its argument, which raises
    SettingWithCopyWarning when that argument is a filtered slice).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'startTime' (datetime-like), 'distance' and
        'exerciseClassId' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added or replaced:
        - 'exerciseDate': 'startTime' truncated to midnight of the same day
          (for timezone-aware input the timezone is kept);
        - 'exerciseTime': 'night' (hours 0-5), 'morning' (6-11),
          'afternoon' (12-17) or 'evening' (18-23);
        - 'isExercise': constant 1;
        - 'weekday': day of week of 'exerciseDate', Monday == 0;
        - 'distance': missing values replaced by 0;
        - 'exerciseClassId': the value 11 replaced by a missing value.
    """
    time_list = ['morning', 'afternoon', 'evening', 'night']
    # Six-hour bucket index (hour // 6) -> label.
    bucket_labels = {0: time_list[3], 1: time_list[0], 2: time_list[1], 3: time_list[2]}

    df = df.copy()
    # to_datetime is a no-op for datetime columns and keeps the .dt accessor
    # usable when the column arrives with object dtype (e.g. an empty frame).
    start = pd.to_datetime(df['startTime'])

    df['exerciseDate'] = start.dt.normalize()
    df['exerciseTime'] = (start.dt.hour // 6).map(bucket_labels)
    df['isExercise'] = 1
    df['weekday'] = df['exerciseDate'].dt.weekday
    df['distance'] = df['distance'].fillna(value=0)
    # Class id 11 is treated as "no class"; the reason is not visible in this
    # notebook.
    df['exerciseClassId'] = df['exerciseClassId'].where(df['exerciseClassId'] != 11)
    return df

In [31]:
# Add the derived date, time-of-day, weekday and isExercise columns to the
# selected records.
df = prepare_data(df)

## Save Data

In [32]:
# Persist the prepared records without the row index; the testing section
# below reads this file back.
df.to_csv('health_state_prediction/saved/exercise_info.test.csv', index=False)

# Testing Process

## 0. Load Data

In [33]:
# Read the prepared records back, parsing both timestamp columns as datetimes.
exercise_records = pd.read_csv(
    'health_state_prediction/saved/exercise_info.test.csv',
    parse_dates=['exerciseDate', 'startTime'],
)

In [34]:
# Distinct user ids as a plain Python list, in order of first appearance.
user_list = exercise_records['userId'].unique().tolist()
# Display (number of users, number of records).
len(user_list), len(exercise_records)

(43, 4401)

In [35]:
# Count missing values per column after the CSV round trip.
exercise_records.isnull().sum()

exerciseID          0
userId              0
exerciseClassId    10
velocity            3
duration            0
distance            0
calorie             0
startTime           0
exerciseDate        0
valid               0
exerciseTime        0
isExercise          0
weekday             0
dtype: int64

In [36]:
# Preview the first three reloaded records.
exercise_records.head(3)

Unnamed: 0,exerciseID,userId,exerciseClassId,velocity,duration,distance,calorie,startTime,exerciseDate,valid,exerciseTime,isExercise,weekday
0,685016,28660,2.0,10.1,2893,8.101,532,2017-06-05 07:23:18,2017-06-05,True,morning,1,0
1,685435,28660,2.0,7.7,4524,9.708,802,2017-06-05 17:36:00,2017-06-05,True,afternoon,1,0
2,687036,28660,2.0,10.4,2858,8.272,481,2017-06-07 07:43:32,2017-06-07,True,morning,1,2


## 0. Load Module

In [2]:
# Load the previously saved PURE object.
# NOTE(security): unpickling executes arbitrary code embedded in the file, so
# only load PURE.pkl when it comes from a trusted source.
# NOTE(review): `pkl` is not imported in this notebook; presumably it is
# provided by `from pure import *` — confirm.
with open('health_state_prediction/saved/PURE.pkl', 'rb') as f:
    uem = pkl.load(f)

In [3]:
# Switch the loaded object to its 'test' mode.
# NOTE(review): presumably this makes the later steps reuse what was fitted
# during training rather than refit — confirm against pure's set_mode.
uem.set_mode('test')

## 1. Transaction Construction

Discretization and preprocessing for preparing exercise records

In [39]:
# Replace the raw records with the module's preprocessed version (the
# discretization/preprocessing step described in the text above).
exercise_records = uem.preprocess_exercise_records(exercise_records)

Convert exercise records into exercise transactions

In [40]:
# Build the exercise transactions for every user in user_list;
# 'health_state_prediction/data/input/' is passed as the output location.
exercise_transactions = uem.construct_exercise_transactions(
    user_list,
    exercise_records,
    output_path='health_state_prediction/data/input/',
)

Processing user 26298    : 100%|██████████| 43/43 [00:01<00:00, 39.31it/s]


In [41]:
import json

# Store the transactions as JSON so the pattern-extraction step can be rerun
# without rebuilding them.
transactions_path = 'health_state_prediction/saved/exercise_transactions.test.json'
with open(transactions_path, 'w') as out_file:
    json.dump(exercise_transactions, out_file)

## 2. Pattern Extraction

Mining Periodic Frequent Patterns using the PFPM algorithm

In [42]:
import json

# Reload the saved transactions from their JSON file.
transactions_path = 'health_state_prediction/saved/exercise_transactions.test.json'
with open(transactions_path) as in_file:
    exercise_transactions = json.load(in_file)

In [43]:
# Settings forwarded to the pattern miner through `args`.
# NOTE(review): the keys look like bounds on the period and on the average
# period of a pattern — confirm units and semantics in pure.
mining_args = {'minper': 0, 'maxper': 10, 'minavgper': 0, 'maxavgper': 7}
exercise_patterns = uem.extract_exercise_patterns(
    exercise_transactions,
    output_path='health_state_prediction/data/output/',
    file_size_limit=10 * 10**9,
    args=mining_args,
)

Processing file health_state_prediction/data/input/26298.2018-04.txt : 100%|██████████| 144/144 [00:30<00:00,  4.68it/s]


In [44]:
import json

# Store the extracted patterns as JSON so clustering can start from this file.
patterns_path = 'health_state_prediction/saved/exercise_patterns.test.json'
with open(patterns_path, 'w') as out_file:
    json.dump(exercise_patterns, out_file)

## 3. Pattern Clustering

In [4]:
import json

# Reload the saved patterns from their JSON file.
patterns_path = 'health_state_prediction/saved/exercise_patterns.test.json'
with open(patterns_path) as in_file:
    exercise_patterns = json.load(in_file)

**Step 1.** Generate pattern vectors

In [5]:
# With ret_info=True the call yields two values: an info table (indexed by
# column name in the next cell) and the pattern vectors themselves.
info, pattern_vectors = uem.prepare_pattern_vectors(exercise_patterns, ret_info=True)

Processing file health_state_prediction/data/output/26298.2018-04.txt : 100%|██████████| 144/144 [00:00<00:00, 3771.64it/s]


_Save raw exercise patterns..._

In [6]:
# Keep only the identifying columns. .copy() detaches the selection from the
# frame it was taken from, so adding columns below does not raise
# SettingWithCopyWarning.
info = info[['userID', 'time_window']].copy()
# Add one column per pattern index, holding component i of every vector.
for i in range(uem.pattern_idxer.size):
    info['pattern_{}'.format(i)] = [ptn[i] for ptn in pattern_vectors]
info.to_csv('health_state_prediction/data/exercise_patterns.test.csv', index=False)

**Step 2. & 3.** Calculate user representations

In [46]:
# Compute the user representations from the extracted patterns, then print
# the resulting table's schema (columns, non-null counts, dtypes).
result = uem.get_user_representations(exercise_patterns)
result.info()

Processing file health_state_prediction/data/output/26298.2018-04.txt : 100%|██████████| 144/144 [00:00<00:00, 4744.02it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 55 columns):
userID              142 non-null object
time_window         142 non-null object
periodicity_min     142 non-null float64
periodicity_max     142 non-null float64
periodicity_mean    142 non-null float64
cluster_0           142 non-null float64
cluster_1           142 non-null float64
cluster_2           142 non-null float64
cluster_3           142 non-null float64
cluster_4           142 non-null float64
cluster_5           142 non-null float64
cluster_6           142 non-null float64
cluster_7           142 non-null float64
cluster_8           142 non-null float64
cluster_9           142 non-null float64
cluster_10          142 non-null float64
cluster_11          142 non-null float64
cluster_12          142 non-null float64
cluster_13          142 non-null float64
cluster_14          142 non-null float64
cluster_15          142 non-null float64
cluster_16          142 non-null flo

_Save user representations..._

In [47]:
# Preview the first three rows of the representation table.
result.head(3)

Unnamed: 0,userID,time_window,periodicity_min,periodicity_max,periodicity_mean,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,...,cluster_40,cluster_41,cluster_42,cluster_43,cluster_44,cluster_45,cluster_46,cluster_47,cluster_48,cluster_49
0,28660,2017-06,1.625,5.2,2.971604,0.1,0.049412,0.067303,0.049736,0.051573,...,0.030303,0.025546,0.012927,0.022402,0.035991,0.026526,0.023148,0.024257,0.2,0.1
1,28660,2017-07,1.631579,5.166667,3.096704,0.125,0.024706,0.033651,0.024868,0.025319,...,0.015152,0.02177,0.013101,0.021541,0.021121,0.027407,0.02381,0.022322,0.25,0.125
2,28660,2017-08,1.24,6.2,3.690675,0.125,0.024706,0.033651,0.024868,0.025319,...,0.015152,0.02177,0.013101,0.021541,0.021121,0.027407,0.02381,0.022322,0.25,0.125


In [48]:
# Replace any missing values with 0, then write the table to CSV without the
# row index.
result = result.fillna(0)
result.to_csv('health_state_prediction/data/exercise_patterns.clusters_50.test.csv', index=False)