In [1]:
from os.path import join as dir_join
from os.path import exists as dir_exists
from os import makedirs, listdir
import re
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time

from utills import Candidate, Platoon
from pattern_miner import Miner

def get_trajectory_id(text):
    """Extract the trajectory id from a name of the form ``client_<id>.csv``.

    Args:
        text: File name (or any string) that contains ``client_<id>.csv``.

    Returns:
        The ``<id>`` part as a string.

    Raises:
        ValueError: If ``text`` does not contain ``client_<id>.csv``.
    """
    # Raw string with an escaped dot: the unescaped '.' used before matched any
    # character, so names such as 'client_12_csv' were accepted by mistake.
    m = re.search(r'client_(.+)\.csv', text)
    if m is None:
        raise ValueError('cannot extract trajectory id from ' + repr(text))
    return m.group(1)

TRAJ_FOLDER = 'paths'
columns = ['lat', 'long', 'datetime', 'trajectory_id']
FILE_NAME = dir_join(TRAJ_FOLDER, 'processed.csv')

# Build the merged CSV once; later runs read it straight from FILE_NAME.
if not dir_exists(FILE_NAME):
    if not dir_exists(TRAJ_FOLDER):
        raise ValueError(TRAJ_FOLDER + ' does not exist')
    # Fixed: this used the undefined name `istdir`, which raised NameError
    # whenever the cache file was missing. sorted() makes the row order of the
    # merged file independent of the order in which the OS lists the folder.
    folder_files = sorted(listdir(TRAJ_FOLDER))
    list_df = []
    for filename in folder_files:
        # Per-client files have no header row, so column names are supplied here.
        df = pd.read_csv(dir_join(TRAJ_FOLDER, filename), names=columns)
        # The raw 'datetime' values are interpreted as seconds since the epoch.
        df['datetime'] = pd.to_datetime(df['datetime'], unit='s')
        # Raises ValueError for any file that is not named client_<id>.csv.
        df['trajectory_id'] = get_trajectory_id(filename)
        list_df.append(df)
    df = pd.concat(list_df, ignore_index=True)
    df.to_csv(FILE_NAME, index=False)

# Always reload from the cache so `df` has the same dtypes on every run.
df = pd.read_csv(
    FILE_NAME,
    parse_dates=[columns.index('datetime')],
    dtype={'lat': np.float32, 'long': np.float32, 'trajectory_id': np.str_},
)
df.head()

Unnamed: 0,lat,long,datetime,trajectory_id
0,359.5,416.5,2017-12-21 13:01:42,4976
1,359.5,416.5,2017-12-21 13:01:49,4976
2,359.5,416.5,2017-12-21 13:01:54,4976
3,364.5,426.5,2017-12-21 13:01:59,4976
4,379.5,456.5,2017-12-21 13:02:06,4976


In [2]:
# Folder holding the cached intermediate results that are loaded below.
savefolder = 'results_small'
makedirs(savefolder, exist_ok=True)

sampling_interval = pd.Timedelta(minutes=1)
split_border = pd.Timedelta(days=1)
# Time span covered by the data; the last Platoon argument is this span
# expressed as a whole number of sampling intervals.
max_time_interval = df['datetime'].max() - df['datetime'].min()
pl = Platoon(2, 60, 5, max_time_interval // sampling_interval)
miner = Miner(df, pl, sampling_interval)

# Each stage is timed separately; a commented-out compute/save pair means the
# stage is currently restored from its cached file instead of being recomputed.
time1 = time.time()
print('Start time: ' + time.ctime())
#miner.extract_staypoints_heatmap(10)
#miner.save_staypoints_heatmap(dir_join(savefolder, 'staypoints_heatmap.npy'))
miner.load_staypoints_heatmap(dir_join(savefolder, 'staypoints_heatmap.npy'))
time2 = time.time()
print('Staypoints extraction done. Time: ' + str(time2 - time1))
miner.unify_datetime(split_border)
time3 = time.time()
print('Data unification done. Time: ' + str(time3 - time2))
#miner.compute_candidate_stars(5)
#miner.save_candidate_stars(dir_join(savefolder, 'candidate_stars.json'))
miner.load_candidate_stars(dir_join(savefolder, 'candidate_stars.json'))
time4 = time.time()
print('Candidate stars computing done. Time: ' + str(time4 - time3))
miner.compute_pattern_set()
time5 = time.time()
print('Pattern set computing done. Time: ' + str(time5 - time4))
# NOTE(review): the connection rate is computed here and a cached one is then
# loaded from disk right after. Unlike the stages above, the compute call is
# not commented out -- confirm whether the load is meant to replace the fresh
# result or whether one of the two calls should be disabled.
miner.compute_connection_rate()
#miner.save_connection_rate(dir_join(savefolder, 'connection_rate.npz'))
miner.load_connection_rate(dir_join(savefolder, 'connection_rate.npz'))
time6 = time.time()
# Fixed: this label used to repeat the 'Pattern set' message of the previous stage.
print('Connection rate computing done. Time: ' + str(time6 - time5))
print('All time: ' + str(time6 - time1))
print('Finish time: ' + time.ctime())

Start time: Sun Apr 29 17:27:15 2018
Staypoints extraction done. Time: 0.008761405944824219
Data unification done. Time: 205.69818329811096
Candidate stars computing done. Time: 0.0559234619140625
Pattern set computing done. Time: 0.5513577461242676
Pattern set computing done. Time: 42.60465669631958
All time: 248.9188826084137
Finish time: Sun Apr 29 17:31:24 2018


In [3]:
# Load the user attributes: ids are read as strings and every missing cell is
# replaced by the placeholder 'not_set'.
userinfo = pd.read_csv(
    'user_info.csv',
    parse_dates=['userinfo_dateofbirth'],
    dtype={'userinfo_sso': np.str_},
).fillna('not_set')

# Bin edges every ten years, starting 365 days before the earliest birth date
# and running up to today.
earliest_edge = userinfo['userinfo_dateofbirth'].min() - pd.Timedelta(days=365)
bins = pd.date_range(earliest_edge, pd.to_datetime('today'), freq='10Y')
# drop redundant: keep edge 0, edge 3 and every edge from position 6 onwards
kept_positions = [0, 3] + list(range(6, len(bins)))
bins = bins[kept_positions]

# Each bin is labelled '<start year>-<end year>' from its two neighbouring edges.
bin_labels = [str(left.year) + '-' + str(right.year) for left, right in zip(bins[:-1], bins[1:])]
userinfo['dateofbirth_cat'] = pd.cut(userinfo['userinfo_dateofbirth'], bins, labels=bin_labels)
userinfo.head()

Unnamed: 0,userinfo_sso,userinfo_dateofbirth,userinfo_language,race,Gender,dateofbirth_cat
0,1,1970-01-01,de,"GreaterEuropean,WestEuropean,Italian",male,1969-1979
1,5,1984-11-01,en,"GreaterEuropean,WestEuropean,Italian",male,1979-1989
2,13,1970-01-01,de,"GreaterEuropean,WestEuropean,Germanic",male,1969-1979
3,25,1961-09-23,en,"GreaterEuropean,WestEuropean,Italian",female,1939-1969
4,28,1965-10-15,en,"GreaterEuropean,WestEuropean,Germanic",male,1939-1969


In [53]:
savefolder_userinfo = dir_join(savefolder, 'userinfo_hist')
if not dir_exists(savefolder_userinfo):
    makedirs(savefolder_userinfo)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()


def _save_value_counts_bar(values, title, target_path):
    """Draw a bar chart of how often each distinct entry of `values` occurs and save it.

    The chart is drawn on the current pyplot figure, titled `title` and written
    to `target_path`; the figure is cleared afterwards so that the next chart
    starts from an empty canvas.
    """
    # Rows of (count, label): np.unique yields (labels, counts), [::-1] swaps them.
    count_label_rows = np.array(np.unique(values, return_counts=True))[::-1].T
    pd.Series({label: count for count, label in count_label_rows}).plot(kind='bar')
    plt.title(title)
    plt.savefig(target_path)
    plt.clf()


for column_name in cat_cols:
    # One chart per graph node, built from the user_info rows of its neighbours.
    for node in G:
        neighbour_ids = list(G.neighbors(node))
        neighbour_mask = userinfo['userinfo_sso'].isin(neighbour_ids)
        _save_value_counts_bar(
            userinfo[neighbour_mask][column_name],
            ' '.join([column_name, 'Person', str(node)]),
            dir_join(savefolder_userinfo, '.'.join([str(node), column_name, 'png'])),
        )
    # One chart over every row of user_info, for comparison with the per-node charts.
    _save_value_counts_bar(
        userinfo[column_name],
        ' '.join([column_name, 'All']),
        dir_join(savefolder_userinfo, '.'.join([column_name, 'All', 'png'])),
    )

<matplotlib.figure.Figure at 0x7f700c9c5438>

In [56]:
savefolder_userinfo = dir_join(savefolder, 'unary_hist')
if not dir_exists(savefolder_userinfo):
    makedirs(savefolder_userinfo)

cat_cols = ['userinfo_language', 'race', 'Gender', 'dateofbirth_cat']
G = miner.graph()
for column_name in cat_cols:
    column = userinfo[column_name]
    # One accumulator per distinct value of the attribute. The explicit float
    # dtype avoids the deprecated dtype-less empty Series, which newer pandas
    # creates as object dtype and which then cannot be drawn as a bar chart.
    values_dict = {value: pd.Series(dtype=float) for value in column.unique()}
    for i in G:
        own_values = userinfo.loc[userinfo['userinfo_sso'] == i, column_name]
        if own_values.empty:
            # The node has no row in user_info; report it instead of failing
            # with an IndexError on .iloc[0].
            print(' '.join(['Node', str(i), 'skipped']))
            continue
        value = own_values.iloc[0]
        # Separate name so that the outer `column` is not overwritten.
        neighbor_column = userinfo.loc[userinfo['userinfo_sso'].isin(list(G.neighbors(i))), column_name]
        if len(neighbor_column) > 0:
            # Add the neighbours' normalised value distribution to the
            # accumulator of the node's own value.
            values_dict[value] = values_dict[value].add(
                neighbor_column.value_counts(normalize=True), fill_value=0)
    for value, distribution in values_dict.items():
        # str() keeps join() working for non-string values (e.g. NaN from pd.cut).
        label = str(value)
        if len(distribution) > 0:
            distribution.plot(kind='bar')
            plt.title(' '.join([column_name, label]))
            plt.savefig(dir_join(savefolder_userinfo, '.'.join([column_name, label, 'png'])))
            plt.clf()
        else:
            print(' '.join(['Value', label, 'skipped']))

Value Asian,GreaterEastAsian,EastAsian skipped
Value 1909-1939 skipped


<matplotlib.figure.Figure at 0x7f700cb8e6a0>