In [1]:
import pandas as pd
import numpy as np
import re
import glob
import geohash
import json
import collections as cl
import datetime
from tqdm import tqdm_notebook as tqdm
import pickle
import random
import GPencode
import TrajectoryHash

In [3]:
## functions for reading from csv files

def transform_from_dir(dir_name, index, area):
    """Run ``extract_columns`` on every path matched by ``<dir_name>/*``.

    Args:
        dir_name: Directory whose direct children are processed.
        index: Forwarded unchanged to ``extract_columns``.
        area: Forwarded unchanged to ``extract_columns``.
    """
    for path in glob.glob("%s/*" % dir_name):
        extract_columns(path, index, area)

# Extract the needed columns (time, coordinates) and append them to the per-area CSV.
def extract_columns(file_name, index, area):
    """Sample one raw CSV and append its (time, long, lat) rows to the output file.

    Only rows whose positional index satisfies ``row_index % 10 == index`` are
    kept, then columns 3..5 are selected and exact duplicate rows are removed
    before the rows are appended (no header, no index column) to
    ``./data/output-1minute-<area>-index-<index>.csv``.

    Args:
        file_name: Path of a header-less CSV file.
        index: Residue (modulo 10) of the row indices to keep.
        area: Label embedded in the output file name.
    """
    print(file_name)
    raw = pd.read_csv(file_name, header=None)
    # `index` inside query() refers to the frame's row index.
    sampled = raw.query('index %% 10 == %s' % index)
    points = sampled.iloc[:, 3:6]
    points.columns = ["time", "long", "lat"]
    # drop_duplicates() returns a new frame; the original code discarded the
    # result, so duplicated rows were still written out.
    points = points.drop_duplicates()
    points.to_csv('./data/output-1minute-%s-index-%s.csv' % (area, str(index)), mode='a', index=False, header=False)
    

def _encode_batch(trajectory_data_n, data_list, encode, theta_t, theta_l):
    """Encode every row of ``trajectory_data_n`` and append the codes to ``data_list``.

    Args:
        trajectory_data_n: Three-column DataFrame (time string, longitude,
            latitude). Its columns are renamed in place to
            ``["time", "long", "lat"]``.
        data_list: List that receives one encoded value per row (mutated).
        encode: Encoder called as ``encode(t, TIME_START, TIME_END, lat, long,
            theta_t=..., theta_l=...)``.
        theta_t: Passed to ``encode`` as ``theta_t``.
        theta_l: Passed to ``encode`` as ``theta_l``.

    Returns:
        Always ``True``.
    """
    trajectory_data_n.columns = ["time", "long", "lat"]
    # iterrows() is a generator without a length, so hand tqdm the row count;
    # otherwise the progress bar cannot show how far along the loop is.
    for _, row in tqdm(trajectory_data_n.iterrows(), total=len(trajectory_data_n)):
        encoded_value = encode(
            unixepoch_from_str(row["time"]),
            TIME_START,
            TIME_END,
            row["lat"],
            row["long"],
            theta_t=theta_t,
            theta_l=theta_l
        )
        data_list.append(encoded_value)
    return True


# 48bit
def encode_batch_data_by_th48(trajectory_data_n, data_list):
    """Append TrajectoryHash codes (theta_t=24, theta_l=16) for every row to ``data_list``."""
    return _encode_batch(trajectory_data_n, data_list, TrajectoryHash.encode, theta_t=24, theta_l=16)

# 42bit
def encode_batch_data_by_th42(trajectory_data_n, data_list):
    """Append TrajectoryHash codes (theta_t=24, theta_l=14) for every row to ``data_list``."""
    return _encode_batch(trajectory_data_n, data_list, TrajectoryHash.encode, theta_t=24, theta_l=14)

# 36bit
def encode_batch_data_by_th36(trajectory_data_n, data_list):
    """Append TrajectoryHash codes (theta_t=24, theta_l=12) for every row to ``data_list``."""
    return _encode_batch(trajectory_data_n, data_list, TrajectoryHash.encode, theta_t=24, theta_l=12)

# 72bit
def encode_batch_data_by_th72(trajectory_data_n, data_list):
    """Append TrajectoryHash codes (theta_t=24, theta_l=24) for every row to ``data_list``."""
    return _encode_batch(trajectory_data_n, data_list, TrajectoryHash.encode, theta_t=24, theta_l=24)

# 14byte
def encode_batch_data_by_gp10(trajectory_data_n, data_list):
    """Append GPencode codes (theta_t=1440, theta_l=10) for every row to ``data_list``."""
    return _encode_batch(trajectory_data_n, data_list, GPencode.encode, theta_t=1440, theta_l=10)

def unixepoch_from_str(time_str):
    """Convert a ``'YYYY-MM-DD HH:MM:SS'`` string to an integer Unix timestamp.

    The first 10 characters (the date) are always replaced by ``2008-10-01``,
    so only the time-of-day part of ``time_str`` influences the result. The
    string is parsed as a naive datetime, so the value depends on the local
    time zone of the machine.
    """
    # A small number of records in the dataset carry other dates
    # (e.g. 2010-10-01), so every record is pinned to the same day.
    time_of_day = time_str[10:]
    normalized = '2008-10-01' + time_of_day
    parsed = datetime.datetime.strptime(normalized, '%Y-%m-%d %H:%M:%S')
    return int(parsed.timestamp())

# The covered time window is given up front: one day at one-minute resolution.
time_start_str, time_end_str = '2008-10-01 00:00:00', '2008-10-01 23:59:00'
TIME_START, TIME_END = (
    unixepoch_from_str(boundary) for boundary in (time_start_str, time_end_str)
)

In [None]:
# # read data and extract necessary columns
# for index in range(10):
#     for i in range(24):
#         transform_from_dir("/Users/fumiyuki/Downloads/time-tokyo-%s" % str(i + 1), index, 'tokyo')
#         transform_from_dir("/Users/fumiyuki/Downloads/time-kinki-%s" % str(i + 1), index, 'kinki')

In [9]:
# Load each area's sampled CSV and encode it with every active scheme,
# collecting the encoded values in one flat list per scheme.

th48_data_list, th42_data_list, gp10_data_list = [], [], []
# th36_data_list = []
# th72_data_list = []
for index in range(1):
    for area in ["tokyo", "kinki"]:
        csv_path = './data/output-1minute-%s-index-%s.csv' % (area, str(index+2))
        trajectory_data = pd.read_csv(csv_path, header=None).drop_duplicates()
        encode_batch_data_by_th48(trajectory_data, th48_data_list)
        encode_batch_data_by_th42(trajectory_data, th42_data_list)
#         encode_batch_data_by_th36(trajectory_data, th36_data_list)
#         encode_batch_data_by_th72(trajectory_data, th72_data_list)
        encode_batch_data_by_gp10(trajectory_data, gp10_data_list)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(trajectory_data_n.iterrows()):


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(trajectory_data_n.iterrows()):


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm(trajectory_data_n.iterrows()):


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [10]:
# Reseed before every shuffle so each list is shuffled from the same
# generator state.
for encoded_list in (gp10_data_list, th48_data_list, th42_data_list):
    np.random.seed(seed=0)
    np.random.shuffle(encoded_list)
# np.random.seed(seed=0)
# np.random.shuffle(th36_data_list)
# np.random.seed(seed=0)
# np.random.shuffle(th72_data_list)

In [11]:
def json_dump_by_num(data_list, limit_num, method):
    """Write the first ``limit_num`` items of ``data_list``, sorted, to a JSON file.

    The file is ``./data/new/<method>-central-<limit_num>-<timestamp>.json`` and
    contains ``{"data": [...]}``. ``data_list`` itself is not modified.

    Note: the file name always carries ``limit_num``, even when ``data_list``
    holds fewer items than that.

    Args:
        data_list: List of encoded values.
        limit_num: Maximum number of leading items to dump.
        method: Label embedded in the file name.
    """
    import os  # local import so this cell does not depend on the import cell

    json_data = cl.OrderedDict()
    # sorted() works on the sliced copy, leaving the caller's list order intact.
    json_data["data"] = sorted(data_list[:limit_num])
    now_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    filename = './data/new/%s-central-%s-%s.json' % (method, str(limit_num), now_timestamp)
    # open() does not create parent directories; without this a fresh checkout
    # fails with FileNotFoundError.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        json.dump(json_data, f, indent=None)

In [12]:
# Dump sorted th48 prefixes of increasing size.
for limit_num in (10000000, 20000000, 30000000, 40000000, 50000000, 70000000, 100000000):
    json_dump_by_num(th48_data_list, limit_num, "th48")

In [13]:
# Dump sorted gp10 prefixes of increasing size.
for limit_num in (10000000, 20000000, 30000000, 40000000, 50000000, 70000000, 100000000):
    json_dump_by_num(gp10_data_list, limit_num, "gp10")

In [14]:
# Dump sorted th42 prefixes of increasing size.
for limit_num in (10000000, 20000000, 30000000, 40000000, 50000000, 70000000, 100000000):
    json_dump_by_num(th42_data_list, limit_num, "th42")

In [None]:
# th72_data_list only exists when the th72 lines of the encoding cell are
# uncommented; skip explicitly instead of raising NameError on "Run All".
if "th72_data_list" in globals():
    for limit_num in (10000000, 20000000, 30000000, 40000000, 50000000, 70000000, 100000000):
        json_dump_by_num(th72_data_list, limit_num, "th72")
else:
    print("th72_data_list is not defined; skipping th72 dumps")

In [None]:
# th36_data_list only exists when the th36 lines of the encoding cell are
# uncommented; skip explicitly instead of raising NameError on "Run All".
if "th36_data_list" in globals():
    for limit_num in (10000000, 20000000, 30000000, 40000000, 50000000, 70000000, 100000000):
        json_dump_by_num(th36_data_list, limit_num, "th36")
else:
    print("th36_data_list is not defined; skipping th36 dumps")

In [3]:
# Read per-id CSV files, extract the needed columns and split them into
# fixed-size (1440-row) queries.
def transform_from_dir(dir_name, id_data_list, current, batch_size):
    """Append 1440-row trajectory frames from one batch of files to ``id_data_list``.

    NOTE: this redefines the ``transform_from_dir`` of the earlier cell with a
    different signature; after this cell runs, the earlier version is shadowed.

    The files matched by ``<dir_name>/*`` are sorted and the slice
    ``[current:current+batch_size]`` is processed. Each file is loaded with
    ``extract_columns``; files with fewer than 1440 rows are padded by repeating
    their leading rows, and the result is cut into consecutive 1440-row frames.

    Args:
        dir_name: Directory containing one CSV per id.
        id_data_list: List that receives the 1440-row DataFrames (mutated).
        current: Offset of the first file of the batch in the sorted listing.
        batch_size: Number of files in the batch.
    """
    files = sorted(glob.glob("%s/*" % dir_name))[current:current+batch_size]
    for file in tqdm(files):
        id_data = extract_columns(file)
        # "amari" = remainder: how many rows are missing to reach 1440
        # (negative when the file has more than 1440 rows).
        amari = 1440 - len(id_data)
        # Files longer than 1440 rows are reported and skipped, except for
        # files of exactly 2880 rows.
        if amari < 0 and amari != -1440:
            print(file)
            continue
        # DataFrame.append was removed in pandas 2.0; pd.concat keeps the
        # original row labels exactly like append did.
        # NOTE(review): for a 2880-row file, iloc[:-1440] re-appends the first
        # 1440 rows, so three chunks are emitted and the third repeats the
        # first -- confirm this is intended.
        # NOTE(review): files with fewer than 720 rows cannot be padded to
        # 1440 this way and are dropped without any message.
        id_data = pd.concat([id_data, id_data.iloc[:amari]])
        for i in range(len(id_data) // 1440):
            id_data_list.append(id_data[i*1440:(i+1)*1440])


def extract_columns(file_name):
    """Load a header-less CSV and return its (time, long, lat) columns.

    Columns 3, 4 and 5 are selected and named ``time``, ``long`` and ``lat``;
    when several rows share the same ``time`` only the first one is kept.

    Args:
        file_name: Path of the CSV file.

    Returns:
        A DataFrame with columns ``["time", "long", "lat"]``.
    """
    selected = pd.read_csv(file_name, header=None).iloc[:, [3, 4, 5]]
    selected.columns = ["time", "long", "lat"]
    return selected.drop_duplicates(subset=["time"])

In [None]:
# Collect 1440-row trajectories from the per-id dumps, 250 files per call.
# NOTE(review): the tokyo directories are numbered from 2 and the kinki ones
# from 1, and both are absolute local paths -- confirm against the data layout.
id_data_list = []
batch_size = 250
for i in range(2):
    for j in range(40): # for batch processing
        current = j * batch_size
        transform_from_dir("/Users/fumiyuki/Downloads/tokyo-id-%s/data" % str(i + 2), id_data_list, current, batch_size)
        transform_from_dir("/Users/fumiyuki/Downloads/kinki-id-%s" % str(i + 1), id_data_list, current, batch_size)
current = 0

In [4]:
# To regenerate the cache from id_data_list, uncomment:
# with open('id_data_list.pickle', 'wb') as f:
#     pickle.dump(id_data_list, f, pickle.HIGHEST_PROTOCOL)

# Load the cached frames. pickle can execute arbitrary code while loading,
# so only open a file that was produced locally.
with open('id_data_list.pickle', 'rb') as pickle_file:
    data_list = pickle.load(pickle_file)

In [5]:
def encode_all_data_and_dump_json(data_list, client_limit, method, encode, theta_t, theta_l):
    """Encode per-client trajectories and dump them as one JSON file.

    Every frame in ``data_list`` must have exactly 1440 rows. Each row is
    encoded with ``encode`` and the frame becomes one entry
    ``{"geodata": [...], "query_size": 1440, "query_id": k}``. Processing stops
    once ``client_limit`` entries have been produced. The result is written to
    ``./data/new/<method>-client-<client_limit>-<timestamp>.json`` as
    ``{"data": [...], "client_size": <number of entries>}``.

    Args:
        data_list: Iterable of DataFrames with ``time``, ``lat`` and ``long`` columns.
        client_limit: Number of entries after which processing stops.
        method: Label embedded in the file name.
        encode: Encoder called as ``encode(t, TIME_START, TIME_END, lat, long,
            theta_t=..., theta_l=...)``.
        theta_t: Passed to ``encode`` as ``theta_t``.
        theta_l: Passed to ``encode`` as ``theta_l``.

    Raises:
        AssertionError: If a frame does not have 1440 rows.
    """
    import os  # local import so this cell does not depend on the import cell

    total_data_list = []
    # Wrap the data itself (not enumerate(...)) so tqdm can see its length and
    # display real progress.
    for query_id, id_data in enumerate(tqdm(data_list)):
        assert(len(id_data) == 1440)
        encoded_list = [
            encode(
                unixepoch_from_str(row["time"]),
                TIME_START,
                TIME_END,
                row["lat"],
                row["long"],
                theta_t=theta_t,
                theta_l=theta_l
            )
            for _, row in id_data.iterrows()
        ]
        total_data_list.append(
            { "geodata": encoded_list, "query_size": len(id_data), "query_id": query_id }
        )
        if query_id + 1 == client_limit:
            break

    client_size = len(total_data_list)
    json_data = cl.OrderedDict()
    json_data["data"] = total_data_list
    json_data["client_size"] = client_size
    print("client size", client_size)

    now_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    filename = './data/new/%s-client-%s-%s.json' % (method, str(client_limit), now_timestamp)
    # open() does not create parent directories; make sure the target exists.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        json.dump(json_data, f, indent=None)

In [None]:
# Client-side th48 files for 1000, 2000, ..., 7000 clients.
for client_limit in range(1000, 8000, 1000):
    encode_all_data_and_dump_json(data_list, client_limit, "th48", TrajectoryHash.encode, 24, 16)

In [6]:
# Client-side gp10 files for 1000, 2000, ..., 7000 clients.
for client_limit in range(1000, 8000, 1000):
    encode_all_data_and_dump_json(data_list, client_limit, "gp10", GPencode.encode, 1440, 10)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, id_data in tqdm(enumerate(data_list)):


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 1000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 2000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 3000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 4000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 5000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 6000


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


client size 7000
