In [1]:
import pandas as pd
import numpy as np
import re
import glob
import geohash
import json
import collections as cl
import datetime
from tqdm import tqdm_notebook as tqdm

In [None]:
## ここから↓データの読み込み

In [None]:
def transform_from_dir(dir_name):
    """Run ``extract_columns`` on every entry directly inside ``dir_name``.

    Entries are visited in sorted path order. ``glob.glob`` does not promise
    any particular ordering, so without the sort the sequence in which files
    are processed (and therefore the order of whatever ``extract_columns``
    emits) would depend on the file system.

    Args:
        dir_name: Directory whose immediate children are processed.
    """
    for path in sorted(glob.glob("%s/*" % dir_name)):
        extract_columns(path)
        
def extract_10_minutes_data(trajectory_data):
    """Keep only the rows whose ``time`` string ends with ``'0:00'``.

    Args:
        trajectory_data: DataFrame with a string column named ``time``.

    Returns:
        The rows of ``trajectory_data`` selected by that suffix test, with
        their original index labels.
    """
    on_ten_minute_mark = trajectory_data['time'].str.endswith('0:00')
    return trajectory_data[on_ten_minute_mark]

def extract_columns(file_name):
    """Append the filtered time/long/lat rows of one CSV to ``./data/output.csv``.

    The file is read without a header row; columns 3, 4 and 5 (0-based) are
    taken and named ``time``, ``long`` and ``lat``. The rows returned by
    ``extract_10_minutes_data`` are then appended to the output file with no
    header and no index column. The file name is printed first.

    Args:
        file_name: Path of the CSV file to read.
    """
    print(file_name)
    raw = pd.read_csv(file_name, header=None)
    selected = raw.iloc[:, 3:6]
    selected.columns = ["time", "long", "lat"]
    # mode='a': every call adds to the same file rather than replacing it.
    extract_10_minutes_data(selected).to_csv(
        './data/output.csv', mode='a', index=False, header=False
    )

In [None]:
# Walk the numbered download folders time-tokyo-1 ... time-tokyo-24 in order.
for i in range(24):
    folder = "/Users/fumiyuki/Downloads/time-tokyo-%s" % str(i + 1)
    transform_from_dir(folder)

In [None]:
## ここから↓jsonにデータの書き込み

In [None]:
def encode_all_data(output_file_name, limit_num):
    """Encode rows of a headerless CSV and dump them, sorted, to a JSON file.

    The CSV columns are interpreted as ``time``, ``long``, ``lat``. Rows are
    passed to ``encode`` in file order; the encoded values are sorted and
    written under the key ``"data"`` to
    ``./data/real-<YYYYmmddHHMMSS>.json``.

    Args:
        output_file_name: Path of the CSV file to read.
        limit_num: Stop after this many rows have been encoded. A value the
            running count never reaches (for example 0) means every row is
            encoded.
    """
    frame = pd.read_csv(output_file_name, header=None)
    frame.columns = ["time", "long", "lat"]

    encoded_values = []
    for processed, (_, record) in enumerate(tqdm(frame.iterrows()), start=1):
        encoded_values.append(encode(record["time"], record["lat"], record["long"]))
        if processed == limit_num:
            break
    encoded_values.sort()

    payload = cl.OrderedDict()
    payload["data"] = encoded_values

    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    out_path = './data/real-%s.json' % (stamp)
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=None)

In [None]:
def timehash_encode(time):
    """Turn a timestamp string into a zero-padded 10-minute slot number.

    Characters 11-12 of ``time`` are read as the hour and character 14 as the
    tens digit of the minute (their positions in ``'YYYY-MM-DD HH:MM:SS'``).

    Returns:
        ``hour * 6 + tens_of_minute`` as a string padded with leading zeros
        to at least four characters.
    """
    hour = int(time[11:13])
    tens_of_minute = int(time[14])
    return "%04d" % (hour * 6 + tens_of_minute)

def timehash_encode_for_1minute(time):
    """Turn a timestamp string into a zero-padded minute-of-day number.

    Characters 11-12 of ``time`` are read as the hour and characters 14-15 as
    the minute (their positions in ``'YYYY-MM-DD HH:MM:SS'``).

    Returns:
        ``hour * 60 + minute`` as a string padded with leading zeros to at
        least four characters.
    """
    hour = int(time[11:13])
    minute = int(time[14:16])
    return "%04d" % (hour * 60 + minute)

def encode(time, latitude, longtitude):
    """Build the combined key for one point: geohash followed by time hash.

    Args:
        time: Timestamp string handed to ``timehash_encode``.
        latitude: First argument passed to ``geohash.encode``.
        longtitude: Second argument passed to ``geohash.encode``.

    Returns:
        The result of ``geohash.encode(latitude, longtitude, 9)`` with the
        time hash appended.
    """
    time_part = timehash_encode(time)
    # The third argument 9 is presumably the geohash length; confirm against
    # the geohash package in use.
    geo_part = geohash.encode(latitude, longtitude, 9)
    return "".join((geo_part, time_part)) if False else geo_part + time_part

In [None]:
# Encode at most one million rows of the accumulated CSV into a JSON file.
encode_all_data('./data/output.csv', 1_000_000)

In [None]:
## クエリデータ

In [2]:
def transform_from_dir(dir_name, id_data_list):
    """Load every entry in ``dir_name`` and append its 144-row chunks to a list.

    Each path is passed to ``extract_columns``; the result is cut into
    consecutive slices of 144 rows and every complete slice is appended to
    ``id_data_list``. Rows beyond the last complete slice are not appended.

    Paths are visited in sorted order. ``glob.glob`` does not promise any
    particular ordering, so without the sort the order of the appended chunks
    would depend on the file system.

    Args:
        dir_name: Directory whose immediate children are processed.
        id_data_list: List that receives the chunks; it is modified in place.

    Returns:
        The same ``id_data_list`` object that was passed in.
    """
    files = sorted(glob.glob("%s/*" % dir_name))
    for file in tqdm(files):
        id_data = extract_columns(file)
        count = len(id_data) // 144
        for i in range(count):
            id_data_list.append(id_data[i*144:(i+1)*144])
    return id_data_list
        
def extract_10_minutes_data(trajectory_data):
    """Select the rows whose ``time`` value ends with the text ``'0:00'``.

    Args:
        trajectory_data: DataFrame that has a string column called ``time``.

    Returns:
        The matching rows, keeping their original index labels and columns.
    """
    time_column = trajectory_data['time']
    keep = time_column.str.endswith('0:00')
    return trajectory_data[keep]

def extract_columns(file_name):
    """Load one CSV and return its de-duplicated, filtered time/long/lat rows.

    The file is read without a header row; columns 3, 4 and 5 (0-based) are
    taken and named ``time``, ``long`` and ``lat``. The rows are filtered by
    ``extract_10_minutes_data`` and then reduced to the first row for each
    distinct ``time`` value.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        The resulting DataFrame.

    Raises:
        ValueError: If the number of remaining rows is not a multiple of 144.
            The file name is printed before raising.
    """
    raw = pd.read_csv(file_name, header=None)
    selected = raw.iloc[:, [3, 4, 5]]
    selected.columns = ["time", "long", "lat"]
    deduped = extract_10_minutes_data(selected).drop_duplicates(subset=["time"])
    if len(deduped) % 144 == 0:
        return deduped
    print(file_name)
    raise ValueError("id_data is not 144 muliplies")

In [None]:
# Start from an empty list and collect the chunks from the four numbered
# tokyo-id-<n>/data folders (n = 1..4) in order.
id_data_list = []
for i in range(4):
    id_dir = "/Users/fumiyuki/Downloads/tokyo-id-%s/data" % str(i + 1)
    transform_from_dir(id_dir, id_data_list)

In [None]:
def encode_all_data(data_list, client_limit):
    """Bundle consecutive frames into per-client queries and write them to JSON.

    Frames from ``data_list`` are collected 14 at a time. Each complete group
    is concatenated (and asserted to hold exactly 2016 rows), every row is
    passed through ``encode``, ``.encode()`` is called on each result, and the
    hex form of the joined bytes is stored together with the row count and a
    sequential ``query_id``. Frames left over after the last complete group
    are not written.

    The output goes to ``./data/client-<client_limit>-real-<timestamp>.json``
    with the keys ``"data"`` (list of queries) and ``"client_size"`` (number
    of queries produced, which is also printed).

    Args:
        data_list: Iterable of DataFrames with ``time``, ``lat`` and ``long``
            columns.
        client_limit: Stop as soon as this many queries have been produced.
    """
    next_query_id = 0
    pending = []
    queries = []
    for position, frame in tqdm(enumerate(data_list)):
        pending.append(frame)
        if (position + 1) % 14 == 0:
            merged = pd.concat(pending)
            assert(len(merged) == 2016)
            pieces = [
                encode(record["time"], record["lat"], record["long"]).encode()
                for _, record in merged.iterrows()
            ]
            queries.append({
                "geodata": b''.join(pieces).hex(),
                "query_size": len(merged),
                "query_id": next_query_id,
            })
            pending = []
            next_query_id += 1
        # Checked on every iteration, not only when a group was just closed.
        if next_query_id == client_limit:
            break

    payload = cl.OrderedDict()
    payload["data"] = queries
    payload["client_size"] = next_query_id
    print("client size", next_query_id)

    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    out_path = './data/client-%s-real-%s.json' % (str(client_limit), stamp)
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=None)

In [None]:
# Write the client query file, stopping after 6000 clients.
encode_all_data(id_data_list, client_limit=6000)

In [None]:
# テスト

In [None]:
# Scratch check on a single file: load it, keep the rows selected by
# extract_10_minutes_data, then overwrite the timestamp of the row labelled 0.
time_tokyo = pd.read_csv("/Users/fumiyuki/Downloads/time-tokyo-1/08TKY_time_0000.csv", header=None)
time_tokyo = time_tokyo.iloc[:, 3:6]
time_tokyo.columns = ["time", "long", "lat"]
# .copy() makes the filtered frame independent, so the write below targets it
# directly instead of a possibly temporary object.
time_tokyo = extract_10_minutes_data(time_tokyo).copy()
# One .loc write instead of the chained time_tokyo["time"][0] = ...: chained
# assignment triggers SettingWithCopyWarning and does not update the frame
# when pandas Copy-on-Write is active.
# NOTE(review): assumes the row labelled 0 survives the filter; confirm.
time_tokyo.loc[0, "time"] = '2008-10-01 00:00:00'