In [None]:
import pandas as pd
import numpy as np
import re
import glob
import geohash
import json
import collections as cl
import datetime
from tqdm import tqdm_notebook as tqdm
import pickle
import random
import GPencode

In [None]:
## common encoding function

In [None]:
# Coarse (10-minute bucket) variant of the periodical time encoding.
def timehash_encode(time):
    """Encode the clock time of a timestamp string as a 10-minute bucket index.

    Args:
        time: timestamp string whose characters 11-12 hold the hour and whose
            character 14 holds the tens digit of the minute
            (e.g. '2008-10-01 12:34:00').

    Returns:
        ``hour * 6 + minute_tens`` as a string, left-padded with zeros to at
        least four characters.
    """
    hour = int(time[11:13])
    minute_tens = int(time[14])
    bucket = hour * 6 + minute_tens
    return str(bucket).zfill(4)

# Periodical encoding at one-minute resolution.
def timehash_encode_for_1minute(time):
    """Encode the clock time of a timestamp string as a minute-of-day index.

    Args:
        time: timestamp string whose characters 11-12 hold the hour and
            characters 14-15 hold the minute (e.g. '2008-10-01 12:34:00').

    Returns:
        ``hour * 60 + minute`` as a string, left-padded with zeros to at
        least four characters.
    """
    hour = int(time[11:13])
    minute = int(time[14:16])
    minute_of_day = hour * 60 + minute
    return str(minute_of_day).zfill(4)

def encode(time, latitude, longtitude):
    """Build a combined space-time code for one observation.

    The result is the geohash of the coordinate (third argument 10 passed to
    ``geohash.encode``) immediately followed by the one-minute time hash.

    Args:
        time: timestamp string accepted by ``timehash_encode_for_1minute``.
        latitude: latitude passed to ``geohash.encode``.
        longtitude: longitude passed to ``geohash.encode`` (parameter name
            kept as spelled for compatibility with existing callers).

    Returns:
        The geohash concatenated with the time hash.
    """
    # The time part is computed first, matching the original evaluation order.
    time_part = timehash_encode_for_1minute(time)
    geo_part = geohash.encode(latitude, longtitude, 10)
    return geo_part + time_part

In [None]:
## functions for reading from csv files

In [None]:
def transform_from_dir(dir_name, index, area):
    """Run ``extract_columns`` on every entry directly inside ``dir_name``.

    NOTE: a function with the same name but a different signature is defined
    later in this notebook (client-side section); once that cell has run,
    this version is no longer reachable under this name.

    Args:
        dir_name: directory whose entries are matched with ``<dir_name>/*``.
        index: forwarded unchanged to ``extract_columns``.
        area: forwarded unchanged to ``extract_columns``.
    """
    pattern = "%s/*" % dir_name
    for path in glob.glob(pattern):
        extract_columns(path, index, area)

# extract necessary columns (time, coordinate) and write to new file.
def extract_columns(file_name, index, area):
    """Append one slice of a raw CSV to the per-area, per-index output file.

    Keeps only the rows whose row number modulo 10 equals ``index``, selects
    columns 3-5 (named time, long, lat), removes exact duplicate rows and
    appends the result to
    ``./data/output-1minute-<area>-index-<index>.csv``.

    Fix: the original called ``drop_duplicates()`` without using its return
    value, so duplicates were never removed before writing.

    Duplicates are only removed within a single input file; because the
    output is opened in append mode, duplicates across files (or across
    repeated runs) can still end up in the output.

    Args:
        file_name: path of the header-less source CSV.
        index: residue (0-9) of the row number modulo 10 to keep.
        area: label inserted into the output file name.
    """
    print(file_name)
    raw = pd.read_csv(file_name, header=None)
    # 'index' inside query() refers to the DataFrame's row index.
    subset = raw.query('index %% 10 == %s' % index).iloc[:, 3:6]
    subset.columns = ["time", "long", "lat"]
    # drop_duplicates returns a new frame; the result must be kept.
    subset = subset.drop_duplicates()
    subset.to_csv('./data/output-1minute-%s-index-%s.csv' % (area, str(index)), mode='a', index=False, header=False)
    

# execute encode (geohash + time-hash variant)
def encode_batch_data(trajectory_data_n, data_list):
    """Encode every row of a trajectory frame with ``encode`` and collect the codes.

    NOTE: this definition is shadowed by the GPencode-based function of the
    same name defined right after it in this cell, so it is never the one
    that gets called.

    Args:
        trajectory_data_n: three-column DataFrame; its columns are renamed
            in place to time / long / lat.
        data_list: list that receives one encoded value per row (mutated).

    Returns:
        Always True.
    """
    trajectory_data_n.columns = ["time", "long", "lat"]
    data_list.extend(
        encode(record["time"], record["lat"], record["long"])
        for _, record in tqdm(trajectory_data_n.iterrows())
    )
    return True


# execute encode
# GPencode parameter theta_t = 1440, theta_l = 10
def encode_batch_data(trajectory_data_n, data_list):
    """Encode every row of a trajectory frame with ``GPencode.encode``.

    Each row's time string is parsed with ``unixepoch_from_str`` and passed
    to ``GPencode.encode`` together with the module-level TIME_START /
    TIME_END bounds, the row's latitude and longitude, and the fixed
    parameters theta_t=1440 and theta_l=10.

    Args:
        trajectory_data_n: three-column DataFrame; its columns are renamed
            in place to time / long / lat.
        data_list: list that receives one encoded value per row (mutated).

    Returns:
        Always True.
    """
    trajectory_data_n.columns = ["time", "long", "lat"]
    rows = tqdm(trajectory_data_n.iterrows())
    data_list.extend(
        GPencode.encode(
            unixepoch_from_str(record["time"]),
            TIME_START,
            TIME_END,
            record["lat"],
            record["long"],
            theta_t=1440,
            theta_l=10,
        )
        for _, record in rows
    )
    return True

def unixepoch_from_str(time_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string.

    Despite the name, this returns a naive ``datetime.datetime`` object,
    not a numeric Unix epoch value.

    Args:
        time_str: timestamp in the format ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The parsed ``datetime.datetime``.

    Raises:
        ValueError: if ``time_str`` does not match the format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.datetime.strptime(time_str, timestamp_format)
    return parsed

# The observation window is known in advance; both bounds are parsed once
# and used by the GPencode-based encoders as TIME_START / TIME_END.
time_start_str = '2008-10-01 00:00:00'
time_end_str = '2008-10-01 23:59:00'
TIME_START = unixepoch_from_str(time_start_str)
TIME_END = unixepoch_from_str(time_end_str)

In [None]:
"""
ファイル名とデータセットについて

/Users/fumiyuki/Downloads/time-tokyo-%s
    東京近辺における軌跡データのオリジナルのデータセット
/Users/fumiyuki/Downloads/time-kinki-%s
    近畿近辺における軌跡データのオリジナルのデータセット


    2008-10-01 00:00:00　から2008-10-01 23:59:00 まで1分ごとにデータが入っている．
    同時刻に何万ものデータが入っているが，データは匿名化されていて，このデータセットから一人あたりのデータセットを再現することはできない．
    なのでサーバサイドのデータとして利用する
    
    
    一方で
/Users/fumiyuki/Downloads/tokyo-id-%s
/Users/fumiyuki/Downloads/kinki-id-%s
    これらは，idごとに区別可能な1日分のデータセットがある．
    これらからクライアントのデータセットを作成する．ただしデータは1分ごとであり，1idあたり1日分しか識別できない．
"""

In [None]:
# Read the raw data and extract the necessary columns for every
# (index, numbered source directory, area) combination.
# Note: extract_columns appends to its output files, so running this cell
# more than once adds the same rows again.
for index in range(10):
    for part in range(1, 25):
        for area in ('tokyo', 'kinki'):
            transform_from_dir("/Users/fumiyuki/Downloads/time-%s-%s" % (area, part), index, area)

In [None]:
"""
./data/output-1minute-%s-index-%s.csv
    元のデータから必要なカラムを抜き取って保存してあるcsvファイル．元のデータのサイズが大きくて使いづらいので小さくしている．
    あと，元のデータは時間ごとに分けられているため，
    時間の粒度は1分ごとで人の区別はない．サーバサイドのデータとして扱うので人を区別する必要はない．
    同じ時間，同じ場所のデータは落としているので注意． 
    時間空間的に公平になるようにTokyo, Kinkiのそれぞれのデータセットから，行番号を10で割った剰余（index）ごとに10個に分けている．
    なので各データセットに時間が均等に分布している．
"""

In [None]:
# Read the extracted trajectories, drop exact duplicate rows, encode them
# and accumulate every code in data_list.
# Only the index-1 slice of each area is read (range(1) -> index + 1 == 1).
data_list = []
for index in range(1):
    for area in ["tokyo", "kinki"]:
        csv_path = './data/output-1minute-%s-index-%s.csv' % (area, str(index+1))
        trajectory_data = pd.read_csv(csv_path, header=None).drop_duplicates()
        encode_batch_data(trajectory_data, data_list)

In [None]:
# cache as pickle

"""
index-1-2.pickleは上のindexが1と2のものをまとめたもので，
中には，東京と近畿のデータセットのうちの20%が含まれていることになる．
"""
# with open('index-1-2.pickle', 'wb') as f:
#     pickle.dump(data_list, f, pickle.HIGHEST_PROTOCOL)

# with open('index-1.pickle', 'rb') as f:
#     data_list = pickle.load(f)

# with open('index-1-2.pickle', 'rb') as f:
#     data_list = pickle.load(f)

In [None]:
## write server side infected data

""" format
{
    "data": [
        "xn7ehpxnex0001",
        ...
    ]
}
"""

In [None]:
def json_dump_by_num(data_list, limit_num):
    """Write the first ``limit_num`` entries of ``data_list``, sorted, to a JSON file.

    The output has the shape ``{"data": [...]}`` and is written to
    ``./data/central-1minute-<limit_num>-<YYYYmmddHHMMSS>.json``.
    ``data_list`` itself is not modified (a slice copy is sorted).

    Args:
        data_list: list of encoded values.
        limit_num: number of leading entries to keep.
    """
    selected = sorted(data_list[:limit_num])
    payload = cl.OrderedDict([("data", selected)])
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    out_path = './data/central-1minute-%s-%s.json' % (str(limit_num), stamp)
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=None)

In [None]:
# Shuffle in place so that the leading slice written below is a random
# selection of the encoded values. No seed is set, so each run differs.
random.shuffle(data_list)
json_dump_by_num(data_list, limit_num=5000000)

In [None]:
## client side query data

In [None]:
# read from csv files and extract necessary columns and distribute for each queries.
def transform_from_dir(dir_name, id_data_list, current, batch_size):
    """Load one batch of per-id CSV files and append 1440-row chunks to ``id_data_list``.

    The entries of ``dir_name`` are sorted by path and the slice
    ``[current:current + batch_size]`` is processed. For each file the
    time/long/lat columns are read with ``extract_columns`` and then:

    * fewer than 1440 rows: the frame is padded by re-appending its first
      ``1440 - len`` rows. If it is still shorter than 1440 rows afterwards
      (fewer than 720 original rows), nothing is appended for that file.
    * exactly 1440 rows: appended as one chunk.
    * more than 1440 rows but not exactly 2880: the path is printed and the
      file is skipped.
    * exactly 2880 rows: see the review note in the body.

    Fix: ``DataFrame.append`` (removed in pandas 2.0) is replaced by the
    equivalent ``pd.concat``.

    NOTE: this redefinition replaces the three-argument ``transform_from_dir``
    defined earlier in the notebook.

    Args:
        dir_name: directory containing one CSV per id.
        id_data_list: list that receives the 1440-row DataFrames (mutated).
        current: start offset into the sorted file list.
        batch_size: number of files to process from ``current``.
    """
    rows_per_query = 1440
    files = sorted(glob.glob("%s/*" % dir_name))[current:current + batch_size]
    for file in tqdm(files):
        id_data = extract_columns(file)
        amari = rows_per_query - len(id_data)  # shortfall against a full chunk
        if amari < 0 and amari != -rows_per_query:
            print(file)
            continue
        # NOTE(review): for a 2880-row file amari == -1440, so iloc[:amari]
        # is the first 1440 rows and three chunks are produced, the third
        # duplicating the first. Behaviour kept as in the original; confirm
        # whether two chunks were intended.
        id_data = pd.concat([id_data, id_data.iloc[:amari]])
        for i in range(len(id_data) // rows_per_query):
            id_data_list.append(id_data[i*rows_per_query:(i+1)*rows_per_query])


def extract_columns(file_name):
    """Read a header-less per-id CSV and return its time/long/lat columns.

    Columns 3, 4 and 5 are kept and named time, long and lat; only the first
    row for each distinct ``time`` value is retained.

    NOTE: this redefinition replaces the three-argument ``extract_columns``
    defined earlier in the notebook.

    Args:
        file_name: path of the CSV file.

    Returns:
        DataFrame with columns ["time", "long", "lat"], keeping the original
        row labels of the retained rows.
    """
    raw = pd.read_csv(file_name, header=None)
    return (
        raw.iloc[:, [3, 4, 5]]
        .rename(columns={3: "time", 4: "long", 5: "lat"})
        .drop_duplicates(subset=["time"])
    )

In [None]:
# Collect 1440-row client trajectories from the per-id datasets.
# Each directory pair is walked in 40 batches of `batch_size` files.
id_data_list = []
batch_size = 250
current = 0
for i in range(2):
    tokyo_dir = "/Users/fumiyuki/Downloads/tokyo-id-%s/data" % str(i + 2)
    kinki_dir = "/Users/fumiyuki/Downloads/kinki-id-%s" % str(i + 1)
    for j in range(40): # for batch processing
        current = j * batch_size
        transform_from_dir(tokyo_dir, id_data_list, current, batch_size)
        transform_from_dir(kinki_dir, id_data_list, current, batch_size)
    current = 0

In [None]:
import pickle

# To regenerate the cache from id_data_list, run:
# with open('id_data_list.pickle', 'wb') as f:
#     pickle.dump(id_data_list, f, pickle.HIGHEST_PROTOCOL)

# Load the cached client trajectories. This rebinds `data_list`, which
# earlier in the notebook held the server-side encoded values.
# pickle.load can execute arbitrary code; only load files you created.
with open('id_data_list.pickle', 'rb') as cache_file:
    data_list = pickle.load(cache_file)

In [None]:
def encode_all_data_and_dump_json(data_list, client_limit):
    """Encode per-client trajectories with GPencode and write them to a JSON file.

    For each 1440-row frame in ``data_list`` every row is encoded with
    ``GPencode.encode`` (theta_t=1440, theta_l=10, bounded by the
    module-level TIME_START / TIME_END) and stored as
    ``{"geodata": [...], "query_size": <rows>, "query_id": <n>}``.
    Processing stops once ``client_limit`` clients have been encoded.

    The output ``{"data": [...], "client_size": <n>}`` is written to
    ``./data/geohash10-client-<client_limit>-real-for-1minute-<timestamp>.json``.

    Fix: the original stored the undefined name ``encode_list`` under
    "geodata", which raised NameError on the first client; it now stores
    ``encoded_list``. The unused ``same_data`` list and loop counter were
    removed.

    Args:
        data_list: iterable of DataFrames with "time", "lat" and "long"
            columns, each exactly 1440 rows long.
        client_limit: number of clients after which encoding stops.

    Raises:
        AssertionError: if a frame does not have exactly 1440 rows.
    """
    current_id = 0
    total_data_list = []
    for id_data in tqdm(data_list):
        assert len(id_data) == 1440
        encoded_list = [
            GPencode.encode(
                unixepoch_from_str(row["time"]),
                TIME_START,
                TIME_END,
                row["lat"],
                row["long"],
                theta_t=1440,
                theta_l=10
            )
            for _, row in id_data.iterrows()
        ]
        value = { "geodata": encoded_list, "query_size": len(id_data), "query_id": current_id }
        total_data_list.append(value)
        current_id += 1
        if current_id == client_limit:
            break

    json_data = cl.OrderedDict()
    json_data["data"] = total_data_list
    json_data["client_size"] = current_id
    print("client size", current_id)

    now_timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    filename = './data/geohash10-client-%s-real-for-1minute-%s.json' % (str(client_limit), now_timestamp)
    with open(filename, 'w') as f:
        json.dump(json_data, f, indent=None)

In [None]:
# Encode and dump the client query data, stopping after 3000 clients.
encode_all_data_and_dump_json(data_list, client_limit=3000)

In [None]:
## Unique check (TOKYO)

In [None]:
# Load the index-0 slice (every 10th row) of all Tokyo source files.
# Frames are collected in a list and concatenated once at the end, instead
# of re-copying the growing frame with pd.concat on every file.
frames = []
for index in range(1):
    for i in tqdm(range(24)):
        files = glob.glob("/Users/fumiyuki/Downloads/time-tokyo-%s/*" % str(i+1))
        for file_name in files:
            print(file_name)
            time_tokyo = pd.read_csv(file_name, header=None)
            time_tokyo = time_tokyo.query('index %% 10 == %s' % index)
            time_tokyo = time_tokyo.iloc[:, 3:6]
            time_tokyo.columns = ["time", "long", "lat"]
            frames.append(time_tokyo)
# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Fraction of rows that remain after removing exact duplicates
# (1.0 means every row is unique).
dropped_data = data.drop_duplicates()
unique_ratio = len(dropped_data) / len(data)
unique_ratio

In [None]:
## Unique check (KINKI)

In [None]:
# Load the index-0 slice (every 10th row) of all Kinki source files.
# Frames are collected in a list and concatenated once at the end, instead
# of re-copying the growing frame with pd.concat on every file.
frames = []
for index in range(1):
    for i in tqdm(range(24)):
        files = glob.glob("/Users/fumiyuki/Downloads/time-kinki-%s/*" % str(i+1))
        for file_name in files:
            print(file_name)
            time_kinki = pd.read_csv(file_name, header=None)
            time_kinki = time_kinki.query('index %% 10 == %s' % index)
            time_kinki = time_kinki.iloc[:, 3:6]
            time_kinki.columns = ["time", "long", "lat"]
            frames.append(time_kinki)
# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Fraction of rows that remain after removing exact duplicates
# (1.0 means every row is unique).
dropped_data = data.drop_duplicates()
unique_ratio = len(dropped_data) / len(data)
unique_ratio