# Real data
- From [**Joras**](https://joras.csis.u-tokyo.ac.jp/)
    - You need to be authenticated to download the datasets.
- Tokyo
    - https://joras.csis.u-tokyo.ac.jp/dataset/show/page/2/id/3000200800#datalist
    - 2008 Tokyo, ID based csv #01 - #30
    - Download to `./download/`
    
- Kinki
    - https://joras.csis.u-tokyo.ac.jp/dataset/show/page/2/id/3038201000#datalist
    - 2010 Kinki Metropolitan Area, spatially reallocated, weekday, ID based csv #01 - #09
    - ~~2010 Kinki Metropolitan Area, spatially reallocated, holiday, ID based csv #01 - #07~~
    - Download to `./download/`

### 0. Extract necessary data

In [1]:
import collections as cl
import datetime
import glob
import json
import os
import pickle
import random
import re
import shutil

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

#### Tokyo

One day of data for 587531 unique IDs.

```
id_files = glob.glob("./download/p-csv*/*/*")
len(list(id_files)) => 587531
```

...


Exception:  
files that do not contain exactly 1440 records (after removing duplicate rows) are skipped.

In [2]:
# Number of accepted trajectories concatenated into each output CSV.
batch_size = 1
# Accepted trajectories numbered above server_data_start are treated as server
# data; server_data_size is how many of them are pooled.
server_data_start = 50000
server_data_size = 14000

# Accepted time window, in seconds since 1970-01-01 00:00:00 (timestamps are
# used as read from the files, without any timezone conversion).
start = 1222819200             # 2008-10-01 00:00:00
end = start + (1440 - 1) * 60  # 2008-10-01 23:59:00, i.e. 1222905540

def process_tokyo(batch_size, start, end):
    """Extract complete one-day Tokyo trajectories into numbered CSV files.

    Every file matching ``./download/p-csv*/*/*`` is read in ascending ID
    order (IDs come from ``extract_id``). Columns 3-5 of each file are kept
    and named ``time``, ``long`` and ``lat``; duplicate rows are dropped. A
    file is accepted only if exactly 1440 rows remain, its first timestamp is
    not before ``start`` and its last timestamp is not after ``end``.

    Accepted trajectories are numbered 1, 2, ... and every ``batch_size`` of
    them are concatenated and written, without header or index, to
    ``./real_data/tokyo/<n>.csv``. Processing stops once
    ``server_data_start + server_data_size`` trajectories have been accepted
    (both values are read from module-level globals).

    Args:
        batch_size: Number of accepted trajectories per output file.
        start: Earliest allowed first timestamp, in seconds since
            1970-01-01 00:00:00.
        end: Latest allowed last timestamp, in the same unit.
    """
    # Create the output directory up front so to_csv cannot fail on a fresh
    # checkout.
    os.makedirs('./real_data/tokyo', exist_ok=True)

    def write_batch(frames, batch_no):
        batch_df = pd.concat(frames, axis=0).reset_index(drop=True)
        # mode='w': re-running the extraction must replace earlier output
        # rather than append a second copy of the same rows.
        batch_df.to_csv('./real_data/tokyo/%d.csv' % batch_no, mode='w', index=False, header=False)

    id_files = sorted(glob.glob("./download/p-csv*/*/*"), key=extract_id)

    client_id = 1  # number that the next accepted trajectory will receive
    batch = []
    for filename in tqdm(id_files):
        if client_id > server_data_start + server_data_size:
            break

        id_tokyo_df = pd.read_csv(filename, header=None)
        id_tokyo_df = id_tokyo_df.iloc[:, 3:6]
        id_tokyo_df.columns = ["time", "long", "lat"]
        id_tokyo_df = id_tokyo_df.drop_duplicates().reset_index(drop=True)
        # Reject incomplete days before paying for timestamp parsing.
        if len(id_tokyo_df) != 1440:
            continue

        dates = pd.to_datetime(id_tokyo_df['time'])
        id_tokyo_df['time'] = (dates - pd.Timestamp("1970-01-01 00:00:00")) // pd.Timedelta('1s')
        if id_tokyo_df['time'].iloc[0] < start or end < id_tokyo_df['time'].iloc[-1]:
            continue
        batch.append(id_tokyo_df)

        if client_id % batch_size == 0:
            write_batch(batch, client_id // batch_size)
            batch = []

        client_id += 1

    # With batch_size > 1 the last batch may be only partly filled; write it
    # to the next file number instead of silently discarding it.
    if batch:
        write_batch(batch, (client_id - 1) // batch_size + 1)
    
def extract_id(filename):
    """Return the integer ID encoded as ``/<digits>.csv`` in a file path.

    Args:
        filename: Path containing exactly one ``/<digits>.csv`` component,
            e.g. ``./real_data/tokyo/42.csv``.

    Returns:
        The digits as an ``int`` (leading zeros are dropped).

    Raises:
        AssertionError: If the path does not contain exactly one
            ``/<digits>.csv`` component.
    """
    # The dot is escaped so that only a literal ".csv" suffix counts; an
    # unescaped "." would also match directory names such as "/12_csv".
    match_result = re.findall(r"/(\d+)\.csv", filename)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if len(match_result) != 1:
        raise AssertionError(
            f"expected exactly one '/<digits>.csv' component in {filename!r}, "
            f"found {len(match_result)}"
        )
    return int(match_result[0])

def make_tokyo_server_data(server_data_size):
    """Pool server-side Tokyo trajectories into a single CSV file.

    Files matching ``./real_data/tokyo/*.csv`` are visited in ascending ID
    order. Those whose ID is greater than the module-level
    ``server_data_start`` are read (no header row) until ``server_data_size``
    of them have been collected. The collected frames are concatenated and
    written, without header or index, to
    ``./real_data/tokyo/gen/server-<server_data_size>.csv``.

    Args:
        server_data_size: Maximum number of trajectory files to pool.

    Raises:
        ValueError: If no file qualifies as server data.
    """
    id_files = sorted(glob.glob("./real_data/tokyo/*.csv"), key=extract_id)
    server_df_list = []
    for filename in tqdm(id_files):
        # Stop at exactly server_data_size files (a `size < count` test would
        # let one extra file through).
        if len(server_df_list) >= server_data_size:
            break
        if extract_id(filename) <= server_data_start:
            continue
        server_df_list.append(pd.read_csv(filename, header=None))

    if not server_df_list:
        raise ValueError(
            "no server trajectories found in ./real_data/tokyo/ "
            f"with id > {server_data_start}"
        )

    os.makedirs('./real_data/tokyo/gen', exist_ok=True)
    server_df = pd.concat(server_df_list, axis=0).reset_index(drop=True)
    # mode='w': a re-run replaces the file instead of appending duplicate rows.
    server_df.to_csv('./real_data/tokyo/gen/server-%d.csv' % (server_data_size), mode='w', index=False, header=False)
    
    
def make_tokyo_client_data():
    """Move client-side Tokyo trajectories into the ``gen`` directory.

    Files matching ``./real_data/tokyo/*.csv`` are visited in ascending ID
    order. Each file whose ID is at most the module-level
    ``server_data_start`` is moved to
    ``./real_data/tokyo/gen/client-<id>-tokyo.csv``. Files with larger IDs
    are left in place (``make_tokyo_server_data`` reads exactly those).
    """
    gen_dir = './real_data/tokyo/gen'
    os.makedirs(gen_dir, exist_ok=True)

    id_files = sorted(glob.glob("./real_data/tokyo/*.csv"), key=extract_id)
    for filename in tqdm(id_files):
        user_id = extract_id(filename)
        # `>` rather than `>=`: the server set only takes IDs strictly above
        # server_data_start, so the boundary ID belongs to the clients;
        # skipping it here would leave it in neither set.
        if user_id > server_data_start:
            # Files are sorted by ID, so everything from here on is server data.
            break
        shutil.move(filename, os.path.join(gen_dir, f'client-{user_id}-tokyo.csv'))


In [3]:
# Filter the downloaded Tokyo files and write the accepted trajectories to ./real_data/tokyo/.
process_tokyo(batch_size, start, end)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm(id_files):


  0%|          | 0/587531 [00:00<?, ?it/s]

In [4]:
# Concatenate the server-side Tokyo trajectories into one CSV under ./real_data/tokyo/gen/.
make_tokyo_server_data(server_data_size)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm(id_files):


  0%|          | 0/64000 [00:00<?, ?it/s]

In [5]:
# Move the client-side Tokyo trajectories to ./real_data/tokyo/gen/client-<id>-tokyo.csv.
make_tokyo_client_data()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm(id_files):


  0%|          | 0/64000 [00:00<?, ?it/s]

#### Kinki

One day of data for 808794 unique IDs.

```
id_files = glob.glob("./download/00*/*.csv")
len(list(id_files)) => 808794
```

...


Exception:  
files that do not contain exactly 1440 records (after removing duplicate rows) are skipped.

In [6]:
# Number of accepted trajectories concatenated into each output CSV.
batch_size = 1
# Accepted trajectories numbered above server_data_start are treated as server
# data; server_data_size is how many of them are pooled.
server_data_start = 50000
server_data_size = 14000

# Accepted time window, in seconds since 1970-01-01 00:00:00 (timestamps are
# used as read from the files, without any timezone conversion).
start = 1285891200             # 2010-10-01 00:00:00
end = start + (1440 - 1) * 60  # 2010-10-01 23:59:00, i.e. 1285977540

def process_kinki(batch_size, start, end):
    """Extract complete one-day Kinki trajectories into numbered CSV files.

    Every file matching ``./download/00*/*.csv`` is read in ascending ID
    order (IDs come from ``extract_id``). Columns 3-5 of each file are kept
    and named ``time``, ``long`` and ``lat``; duplicate rows are dropped. A
    file is accepted only if exactly 1440 rows remain, its first timestamp is
    not before ``start`` and its last timestamp is not after ``end``.

    Accepted trajectories are numbered 1, 2, ... and every ``batch_size`` of
    them are concatenated and written, without header or index, to
    ``./real_data/kinki/<n>.csv``. Processing stops once
    ``server_data_start + server_data_size`` trajectories have been accepted
    (both values are read from module-level globals).

    Args:
        batch_size: Number of accepted trajectories per output file.
        start: Earliest allowed first timestamp, in seconds since
            1970-01-01 00:00:00.
        end: Latest allowed last timestamp, in the same unit.
    """
    # Create the output directory up front so to_csv cannot fail on a fresh
    # checkout.
    os.makedirs('./real_data/kinki', exist_ok=True)

    def write_batch(frames, batch_no):
        batch_df = pd.concat(frames, axis=0).reset_index(drop=True)
        # mode='w': re-running the extraction must replace earlier output
        # rather than append a second copy of the same rows.
        batch_df.to_csv('./real_data/kinki/%d.csv' % batch_no, mode='w', index=False, header=False)

    id_files = sorted(glob.glob("./download/00*/*.csv"), key=extract_id)

    client_id = 1  # number that the next accepted trajectory will receive
    batch = []
    for filename in tqdm(id_files):
        if client_id > server_data_start + server_data_size:
            break

        id_kinki_df = pd.read_csv(filename, header=None)
        id_kinki_df = id_kinki_df.iloc[:, 3:6]
        id_kinki_df.columns = ["time", "long", "lat"]
        id_kinki_df = id_kinki_df.drop_duplicates().reset_index(drop=True)
        # Reject incomplete days before paying for timestamp parsing.
        if len(id_kinki_df) != 1440:
            continue

        dates = pd.to_datetime(id_kinki_df['time'])
        id_kinki_df['time'] = (dates - pd.Timestamp("1970-01-01 00:00:00")) // pd.Timedelta('1s')
        if id_kinki_df['time'].iloc[0] < start or end < id_kinki_df['time'].iloc[-1]:
            continue
        batch.append(id_kinki_df)

        if client_id % batch_size == 0:
            write_batch(batch, client_id // batch_size)
            batch = []

        client_id += 1

    # With batch_size > 1 the last batch may be only partly filled; write it
    # to the next file number instead of silently discarding it.
    if batch:
        write_batch(batch, (client_id - 1) // batch_size + 1)
    
def extract_id(filename):
    """Return the integer ID encoded as ``/<digits>.csv`` in a file path.

    Args:
        filename: Path containing exactly one ``/<digits>.csv`` component,
            e.g. ``./real_data/kinki/42.csv``.

    Returns:
        The digits as an ``int`` (leading zeros are dropped).

    Raises:
        AssertionError: If the path does not contain exactly one
            ``/<digits>.csv`` component.
    """
    # The dot is escaped so that only a literal ".csv" suffix counts; an
    # unescaped "." would also match directory names such as "/12_csv".
    match_result = re.findall(r"/(\d+)\.csv", filename)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if len(match_result) != 1:
        raise AssertionError(
            f"expected exactly one '/<digits>.csv' component in {filename!r}, "
            f"found {len(match_result)}"
        )
    return int(match_result[0])

def make_kinki_server_data(server_data_size):
    """Pool server-side Kinki trajectories into a single CSV file.

    Files matching ``./real_data/kinki/*.csv`` are visited in ascending ID
    order. Those whose ID is greater than the module-level
    ``server_data_start`` are read (no header row) until ``server_data_size``
    of them have been collected. The collected frames are concatenated and
    written, without header or index, to
    ``./real_data/kinki/gen/server-<server_data_size>.csv``.

    Args:
        server_data_size: Maximum number of trajectory files to pool.

    Raises:
        ValueError: If no file qualifies as server data.
    """
    id_files = sorted(glob.glob("./real_data/kinki/*.csv"), key=extract_id)
    server_df_list = []
    for filename in tqdm(id_files):
        # Stop at exactly server_data_size files (a `size < count` test would
        # let one extra file through).
        if len(server_df_list) >= server_data_size:
            break
        if extract_id(filename) <= server_data_start:
            continue
        server_df_list.append(pd.read_csv(filename, header=None))

    if not server_df_list:
        raise ValueError(
            "no server trajectories found in ./real_data/kinki/ "
            f"with id > {server_data_start}"
        )

    os.makedirs('./real_data/kinki/gen', exist_ok=True)
    server_df = pd.concat(server_df_list, axis=0).reset_index(drop=True)
    # mode='w': a re-run replaces the file instead of appending duplicate rows.
    server_df.to_csv('./real_data/kinki/gen/server-%d.csv' % (server_data_size), mode='w', index=False, header=False)
    
    
def make_kinki_client_data():
    """Move client-side Kinki trajectories into the ``gen`` directory.

    Files matching ``./real_data/kinki/*.csv`` are visited in ascending ID
    order. Each file whose ID is at most the module-level
    ``server_data_start`` is moved to
    ``./real_data/kinki/gen/client-<id>-kinki.csv``. Files with larger IDs
    are left in place (``make_kinki_server_data`` reads exactly those).
    """
    gen_dir = './real_data/kinki/gen'
    os.makedirs(gen_dir, exist_ok=True)

    id_files = sorted(glob.glob("./real_data/kinki/*.csv"), key=extract_id)
    for filename in tqdm(id_files):
        user_id = extract_id(filename)
        # `>` rather than `>=`: the server set only takes IDs strictly above
        # server_data_start, so the boundary ID belongs to the clients;
        # skipping it here would leave it in neither set.
        if user_id > server_data_start:
            # Files are sorted by ID, so everything from here on is server data.
            break
        shutil.move(filename, os.path.join(gen_dir, f'client-{user_id}-kinki.csv'))
        

In [7]:
# Filter the downloaded Kinki files and write the accepted trajectories to ./real_data/kinki/.
process_kinki(batch_size, start, end)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm(id_files):


  0%|          | 0/808794 [00:00<?, ?it/s]

In [8]:
# Concatenate the server-side Kinki trajectories into one CSV under ./real_data/kinki/gen/.
make_kinki_server_data(server_data_size)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm(id_files):


  0%|          | 0/64000 [00:00<?, ?it/s]

In [9]:
# Move the client-side Kinki trajectories to ./real_data/kinki/gen/client-<id>-kinki.csv.
make_kinki_client_data()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm(id_files):


  0%|          | 0/64000 [00:00<?, ?it/s]