# Porto taxi dataset

In [1]:
from datetime import datetime

import numpy as np
import pandas as pd

# Load only the two columns used below: the trip start time (Unix seconds)
# and the GPS trace of the trip.
raw_dataset = pd.read_csv(
    "assets/taxi_porto.csv",
    usecols=["TIMESTAMP", "POLYLINE"],
)

# 15-minute bins spanning the first to the last trip start.
# Series.min()/max() are vectorized; the built-in min()/max() would iterate
# over every row in Python.
# NOTE: datetime.fromtimestamp() converts to the *local* timezone of the
# machine running the notebook, so the bin labels depend on where this runs.
index = pd.date_range(
    start=datetime.fromtimestamp(int(raw_dataset["TIMESTAMP"].min())),
    end=datetime.fromtimestamp(int(raw_dataset["TIMESTAMP"].max())),
    freq="15min",
)

In [2]:
# Discard every row that has a missing value in any of the loaded columns.
raw_dataset = raw_dataset.dropna(axis=0, how="any")

In [3]:
# Preview the first five rows of the cleaned data.
raw_dataset.head(n=5)

Unnamed: 0,TIMESTAMP,POLYLINE
0,1372636858,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [4]:
from math import floor

# Grid resolution: the bounding box is split into N_ROWS x N_COLS cells.
N_ROWS, N_COLS = 20, 20
N_CELLS = N_ROWS * N_COLS

# Longitude / latitude bounds of the area that is discretised.
LON_MIN, LON_MAX = -8.6338, -8.5862
LAT_MIN, LAT_MAX = 41.1369, 41.1690
BBox = (LON_MIN, LON_MAX, LAT_MIN, LAT_MAX)

# Extent of the bounding box along each axis.
LON_DIFF = LON_MAX - LON_MIN
LAT_DIFF = LAT_MAX - LAT_MIN

# Size of a single grid cell along each axis.
CELL_LAT = LAT_DIFF / N_ROWS
CELL_LON = LON_DIFF / N_COLS

def get_cell(lat: float, lon: float) -> int:
    """Return the flat (row-major) grid-cell id containing the given point.

    The row is derived from the latitude offset from ``LAT_MIN`` and the
    column from the longitude offset from ``LON_MIN``; the id is
    ``row * N_COLS + col``. No bounds check is performed here, so callers
    are expected to pass points that lie inside the bounding box.
    """
    # floor(a / b) is kept on purpose: float floor-division (a // b) can
    # round differently for some inputs.
    row_idx = floor((lat - LAT_MIN) / CELL_LAT)
    col_idx = floor((lon - LON_MIN) / CELL_LON)
    return row_idx * N_COLS + col_idx

In [5]:
import json

# Each POLYLINE value is a JSON array of [lon, lat] pairs stored as text.
# json.loads parses it without executing file contents as Python code
# (which eval would do) and avoids eval's per-row compilation overhead.
gps = raw_dataset["POLYLINE"].apply(json.loads).dropna()
gps = gps.rename("gps")

KeyboardInterrupt: 

In [None]:
# Pair every trip's start timestamp with its parsed GPS trace; the two
# Series are aligned on the row index of raw_dataset.
gps_dataset = pd.DataFrame(
    {
        "timestamp": raw_dataset["TIMESTAMP"],
        "gps": gps,
    }
)

In [None]:
# One row per time bin of `index`, one zero-initialised counter per grid cell.
df = pd.DataFrame(
    np.zeros(shape=(len(index), N_CELLS)),
    index=index,
    columns=["cell_" + str(cell_id) for cell_id in range(N_CELLS)],
)

In [None]:
# Count, for every 15-minute bin and grid cell, how many GPS samples fall
# into it. Row 0 of `df` corresponds to the earliest trip start.
base_ts = int(raw_dataset["TIMESTAMP"].min())
n_bins = len(df)

# zip over the two columns instead of iterrows(): no per-row Series is built.
for first_ts, trajectory in zip(gps_dataset["timestamp"], gps_dataset["gps"]):
    # Rows without a parsed trace (e.g. NaN after alignment) are skipped.
    if not isinstance(trajectory, list):
        continue

    for j, (lon, lat) in enumerate(trajectory):
        # Only points strictly inside the bounding box map to a valid cell.
        if not (LON_MIN < lon < LON_MAX and LAT_MIN < lat < LAT_MAX):
            continue

        cell = get_cell(lat=lat, lon=lon)
        # The j-th sample is taken to be j * 15 seconds after the trip start.
        ts = first_ts + j * 15

        # 60 * 15 seconds = one 15-minute bin.
        row = floor((ts - base_ts) / (60 * 15))
        # `index` ends at the last trip *start*, so samples recorded late in
        # the final trips can fall past the last bin; skip them instead of
        # raising IndexError.
        if row >= n_bins:
            continue

        # Single positional write. The previous `df.iloc[row][cell] += 1`
        # was chained assignment: it may update a temporary copy (always,
        # under Copy-on-Write) and relied on positional fallback for an
        # integer key on a label-indexed Series.
        df.iat[row, cell] += 1

In [None]:
# Preview the first five time bins of the per-cell counts.
df.head(n=5)

In [None]:
# Persist the per-cell counts; the time index is written as the first column.
df.to_csv(path_or_buf="assets/porto_taxi_cells.csv")

### Assignment to nodes

In [31]:
from pathlib import Path

# Network size parameters; both are embedded in the folder names below.
n_nodes, k = 20, 19
# Number of simulated networks (sub-folders "0" .. "n_simulations - 1").
n_simulations = 10

# Where the per-simulation tower layouts are read from ...
network_path = Path(f"data/networks/porto_{n_nodes}n_{k}k")
# ... and where the generated per-node datasets are written to.
output_path = Path(f"data/datasets/porto_{n_nodes}n_{k}k")

In [32]:
from category_encoders import BinaryEncoder
import pandas as pd

# Reload the per-cell counts; the first column becomes a parsed datetime index.
df = pd.read_csv(
    "assets/porto_taxi_cells.csv",
    parse_dates=True,
    index_col=0,
)

def encode_time(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with binary-encoded hour and weekday features.

    ``hour`` and ``weekday`` are read from the frame's index (which must
    expose ``.hour`` and ``.weekday``, e.g. a ``DatetimeIndex``) and then
    passed through ``BinaryEncoder``.

    The input frame is left untouched: the helper columns are added with
    ``assign`` on a new frame rather than written into the caller's object.
    """
    with_time = df.assign(hour=df.index.hour, weekday=df.index.weekday)
    return BinaryEncoder(cols=["hour", "weekday"]).fit_transform(with_time)

In [33]:
# For every simulated network: assign each grid cell to its nearest tower,
# sum the per-cell counts into per-tower request series, cap them at each
# tower's 80th percentile, add time features and write one parquet per node.
for n in range(n_simulations):
    dataset_path = output_path / str(n)
    dataset_path.mkdir(exist_ok=True, parents=True)

    towers = pd.read_csv(network_path / str(n) / "towers.csv")
    # (lat, lon) of every tower in file order, extracted once instead of
    # calling towers.iterrows() for each of the grid cells. The position in
    # this list is the tower id used below.
    tower_coords = list(zip(towers["lat"], towers["lon"]))

    tower_cells = [[] for _ in range(n_nodes)]

    for i in range(N_ROWS):
        for j in range(N_COLS):
            # Row-major id, matching get_cell (row * N_COLS + col). The
            # previous `i * N_ROWS + j` was only correct for a square grid.
            cell = i * N_COLS + j

            lat_begin = LAT_MIN + i * CELL_LAT
            lon_begin = LON_MIN + j * CELL_LON

            lat_center = lat_begin + CELL_LAT / 2
            lon_center = lon_begin + CELL_LON / 2

            # Nearest tower by squared distance in raw lat/lon units
            # (first tower wins on ties).
            min_dist = None
            nearest_tower = None
            for n_tower, (tower_lat, tower_lon) in enumerate(tower_coords):
                dist = (lat_center - tower_lat) ** 2 + (lon_center - tower_lon) ** 2
                if min_dist is None or min_dist > dist:
                    min_dist = dist
                    nearest_tower = n_tower

            tower_cells[nearest_tower].append(cell)

    # Built on df.index (not the global `index`) so that the additions below
    # are guaranteed to align with the cell counts loaded from CSV.
    tower_calls = pd.DataFrame(
        np.zeros((len(df.index), n_nodes)),
        columns=[f"tower_{i}" for i in range(n_nodes)],
        index=df.index,
    )

    for i, cells in enumerate(tower_cells):
        ind = f"tower_{i}"
        for cell in cells:
            tower_calls[ind] += df[f"cell_{cell}"]

        # Disabled alternative: ratio between consecutive time steps.
        # tower_calls[ind] += 1
        # tmp_ind = tower_calls[ind][1:].index
        # tower_calls[ind] = pd.Series(
        #     tower_calls[ind][1:].to_numpy() / tower_calls[ind][:-1].to_numpy(),
        #     index=tmp_ind,
        # )

    # Cap every tower's series at its own 80th percentile.
    quantiles = tower_calls.quantile(0.8)
    tower_calls = tower_calls.clip(0, quantiles, axis=1)

    # Disabled alternative: global max scaling.
    # max_value = tower_calls.max().max()
    # if max_value > 0:
    #     tower_calls = tower_calls / max_value
    # else:
    #     print("Max value is 0!!")

    node_datasets = [
        encode_time(pd.DataFrame({"requests": tower_calls[col]}))
        for col in tower_calls.columns
    ]

    for i, ds in enumerate(node_datasets):
        ds = ds.dropna()
        ds.to_parquet(dataset_path / f"node_{i}_notscaled.parquet")
    

  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  retu

In [34]:
from utils.data import prepare_dataset_for_training

ds_name = "4in_notscaled"

# Nodes that only receive the tail of their series (see the slicing below).
# Loop-invariant, so it is defined once.
small_ds_nodes = list(range(10))

for n in range(n_simulations):
    towers_datasets = [
        pd.read_parquet(output_path / str(n) / f"node_{i}_notscaled.parquet")
        for i in range(n_nodes)
    ]

    # Cut-off at 80% of node 0's series. The label is taken from that
    # dataset's own index, so the position and the label always refer to the
    # same frame (the global `index` may differ in length after dropna()).
    start_i = round(len(towers_datasets[0]) * 0.8)
    start_ind = towers_datasets[0].index[start_i]

    # "Small" nodes keep only the rows from the cut-off label onwards;
    # every other node keeps its full history.
    towers_datasets = [
        ts.loc[start_ind:]
        if i in small_ds_nodes
        else ts
        for i, ts in enumerate(towers_datasets)
    ]

    # Disabled alternative: scale by the global maximum.
    # max_value = max(*[ts['requests'].max() for ts in towers_datasets])
    # for i in range(len(towers_datasets)):
    #     towers_datasets[i]["requests"] /= max_value
    #
    # folder = output_path / str(n) / ds_name
    # folder.mkdir(parents=True, exist_ok=True)
    # with (folder / "scaling_factor.txt").open("w") as f:
    #     f.write(str(max_value))

    prepare_dataset_for_training(
        towers_datasets=towers_datasets,
        output_folder=output_path / str(n) / ds_name,
        input_timesteps=4,
        output_timesteps=1,
        n_functions=1,
        # NOTE(review): presumably the binary-encoded hour/weekday columns
        # produced by encode_time; confirm against prepare_dataset_for_training.
        n_auxiliary_features=8,
    )

Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
