In [1]:
import datetime
import os
import pickle

import numpy as np
import pandas as pd
import pandas as pd
from tqdm import tqdm

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
PATH = "./train_data.csv"

In [3]:
# Load the raw records from the CSV at PATH into a DataFrame.
df = pd.read_csv(PATH)

In [4]:
# Keep only rows strictly inside a latitude/longitude bounding box
# (presumably the Saint Petersburg area -- TODO confirm with the data owner).
lat_inside = (df.lat > 59.831191) & (df.lat < 60.039332)
lon_inside = (df.lon > 30.142969) & (df.lon < 30.515771)
df = df[lat_inside & lon_inside]

In [5]:
def get_all_timestamps(min_v: int, max_v: int, name: str = "timestamp") -> pd.DataFrame:
    """Build a one-column frame of hourly timestamps covering [min_v, max_v].

    Args:
        min_v: First timestamp, in seconds since the epoch. Always included.
        max_v: Upper bound, in seconds since the epoch. Included only when it
            lies a whole number of hours after ``min_v``.
        name: Name of the single column in the returned frame.

    Returns:
        A DataFrame whose ``name`` column holds ``min_v, min_v + 3600, ...`` as
        floats, never exceeding ``max_v``.

    Raises:
        ValueError: If ``max_v`` is smaller than ``min_v``.
    """
    if max_v < min_v:
        raise ValueError(f"max_v ({max_v}) must not be smaller than min_v ({min_v})")

    step_seconds = datetime.timedelta(hours=1).total_seconds()
    # Count whole hourly steps up front instead of looping until an exact
    # equality with max_v: that loop never ends when the span is not a multiple
    # of one hour. Working directly in epoch seconds also keeps the spacing at
    # exactly one hour regardless of the local timezone's DST rules.
    n_steps = int((max_v - min_v) // step_seconds)
    start = float(min_v)
    values = [start + i * step_seconds for i in range(n_steps + 1)]
    return pd.DataFrame({name: values})

In [6]:
# Hourly grid spanning the earliest to the latest timestamp present in df.
all_timestamps = get_all_timestamps(
    min_v=df["timestamp"].min(),
    max_v=df["timestamp"].max(),
)

In [7]:
# Per-point hourly series, split by calendar year: 2019 -> train, 2020 -> val.
# The five lists below are index-aligned (entry i of each describes the same point).
train_points = []
val_points = []
train_groups = []
val_groups = []
n_records = []  # per point: number of 2019 timestamps with at least one row
for point, g in tqdm(df.groupby("point")):
    # Rows per timestamp for this point. size() is the built-in equivalent of
    # apply(len) and avoids a Python-level call for every group.
    g = g.groupby("timestamp").size().to_frame("n_pubs").reset_index(drop=False)
    # Outer-merge with the full hourly grid so timestamps without rows appear too,
    # then treat those missing counts as zero.
    g = g.merge(all_timestamps, how="outer", on="timestamp")
    g = g.fillna(0)
    # NOTE: fromtimestamp() without tz uses the machine's local timezone, so the
    # 2019/2020 boundary depends on where the notebook is run.
    year = g["timestamp"].map(lambda ts: datetime.datetime.fromtimestamp(ts).year)
    g_train = g[year == 2019].set_index("timestamp").sort_index()["n_pubs"]
    g_val = g[year == 2020].set_index("timestamp").sort_index()["n_pubs"]
    if len(g_train) == 0:
        # Nothing falls in 2019 for this point: skip it in all five lists.
        continue
    train_points.append(point)
    val_points.append(point)
    train_groups.append(g_train)
    val_groups.append(g_val)
    n_records.append((g_train > 0).sum())

100%|██████████| 6658/6658 [01:45<00:00, 63.31it/s]


In [12]:
# Summary statistics of the per-point count of 2019 timestamps with at least one record.
pd.Series(n_records).describe()

count    6658.000000
mean      490.449384
std      1107.006545
min         0.000000
25%        15.000000
50%        72.000000
75%       339.000000
max      8374.000000
dtype: float64

In [13]:
# Keep points with at least 339 non-empty 2019 timestamps; 339 is the 75% value
# reported by the describe() output above.
has_enough_records = np.array(n_records) >= 339
selected_idxes = np.where(has_enough_records)[0]

In [14]:
# Restrict the train lists to the selected positions.
# NOTE: this overwrites train_points/train_groups in place, so re-running the
# cell would index into the already-filtered lists; re-run from the loop cell.
train_points, train_groups = (
    [items[idx] for idx in selected_idxes]
    for items in (train_points, train_groups)
)

In [15]:
# Positions in val_points whose point survived the train-side selection.
# A set makes each membership test O(1) instead of scanning the train list
# once per validation point.
train_point_set = set(train_points)
selected_idxes = [i for i, val_point in enumerate(val_points) if val_point in train_point_set]

In [17]:
# Restrict the validation lists to the selected positions.
# NOTE: this overwrites val_points/val_groups in place, so re-running the cell
# would index into the already-filtered lists; re-run from the loop cell.
val_points, val_groups = (
    [items[idx] for idx in selected_idxes]
    for items in (val_points, val_groups)
)

In [25]:
# Bundle the selected points and their hourly series for both splits.
data = dict(
    train=dict(points=train_points, groups=train_groups),
    val=dict(points=val_points, groups=val_groups),
)

In [26]:
# Persist the selected data. The output directory is created first so the cell
# also works on a fresh checkout where "data/" does not exist yet.
OUTPUT_PATH = "data/selected_data.pickle"
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(data, f)