In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training table from CSV and show summary statistics of its
# numeric columns (point, timestamp and count in the output below).
df = pd.read_csv("./data/simple_preprocessed.csv")
df.describe()

Unnamed: 0,point,timestamp,count
count,3550113.0,3550113.0,3550113.0
mean,3498.905,5001.823,2.33486
std,1953.676,2688.499,4.190819
min,1.0,0.0,1.0
25%,1869.0,2708.0,1.0
50%,3575.0,5265.0,1.0
75%,5159.0,7275.0,2.0
max,7036.0,9503.0,600.0


In [3]:
# Counts are stored as 16-bit integers; the same dtype is reused for the
# validation counts and for the dense per-point arrays built later.
count_type = np.int16

# astype() to a narrower integer dtype wraps around silently on overflow,
# so check the value range before casting instead of corrupting counts.
assert df["count"].between(np.iinfo(count_type).min, np.iinfo(count_type).max).all(), (
    f"'count' contains values outside the {np.dtype(count_type).name} range"
)
df["count"] = df["count"].astype(count_type)

In [4]:
# Reference instant: the smallest timestamp found in the preprocessed data.
start_hour = pd.read_csv("./data/preprocessed.csv")["timestamp"].min()
# Mapping table joined below on its "initial_point" column.
points_df = pd.read_csv("./data/points_df.csv")

# Build the validation frame in one chain; each step creates a new frame.
test_df = (
    pd.read_csv("./data/source/valid.csv")
    .drop(columns=["lat", "lon", "error"])
    # Shift by start_hour, divide by 60 * 60 and truncate to an integer
    # (presumably seconds -> whole hours since start; confirm against the data).
    .assign(hour=lambda frame: ((frame["hour"] - start_hour) / (60 * 60)).astype(int))
    # Inner join: rows whose point has no match in points_df are dropped.
    .merge(points_df, left_on="point", right_on="initial_point")
    .drop(columns=["point", "initial_point"])
    .rename(columns={"hour": "timestamp", "new_point": "point", "sum": "count"})
)
test_df["count"] = test_df["count"].astype(count_type)

# One frame per point, without the now redundant "point" column.
test_data_per_point = {
    point_id: rows.drop(columns=["point"])
    for point_id, rows in test_df.groupby("point")
}
print(len(test_data_per_point))
test_data_per_point[917]

155


Unnamed: 0,timestamp,count
0,10041,5
1,9996,5
2,9563,6
3,9884,8
4,9778,11
5,9824,7
6,9758,6
7,9523,7


In [5]:
# Summary statistics of the prepared validation frame (timestamp, count, point).
test_df.describe()

Unnamed: 0,timestamp,count,point
count,709.0,709.0,709.0
mean,9856.311707,9.880113,3558.348378
std,199.684837,6.407844,1923.325208
min,9513.0,5.0,0.0
25%,9687.0,6.0,2119.0
50%,9860.0,7.0,3918.0
75%,10029.0,11.0,5238.0
max,10198.0,40.0,6874.0


In [6]:
# Restrict the training rows to points that also occur in the validation data,
# then split them into one frame per point without the "point" column.
data_per_point = {
    point_id: rows.drop(columns=["point"])
    for point_id, rows in df[df["point"].isin(list(test_data_per_point))].groupby("point")
}
len(data_per_point)

151

In [7]:
# Inspect the training rows (timestamp, count) kept for one example point.
data_per_point[917]

Unnamed: 0,timestamp,count
412096,1,1
412097,2,1
412098,5,1
412099,8,1
412100,10,2
...,...,...
418614,9499,11
418615,9500,4
418616,9501,5
418617,9502,1


In [8]:
# Length of the dense hourly timeline. transform_to_full_time writes each count
# at index == timestamp, so the array has to reach the largest timestamp; using
# max + 1 stays correct even if the smallest timestamp is not 0 (the previous
# ptp() + 1 only matched because min == 0). It also avoids ndarray.ptp(), which
# was removed in NumPy 2.0.
total_hours = int(df["timestamp"].max()) + 1
print(total_hours)
def transform_to_full_time(point_df, n_hours=None, dtype=None):
    """
    Transform a sparse dataframe with the number of posts at exact hours into
    a dense array with hours as indices.

    Parameters
    ----------
    point_df : DataFrame whose first column holds integer hour indices and
        whose second column holds the count for that hour.
    n_hours : length of the returned array; defaults to the notebook-level
        ``total_hours``.
    dtype : dtype of the returned array; defaults to the notebook-level
        ``count_type``.

    Returns
    -------
    1-D array of length ``n_hours``; hours absent from ``point_df`` are 0.
    If an hour occurs more than once, the last row wins.

    Raises
    ------
    IndexError if any hour lies outside ``[0, n_hours)``.
    """
    if n_hours is None:
        n_hours = total_hours
    if dtype is None:
        dtype = count_type

    # Read the columns separately so they keep their own dtypes
    # (DataFrame.values would upcast both columns to a common dtype).
    hours = point_df.iloc[:, 0].to_numpy()
    counts = point_df.iloc[:, 1].to_numpy()

    # A negative index would silently wrap around and write to the end of the
    # timeline, so reject out-of-range hours explicitly.
    if hours.size and (hours.min() < 0 or hours.max() >= n_hours):
        raise IndexError(
            f"hour indices must lie in [0, {n_hours}), got range "
            f"[{hours.min()}, {hours.max()}]"
        )

    data = np.zeros(n_hours, dtype=dtype)
    for hour, count in zip(hours, counts):
        data[hour] = count
    return data

9504


In [9]:
# Expand every point's sparse (timestamp, count) frame into a dense hourly array.
# The dict is rebound under the same name, so entries that are already arrays
# (this cell was executed before) are passed through unchanged instead of
# failing on a re-run.
data_per_point = {
    point: data if isinstance(data, np.ndarray) else transform_to_full_time(data)
    for point, data in data_per_point.items()
}

In [10]:
# Sum of all hourly counts in the dense timeline of one example point.
data_per_point[917].sum()

18816

In [19]:
# Window sizes are given in hours: 30 days of 24 hours each.
hours_per_day = 24
days_per_window = 30

lag = days_per_window * hours_per_day   # past hours used as model input
roll = days_per_window * hours_per_day  # following hours the model predicts
hidden = 100  # number of LSTM units
batch = 10    # batch size for fitting and predicting
epochs = 25   # training epochs per model

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

def timeline_to_windows(data, window=None):
    """
    Slice a 1-D timeline into all overlapping windows of a fixed length.

    Parameters
    ----------
    data : 1-D array holding the timeline.
    window : window length; defaults to ``lag + roll`` (model input plus
        forecast horizon).

    Returns
    -------
    2-D array of shape ``(len(data) - window + 1, window)`` where row ``i`` is
    ``data[i:i + window]``. The final window, which ends at the last sample,
    is included. If the timeline is shorter than one window, the result has
    shape ``(0, window)`` so that column slicing still works.
    """
    if window is None:
        window = lag + roll
    n_windows = data.shape[0] - window + 1
    if n_windows <= 0:
        return np.empty((0, window), dtype=data.dtype)
    return np.array([data[start:start + window] for start in range(n_windows)])

def target_loss(prediction, ground_truth, prediction_start):
    """
    Sum the per-row relative error of a forecast against observed counts.

    Each row of ``ground_truth`` is read as ``(hour, true_count)``; the
    matching forecast is ``prediction[hour - prediction_start]``.
    A non-zero forecast contributes ``|forecast - true| / forecast``.
    A zero forecast contributes 1 when the true count is non-zero, else 0.

    Returns the un-normalised sum over all rows.
    """
    total = 0
    for row in ground_truth.values:
        hour, actual = row
        forecasted = prediction[hour - prediction_start]
        if forecasted != 0:
            total += abs(forecasted - actual) / forecasted
        elif actual != 0:
            # Nothing was forecast, but something happened: fixed penalty.
            total += 1
    return total

def train_validate(train, test, data):
    """
    Fit a fresh LSTM on one timeline's windows and score its forecast.

    Parameters
    ----------
    train : 2-D array of windows; the first ``lag`` columns of each row are
        the model input, the remaining columns are the regression target.
    test : DataFrame of observed ``(hour, count)`` rows, passed to
        ``target_loss`` as the ground truth.
    data : 1-D timeline; its last ``lag`` values are the input for the
        forecast and its length is taken as the hour of the first forecast step.

    Returns
    -------
    The un-normalised ``target_loss`` of the rounded forecast.
    """
    # A new model is built on every call and the notebook calls this once per
    # point. Clearing Keras' global state first keeps memory from growing with
    # each additional model.
    from tensorflow.keras import backend
    backend.clear_session()

    X, y = train[:, :lag], train[:, lag:]
    # Each sample is fed as a single timestep carrying `lag` features.
    X = X.reshape(X.shape[0], 1, X.shape[1])
    model = Sequential()

    model.add(LSTM(hidden, input_shape=(X.shape[1], X.shape[2]), dropout=0.5))
    # One output unit per forecast hour; ReLU keeps predicted counts non-negative.
    model.add(Dense(y.shape[1], activation='relu'))
    model.compile(loss='mean_squared_error')
    model.fit(X, y, validation_split=0.2, epochs=epochs, batch_size=batch, verbose=0, shuffle=True)

    # Forecast the hours that follow the timeline from its last `lag` values.
    X = data[-lag:]
    X = X.reshape(1, 1, len(X))
    forecast = model.predict(X, batch_size=batch, verbose=0).reshape(-1)
    # Counts are integers, so round before scoring.
    forecast = list(map(round, forecast))
    return target_loss(forecast, test, len(data))

In [25]:
# Train and score one model per validation point, then report the loss
# averaged over all scored validation rows.
loss = 0
points = 0  # running number of validation rows (not of points)
for point, test in test_data_per_point.items():
    if point in data_per_point:
        n_test_rows = test.shape[0]
        points += n_test_rows
        data = data_per_point[point]
        windows = timeline_to_windows(data)
        cur_loss = train_validate(windows, test, data)
        loss += cur_loss
        print("Predict point:", point, "Loss:", cur_loss / n_test_rows, "Average posts per hour:", np.mean(data), "Test points:", n_test_rows)
    else:
        # No training history for this point: it cannot be scored.
        print(f"Point {point} is not found in train data!!!")
print("Total loss: ", loss / points)

Point 0 is not found in train data!!!
Predict point: 296 Loss: 1.0833333333333335 Average posts per hour: 2.768097643097643 Test points: 2
Predict point: 329 Loss: 6.6 Average posts per hour: 0.9394991582491582 Test points: 5
Predict point: 431 Loss: 0.3333333333333333 Average posts per hour: 2.310395622895623 Test points: 3
Predict point: 449 Loss: 5.895833333333333 Average posts per hour: 2.0467171717171717 Test points: 8
Predict point: 566 Loss: 0.6666666666666666 Average posts per hour: 1.4433922558922558 Test points: 1
Predict point: 654 Loss: 4.0 Average posts per hour: 0.48463804713804715 Test points: 1
Predict point: 715 Loss: 0.5713046284640712 Average posts per hour: 7.4602272727272725 Test points: 19
Predict point: 722 Loss: 4.25 Average posts per hour: 1.0710227272727273 Test points: 2
Predict point: 817 Loss: 2.5 Average posts per hour: 1.1194234006734007 Test points: 1
Predict point: 869 Loss: 1.3666666666666667 Average posts per hour: 1.539983164983165 Test points: 5
Pre