In [1]:
import datetime
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

sys.path.append("./src/")

from dataset.build_dataframe import (
    get_wifi_dataframe,
    get_bluetooth_dataframe,
    get_population_count_df,
    get_bbox_df,
    get_nearest_timestamp,
    get_top_N_wifi_aps_only,
    pivot_tables,
    merge_dfs
)

In [2]:
sns.set_theme("notebook", "whitegrid")

In [3]:
RAW_DATA_PATH = "data/raw_data/"
BBOX_CSV_PATH = "data/bbox_results.csv"
POPULATION_COUNT_CSV_PATH = "data/manual_counts.csv"

In [5]:
wifi_df = get_wifi_dataframe(RAW_DATA_PATH)
bt_df = get_bluetooth_dataframe(RAW_DATA_PATH)
bbox_df = get_bbox_df(BBOX_CSV_PATH, ["timestamp", "device_idx", "bbox_count"])
population_count_df = get_population_count_df(POPULATION_COUNT_CSV_PATH, ["timestamp", "count", "comment"])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
bbox_df

Unnamed: 0,timestamp,device_idx,bbox_count
0,2024-02-28 12:01:00,0,19
1,2024-02-28 12:29:00,0,16
2,2024-02-28 13:01:00,0,17
3,2024-02-28 13:31:00,0,12
4,2024-02-28 14:01:00,0,8
...,...,...,...
74,2024-04-02 10:02:00,3,5
75,2024-04-02 10:03:00,3,3
76,2024-04-02 10:04:00,3,8
77,2024-04-02 10:38:00,3,7


In [7]:
bbox_df.iloc[77, 0]

Timestamp('2024-04-02 10:38:00')

In [9]:
for df in [wifi_df, bt_df, bbox_df]:
    df["timestamp"] = df["timestamp"].apply(
        lambda x: get_nearest_timestamp(x, population_count_df["timestamp"])
    )

In [10]:
bbox_df[bbox_df["timestamp"] > datetime.datetime.fromisoformat("2024-04-02T00:00:00")]

Unnamed: 0,timestamp,device_idx,bbox_count
18,2024-04-02 09:30:11,0,6
19,2024-04-02 10:04:02,0,5
20,2024-04-02 10:34:34,0,12
21,2024-04-02 12:15:00,0,13
38,2024-04-02 09:30:11,1,7
39,2024-04-02 10:04:02,1,4
40,2024-04-02 10:34:34,1,9
41,2024-04-02 12:15:00,1,10
53,2024-04-02 09:30:11,2,8
54,2024-04-02 10:04:02,2,8


In [11]:
top_5_wifi_df = get_top_N_wifi_aps_only(wifi_df, 5)

In [12]:
wifi_tabular, bt_tabular, bbox_tabular = pivot_tables(top_5_wifi_df, bt_df, bbox_df)

In [13]:
combined_tabular = merge_dfs(wifi_tabular, bt_tabular, bbox_tabular, population_count_df)

In [14]:
combined_tabular

Unnamed: 0,timestamp,"('signal_strength', 0, 1)","('signal_strength', 0, 2)","('signal_strength', 0, 3)","('signal_strength', 0, 4)","('signal_strength', 0, 5)","('signal_strength', 1, 1)","('signal_strength', 1, 2)","('signal_strength', 1, 3)","('signal_strength', 1, 4)",...,"('bt_device_count', 0)","('bt_device_count', 1)","('bt_device_count', 2)","('bt_device_count', 3)","('bbox_count', 0)","('bbox_count', 1)","('bbox_count', 2)","('bbox_count', 3)",count,comment
0,2024-02-28 12:03:29,87.0,85.0,74.0,65.0,65.0,84.0,70.0,64.0,64.0,...,9.5,47.0,57.5,44.0,19,11,2,17,128.0,0
1,2024-02-28 12:32:37,85.0,62.0,62.0,55.0,55.0,70.0,69.0,57.0,55.0,...,17.0,62.0,66.0,59.0,16,10,0,17,99.0,0
2,2024-02-28 13:03:31,82.0,69.0,62.0,62.0,57.0,89.0,65.0,60.0,60.0,...,23.0,19.0,89.0,57.0,17,16,1,14,109.0,0
3,2024-02-28 13:33:42,84.0,69.0,60.0,55.0,55.0,82.0,59.0,55.0,54.0,...,8.0,44.0,0.0,44.0,12,16,0,17,75.0,0
5,2024-02-28 14:31:54,85.0,79.0,72.0,69.0,67.0,75.0,62.0,55.0,52.0,...,7.5,21.0,28.0,15.0,10,9,0,4,23.0,0
6,2024-02-28 15:02:15,89.0,67.0,65.0,57.0,54.0,79.0,72.0,55.0,55.0,...,5.0,26.0,0.0,12.0,9,8,0,12,38.0,0
7,2024-02-28 15:31:27,82.0,80.0,70.0,69.0,69.0,80.0,65.0,60.0,59.0,...,7.5,40.0,32.0,20.0,7,12,1,5,34.0,0
8,2024-02-28 16:01:24,84.0,84.0,75.0,74.0,70.0,75.0,59.0,59.0,54.0,...,6.0,23.0,0.0,14.0,4,9,0,2,21.0,0
9,2024-03-07 12:34:28,79.0,69.0,67.0,55.0,55.0,80.0,70.0,69.0,62.0,...,22.0,128.0,10.0,50.0,16,13,9,11,165.0,0
10,2024-03-07 13:31:15,80.0,69.0,65.0,64.0,57.0,84.0,69.0,67.0,65.0,...,11.0,72.0,2.0,28.0,15,9,8,5,55.0,0


In [15]:
device_idx = 0
top_n = 5
total_devices = 4
combined_tabular.columns[
    [1 + top_n * device_idx + i for i in range(top_n)]
    + [1 + total_devices * top_n + device_idx]
    + [1 + total_devices * (top_n + 1) + device_idx]
]

Index(['('signal_strength', 0, 1)', '('signal_strength', 0, 2)',
       '('signal_strength', 0, 3)', '('signal_strength', 0, 4)',
       '('signal_strength', 0, 5)', '('bt_device_count', 0)',
       '('bbox_count', 0)'],
      dtype='object')

In [16]:
combined_tabular.iloc[3, [1 + top_n * device_idx + i for i in range(top_n)]].astype(int).tolist()

[84, 69, 60, 55, 55]

In [17]:
combined_tabular.iloc[0, 1 + total_devices * top_n + device_idx]

9.5

In [18]:
combined_tabular[combined_tabular["comment"] == "Koufu"]

Unnamed: 0,timestamp,"('signal_strength', 0, 1)","('signal_strength', 0, 2)","('signal_strength', 0, 3)","('signal_strength', 0, 4)","('signal_strength', 0, 5)","('signal_strength', 1, 1)","('signal_strength', 1, 2)","('signal_strength', 1, 3)","('signal_strength', 1, 4)",...,"('bt_device_count', 0)","('bt_device_count', 1)","('bt_device_count', 2)","('bt_device_count', 3)","('bbox_count', 0)","('bbox_count', 1)","('bbox_count', 2)","('bbox_count', 3)",count,comment
15,2024-04-02 09:30:11,95.0,92.0,65.0,62.0,49.0,84.0,80.0,69.0,62.0,...,58.0,32.0,31.0,14.0,6,7,8,0,23.0,Koufu
16,2024-04-02 10:04:02,92.0,82.0,69.0,64.0,55.0,100.0,90.0,77.0,62.0,...,62.0,29.0,27.0,9.0,5,4,8,8,40.0,Koufu
17,2024-04-02 10:34:34,95.0,77.0,75.0,59.0,50.0,87.0,84.0,77.0,69.0,...,56.0,50.0,45.0,16.0,12,9,9,7,62.0,Koufu
18,2024-04-02 12:15:00,77.0,75.0,70.0,59.0,55.0,94.0,84.0,72.0,72.0,...,102.0,119.0,87.0,49.0,13,10,8,14,251.0,Koufu


In [19]:
koufu_df = combined_tabular[combined_tabular["comment"] == "Koufu"].drop(columns=["comment"])
koufu_df.to_csv("src/deployment/demo/koufu.csv", index=False)

# Machine Learning

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
from sklearn.model_selection import train_test_split, KFold

In [21]:
FOLDS = 3

## With Koufu Data

In [22]:
X = combined_tabular.drop(columns=['timestamp', 'count', 'comment']).to_numpy()
y = combined_tabular["count"].to_numpy()

In [23]:
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
scores = []
models = []
results = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    reg = LinearRegression().fit(X_train, y_train)
    scores.append(reg.score(X_test, y_test))
    models.append(reg)
    results.append([reg.predict(X_test), y_test])

scores

[0.5282568791467438, 0.6304061312736791, -15.99310246492539]

In [24]:
for model in models:
    coef = model.coef_
    print(
        sorted(
            list(
                zip(
                    combined_tabular.drop(
                        columns=["timestamp", "count", "comment"]
                    ).columns,
                    coef,
                )
            ),
            key=lambda x: x[1],
            reverse=True,
        )
    )

[("('bt_device_count', 0)", 1.115224061617116), ("('bt_device_count', 3)", 1.0261738484520428), ("('signal_strength', 0, 3)", 0.9281858355375749), ("('bbox_count', 3)", 0.7008549006556367), ("('signal_strength', 2, 1)", 0.660509029303773), ("('signal_strength', 0, 5)", 0.6355744038512879), ("('bt_device_count', 1)", 0.5682227048633383), ("('signal_strength', 2, 4)", 0.5407004676087033), ("('bt_device_count', 2)", 0.49391641492004157), ("('signal_strength', 3, 4)", 0.49096081926959534), ("('signal_strength', 0, 4)", 0.4896826198977238), ("('signal_strength', 1, 5)", 0.3448046862453512), ("('bbox_count', 2)", 0.3175611968767248), ("('signal_strength', 1, 4)", 0.29781904429313116), ("('bbox_count', 0)", 0.27897635161680295), ("('signal_strength', 1, 1)", 0.2099955840028763), ("('bbox_count', 1)", 0.17936377719701133), ("('signal_strength', 3, 3)", 0.13073583976241407), ("('signal_strength', 0, 2)", 0.056423530686707885), ("('signal_strength', 3, 5)", 0.03169989950408524), ("('signal_stren

In [25]:
results[2]

[array([ 6.19612851, 66.69492399, 61.73554684, 57.64221163, 58.16227633,
        23.81829406]),
 array([23., 34., 21., 38., 31., 23.])]

## With Koufu data as test set

In [26]:
from sklearn.base import RegressorMixin

In [27]:
def fit_model_with_koufu_test(
    model: RegressorMixin, X: np.ndarray, Y: np.ndarray, start_idx: int = 14
) -> RegressorMixin:
    X_train, X_test = X[0:start_idx], X[start_idx:]
    y_train, y_test = Y[0:start_idx], Y[start_idx:]
    model.fit(X_train, y_train)  # type:ignore
    print(f"Train score: {model.score(X_train, y_train)}")  # type:ignore
    print(f"Test score: {model.score(X_test, y_test)}")  # type:ignore
    print(model.predict(X_test), y_test)  # type:ignore
    return model

In [28]:
linreg = LinearRegression()
fit_model_with_koufu_test(linreg, X, y)

Train score: 1.0
Test score: -1.2149742783111073
[159.59301359 161.71104222 197.35837143 401.65346695] [ 23.  40.  62. 251.]


In [36]:
print(
    sorted(
        list(
            zip(
                combined_tabular.drop(
                    columns=["timestamp", "count", "comment"]
                ).columns,
                linreg.coef_,
            )
        ),
        key=lambda x: x[1],
        reverse=True,
    )
)

[("('signal_strength', 0, 2)", 1.898402529100758), ("('bt_device_count', 0)", 1.8186316741913051), ("('bbox_count', 3)", 1.571937466940123), ("('bt_device_count', 1)", 1.2168261205887463), ("('bt_device_count', 2)", 1.1210667107971735), ("('signal_strength', 0, 3)", 1.0635879233232535), ("('signal_strength', 1, 1)", 0.9323303146581763), ("('signal_strength', 3, 3)", 0.7941903765499005), ("('bbox_count', 2)", 0.7567722693601752), ("('bbox_count', 0)", 0.6798650042123033), ("('signal_strength', 3, 4)", 0.607399995641518), ("('signal_strength', 2, 3)", 0.5847323457719045), ("('signal_strength', 3, 5)", 0.5076435368289722), ("('signal_strength', 2, 4)", 0.30766039262812517), ("('signal_strength', 1, 2)", 0.23070264021103815), ("('signal_strength', 1, 5)", 0.21724036188068138), ("('bbox_count', 1)", 0.026743342016004094), ("('signal_strength', 1, 4)", -0.14224564731943912), ("('signal_strength', 3, 2)", -0.19672501706055087), ("('signal_strength', 2, 2)", -0.2725089001519296), ("('signal_st

In [29]:
gpr = GaussianProcessRegressor(kernel=(DotProduct() + RBF() + WhiteKernel()), random_state=42)
fit_model_with_koufu_test(gpr, X, y)

Train score: 0.9481655268477308
Test score: 0.7183195402461604
[ 87.76513724  78.73878907 116.18329786 280.01494983] [ 23.  40.  62. 251.]


In [35]:
gpr.predict(X[-1].reshape(1, -1), return_std=True)

(array([280.01494983]), array([80.4323143]))

Using 1 sample of the Koufu set for calibration:

In [30]:
from sklearn.metrics import r2_score

In [31]:
for idx in range(4):
    alpha = y_test[idx] / reg.predict(X_test[idx].reshape(1, -1))
    indices = list(set(list(range(4))).difference({idx}))
    preds = reg.predict(X_test[indices]) * alpha
    print(r2_score(y_test[indices], preds))

-757.9159702620476
-2.344901861963816
-6.481382289285299
-7.6242538500834875


# Saving models

In [33]:
import pickle

In [34]:
pickle.dump(gpr, open("src/deployment/models/gpr.pkl", "wb"))
pickle.dump(linreg, open("src/deployment/models/linreg.pkl", "wb"))