In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from dataset.build_dataframe import (
    get_wifi_dataframe,
    get_bluetooth_dataframe,
    get_population_count_df,
    get_bbox_df,
    get_nearest_timestamp,
    get_top_N_wifi_aps_only,
    pivot_tables,
    merge_dfs
)

In [2]:
# Global seaborn theme: "notebook" plotting context with the "whitegrid" axes style.
sns.set_theme(context="notebook", style="whitegrid")

In [3]:
# Input locations (relative paths, resolved against the notebook's working directory).
RAW_DATA_PATH = "data/raw_data/"  # handed to the Wi-Fi and Bluetooth loaders
BBOX_CSV_PATH = "data/bbox_results.csv"  # handed to get_bbox_df
POPULATION_COUNT_CSV_PATH = "data/manual_counts.csv"  # handed to get_population_count_df

In [4]:
# Column names handed to the two CSV-based loaders.
bbox_columns = ["timestamp", "device_idx", "bbox_count"]
population_count_columns = ["timestamp", "count", "comment"]

# Load the four inputs: Wi-Fi and Bluetooth from the raw-data directory,
# bounding-box results and manual population counts from their CSV files.
wifi_df = get_wifi_dataframe(RAW_DATA_PATH)
bt_df = get_bluetooth_dataframe(RAW_DATA_PATH)
bbox_df = get_bbox_df(BBOX_CSV_PATH, bbox_columns)
population_count_df = get_population_count_df(
    POPULATION_COUNT_CSV_PATH, population_count_columns
)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [5]:
# Replace every sensor timestamp with the value get_nearest_timestamp returns for it
# when given the manual-count timestamps.
# NOTE(review): presumably this snaps each reading onto the closest manual count so the
# sources can be joined on "timestamp" — confirm in dataset.build_dataframe.
count_timestamps = population_count_df["timestamp"]


def _snap_to_count_timestamp(raw_timestamp):
    """Map one sensor timestamp through get_nearest_timestamp."""
    return get_nearest_timestamp(raw_timestamp, count_timestamps)


for df in (wifi_df, bt_df, bbox_df):
    df["timestamp"] = df["timestamp"].apply(_snap_to_count_timestamp)

In [6]:
# Reduce the Wi-Fi frame with get_top_N_wifi_aps_only, N = 5.
# NOTE(review): presumably keeps the 5 strongest access points per device and timestamp —
# confirm in dataset.build_dataframe. This 5 must stay equal to `top_n` used for the
# column-position lookups further down.
top_5_wifi_df = get_top_N_wifi_aps_only(wifi_df, 5)

In [7]:
# Reshape the three sources with pivot_tables.
# NOTE(review): presumably one row per timestamp and one column per (feature, device[, rank]),
# judging by the column names of the merged table — confirm in dataset.build_dataframe.
wifi_tabular, bt_tabular, bbox_tabular = pivot_tables(top_5_wifi_df, bt_df, bbox_df)

In [8]:
# Combine the pivoted Wi-Fi, Bluetooth and bounding-box tables with the manual counts
# into the single table used for modelling.
combined_tabular = merge_dfs(wifi_tabular, bt_tabular, bbox_tabular, population_count_df)

In [39]:
# Look up the column names belonging to one device.
device_idx = 0  # which device to inspect
top_n = 5  # Wi-Fi columns per device
total_devices = 4  # number of devices in the table

# Column 0 is "timestamp"; then come top_n Wi-Fi columns per device, one Bluetooth
# column per device and one bbox column per device.
wifi_start = 1 + top_n * device_idx
wifi_positions = list(range(wifi_start, wifi_start + top_n))
bt_position = 1 + total_devices * top_n + device_idx
bbox_position = 1 + total_devices * (top_n + 1) + device_idx

combined_tabular.columns[wifi_positions + [bt_position, bbox_position]]

Index(['('signal_strength', 0, 1)', '('signal_strength', 0, 2)',
       '('signal_strength', 0, 3)', '('signal_strength', 0, 4)',
       '('signal_strength', 0, 5)', '('bt_device_count', 0)',
       '('bbox_count', 0)'],
      dtype='object')

In [50]:
# Wi-Fi signal-strength columns of the selected device, read from row 3 as plain ints.
wifi_positions = [1 + top_n * device_idx + offset for offset in range(top_n)]
combined_tabular.iloc[3, wifi_positions].astype(int).tolist()

[84, 69, 60, 55, 55]

In [49]:
# Bluetooth device-count column of the selected device, read from row 0.
bt_position = 1 + total_devices * top_n + device_idx
combined_tabular.iloc[0, bt_position]

9.5

In [30]:
# Show only the rows whose comment marks them as recorded at "Koufu".
is_koufu = combined_tabular["comment"] == "Koufu"
combined_tabular[is_koufu]

Unnamed: 0,timestamp,"('signal_strength', 0, 1)","('signal_strength', 0, 2)","('signal_strength', 0, 3)","('signal_strength', 0, 4)","('signal_strength', 0, 5)","('signal_strength', 1, 1)","('signal_strength', 1, 2)","('signal_strength', 1, 3)","('signal_strength', 1, 4)","('signal_strength', 1, 5)","('signal_strength', 2, 1)","('signal_strength', 2, 2)","('signal_strength', 2, 3)","('signal_strength', 2, 4)","('signal_strength', 2, 5)","('signal_strength', 3, 1)","('signal_strength', 3, 2)","('signal_strength', 3, 3)","('signal_strength', 3, 4)","('signal_strength', 3, 5)","('bt_device_count', 0)","('bt_device_count', 1)","('bt_device_count', 2)","('bt_device_count', 3)","('bbox_count', 0)","('bbox_count', 1)","('bbox_count', 2)","('bbox_count', 3)",count,comment
15,2024-04-02 09:30:11,95.0,92.0,65.0,62.0,49.0,84.0,80.0,69.0,62.0,57.0,79.0,79.0,77.0,65.0,60.0,75.0,69.0,60.0,50.0,44.0,58.0,32.0,31.0,14.0,0.0,0.0,0.0,0.0,23.0,Koufu
16,2024-04-02 10:04:02,92.0,82.0,69.0,64.0,55.0,100.0,90.0,77.0,62.0,60.0,82.0,79.0,77.0,77.0,70.0,69.0,62.0,52.0,49.0,39.0,62.0,29.0,27.0,9.0,0.0,0.0,0.0,0.0,40.0,Koufu
17,2024-04-02 10:34:34,95.0,77.0,75.0,59.0,50.0,87.0,84.0,77.0,69.0,60.0,82.0,80.0,72.0,65.0,62.0,72.0,69.0,62.0,55.0,49.0,56.0,50.0,45.0,16.0,0.0,0.0,0.0,0.0,62.0,Koufu
18,2024-04-02 12:15:00,77.0,75.0,70.0,59.0,55.0,94.0,84.0,72.0,72.0,55.0,90.0,77.0,74.0,72.0,62.0,62.0,62.0,55.0,55.0,37.0,102.0,119.0,87.0,49.0,0.0,0.0,0.0,0.0,251.0,Koufu


In [31]:
# Export the Koufu rows (without the free-text comment column) to CSV.
koufu_rows = combined_tabular["comment"] == "Koufu"
koufu_df = combined_tabular[koufu_rows].drop(columns=["comment"])
koufu_df.to_csv("data/koufu.csv", index=False)

# Machine Learning

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
from sklearn.model_selection import train_test_split, KFold

In [12]:
FOLDS = 3  # number of splits (n_splits) for the KFold cross-validation below

## With Koufu data included (K-fold cross-validation over all samples)

In [9]:
# Feature matrix: every column except the timestamp, the target and the comment.
# Target vector: the manual population count.
non_feature_columns = ['timestamp', 'count', 'comment']
X = combined_tabular.drop(columns=non_feature_columns).to_numpy()
y = combined_tabular["count"].to_numpy()

In [13]:
# Shuffled K-fold cross-validation of a plain linear regression over all rows.
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
scores, models, results = [], [], []  # per fold: test score, fitted model, [predictions, truth]

for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    reg = LinearRegression()
    reg.fit(X_train, y_train)

    scores.append(reg.score(X_test, y_test))
    models.append(reg)
    results.append([reg.predict(X_test), y_test])

# NOTE: once the loop ends, `reg`, `X_test` and `y_test` remain bound to the LAST fold.
scores

[0.7163506576791634, 0.4864512686403619, -4.583583358514798]

In [14]:
# For each fold's model, print (feature name, coefficient) pairs, largest coefficient first.
feature_names = combined_tabular.drop(
    columns=["timestamp", "count", "comment"]
).columns

for model in models:
    coef = model.coef_
    ranked = sorted(zip(feature_names, coef), key=lambda pair: pair[1], reverse=True)
    print(ranked)

[("('bt_device_count', 0)", 1.3149322265424659), ("('signal_strength', 0, 3)", 0.9520134240198961), ("('signal_strength', 0, 5)", 0.9049181267506936), ("('bt_device_count', 1)", 0.8185001999156386), ("('signal_strength', 3, 4)", 0.7782172263359662), ("('bt_device_count', 3)", 0.6968934278066079), ("('bt_device_count', 2)", 0.5160368997885905), ("('signal_strength', 3, 5)", 0.46825119129154646), ("('bbox_count', 3)", 0.3930520239951808), ("('bbox_count', 0)", 0.35154403020247144), ("('signal_strength', 2, 5)", 0.2916738289043958), ("('signal_strength', 0, 4)", 0.2887710767541241), ("('bbox_count', 1)", 0.28760461589396263), ("('signal_strength', 1, 2)", 0.2766073354134049), ("('signal_strength', 3, 3)", 0.27046630323784654), ("('signal_strength', 2, 2)", 0.14044330029235746), ("('signal_strength', 1, 1)", 0.12954485260739498), ("('signal_strength', 2, 4)", 0.05138446207658409), ("('signal_strength', 1, 5)", 0.03329263226732729), ("('bbox_count', 2)", 0.0055010366774178344), ("('signal_s

In [15]:
# [predictions, ground truth] of the third fold — the fold with the negative score above.
results[2]

[array([ 1.05468175, 42.45575997, 39.61887437, 47.84671164, 49.38998772,
        27.87475325]),
 array([23., 34., 21., 38., 31., 23.])]

## With Koufu data as test set

In [21]:
from sklearn.base import RegressorMixin

In [25]:
def fit_model_with_koufu_test(
    model: RegressorMixin, X: np.ndarray, Y: np.ndarray, start_idx: int = 14
) -> RegressorMixin:
    """Fit ``model`` on the leading rows and evaluate it on the trailing rows.

    Rows ``[0, start_idx)`` of ``X``/``Y`` are used for fitting; rows from
    ``start_idx`` onwards form the held-out set.

    Args:
        model: Estimator exposing ``fit``, ``score`` and ``predict``; fitted in place.
        X: Feature matrix, one row per sample.
        Y: Targets aligned with the rows of ``X``.
        start_idx: Index of the first held-out row.

    Returns:
        The same ``model`` object, now fitted.

    Side effects:
        Prints the value of ``model.score`` on the fitting rows and on the held-out
        rows, followed by the held-out predictions next to the held-out targets.
    """
    fit_rows = slice(0, start_idx)
    held_out_rows = slice(start_idx, None)
    features_fit, targets_fit = X[fit_rows], Y[fit_rows]
    features_eval, targets_eval = X[held_out_rows], Y[held_out_rows]

    model.fit(features_fit, targets_fit)  # type:ignore

    train_score = model.score(features_fit, targets_fit)  # type:ignore
    print(f"Train score: {train_score}")
    test_score = model.score(features_eval, targets_eval)  # type:ignore
    print(f"Test score: {test_score}")

    held_out_predictions = model.predict(features_eval)  # type:ignore
    print(held_out_predictions, targets_eval)
    return model

In [28]:
# Linear-regression baseline: fitted on the rows before the default start_idx (14),
# evaluated on the remaining rows.
linreg = LinearRegression()
fit_model_with_koufu_test(linreg, X, y)

Train score: 1.0
Test score: 0.741699457671144
[ 90.98560711  73.6952661  108.54242302 278.6231129 ] [ 23.  40.  62. 251.]


In [27]:
# Gaussian-process regressor with a summed DotProduct + RBF + WhiteKernel kernel,
# fitted and evaluated on the same split as the linear baseline.
gpr_kernel = DotProduct() + RBF() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=gpr_kernel, random_state=42)
fit_model_with_koufu_test(gpr, X, y)

Train score: 0.9862609052340188
Test score: 0.9302647994174769
[ 62.90805839  45.44888946  84.34870693 236.05387765] [ 23.  40.  62. 251.]


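Using one sample of the Koufu set for calibration: the predictions are scaled by (actual count / predicted count) of that sample, and the scaled predictions are scored on the remaining Koufu samples: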

In [17]:
from sklearn.metrics import r2_score

In [18]:
# Single-sample calibration on the Koufu hold-out.
#
# The previous version read `reg`, `X_test` and `y_test`, which are leftovers of the
# LAST K-fold split (six unrelated samples), so it never touched the Koufu rows.
# Build the hold-out explicitly and use `linreg`, which fit_model_with_koufu_test
# fitted on the rows before index 14.
koufu_start = 14  # must equal the start_idx that `linreg` was fitted with
X_koufu, y_koufu = X[koufu_start:], y[koufu_start:]

for idx in range(len(y_koufu)):
    # Scale factor that makes the prediction for the calibration sample exact.
    alpha = y_koufu[idx] / linreg.predict(X_koufu[idx].reshape(1, -1))[0]
    # Score on every other Koufu sample, keeping their original order.
    indices = [i for i in range(len(y_koufu)) if i != idx]
    preds = linreg.predict(X_koufu[indices]) * alpha
    print(r2_score(y_koufu[indices], preds))

-0.2730434837284408
0.6415923536617594
0.712379573123959
-6.131964552892656


# Saving models

In [32]:
import pickle

In [33]:
# Persist both fitted models. The files are opened with `with` so the handles are
# flushed and closed deterministically (the bare open(...) calls were never closed),
# and the target directory is created if it does not exist yet.
os.makedirs("models", exist_ok=True)
with open("models/gpr.pkl", "wb") as gpr_file:
    pickle.dump(gpr, gpr_file)
with open("models/linreg.pkl", "wb") as linreg_file:
    pickle.dump(linreg, linreg_file)