In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from dataset.build_dataframe import (
    get_wifi_dataframe,
    get_bluetooth_dataframe,
    get_population_count_df,
    get_bbox_df,
    get_nearest_timestamp,
    get_top_N_wifi_aps_only,
    pivot_tables,
    merge_dfs
)

In [20]:
# Global plotting defaults: "notebook" scaling context with the "whitegrid" style.
sns.set_theme(context="notebook", style="whitegrid")

In [3]:
# Relative input paths consumed by the loader cell.
# Directory handed to get_wifi_dataframe and get_bluetooth_dataframe.
RAW_DATA_PATH = "data/raw_data/"
# CSV handed to get_bbox_df.
BBOX_CSV_PATH = "data/bbox_results.csv"
# CSV handed to get_population_count_df.
POPULATION_COUNT_CSV_PATH = "data/manual_counts.csv"

In [4]:
# Load every source into its own DataFrame. The WiFi and Bluetooth loaders
# both read from the raw-data directory; the bounding-box and manual-count
# loaders each read one CSV and are given the column names to use.
wifi_df = get_wifi_dataframe(RAW_DATA_PATH)
bt_df = get_bluetooth_dataframe(RAW_DATA_PATH)
bbox_df = get_bbox_df(
    BBOX_CSV_PATH,
    ["timestamp", "device_idx", "bbox_count"],
)
population_count_df = get_population_count_df(
    POPULATION_COUNT_CSV_PATH,
    ["timestamp", "count", "comment"],
)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [5]:
# Align every sensor reading with the manual counts: each timestamp is
# replaced by whatever get_nearest_timestamp picks from the manual-count
# timestamps (presumably the closest one -- confirm in dataset.build_dataframe).
# The three sensor frames are updated in place.
count_timestamps = population_count_df["timestamp"]


def _to_count_timestamp(ts):
    """Map one raw timestamp onto a manual-count timestamp."""
    return get_nearest_timestamp(ts, count_timestamps)


for df in (wifi_df, bt_df, bbox_df):
    df["timestamp"] = df["timestamp"].apply(_to_count_timestamp)

In [6]:
# Filter the WiFi frame through get_top_N_wifi_aps_only with N=5. Judging by
# the helper's name this keeps only the 5 "top" access points; the ranking
# rule lives in dataset.build_dataframe and is not visible here -- TODO confirm.
top_5_wifi_df = get_top_N_wifi_aps_only(wifi_df, 5)

In [7]:
# Reshape the three sensor frames with pivot_tables; the call yields three
# results, unpacked here as WiFi, Bluetooth and bounding-box tables.
# NOTE(review): presumably one row per timestamp with one column per
# device / access point, as the feature names printed further down suggest
# -- confirm against dataset.build_dataframe.
wifi_tabular, bt_tabular, bbox_tabular = pivot_tables(top_5_wifi_df, bt_df, bbox_df)

In [19]:
# Combine the pivoted sensor tables with the manual counts into one frame.
# Later cells rely on it exposing "timestamp", "count" and "comment" columns
# next to the feature columns.
combined_tabular = merge_dfs(wifi_tabular, bt_tabular, bbox_tabular, population_count_df)

# Machine Learning

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold

In [10]:
# Number of splits passed to KFold in the cross-validation cell.
FOLDS = 3

## With Koufu data (k-fold cross-validation over all samples)

In [11]:
# Feature matrix: every column except the timestamp, the target ("count") and
# the free-text comment. Target vector: the manual count.
feature_frame = combined_tabular.drop(columns=["timestamp", "count", "comment"])
X = feature_frame.to_numpy()
y = combined_tabular["count"].to_numpy()

In [12]:
# Shuffled K-fold cross-validation of an ordinary least-squares model.
# Per fold we keep the held-out R^2 score, the fitted estimator and the
# [predictions, ground truth] pair so that later cells can inspect them.
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
scores, models, results = [], [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    reg = LinearRegression()
    reg.fit(X_train, y_train)

    fold_score = reg.score(X_test, y_test)
    fold_predictions = reg.predict(X_test)

    scores.append(fold_score)
    models.append(reg)
    results.append([fold_predictions, y_test])

scores

[0.7163506576791634, 0.4864512686403619, -4.583583358514798]

In [13]:
# For each fold's model, print (feature name, coefficient) pairs ordered from
# the largest coefficient to the smallest.
feature_names = combined_tabular.drop(
    columns=["timestamp", "count", "comment"]
).columns
for model in models:
    coef = model.coef_
    ranked = sorted(
        zip(feature_names, coef),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print(ranked)

[("('bt_device_count', 0)", 1.3149322265424659), ("('signal_strength', 0, 3)", 0.9520134240198961), ("('signal_strength', 0, 5)", 0.9049181267506936), ("('bt_device_count', 1)", 0.8185001999156386), ("('signal_strength', 3, 4)", 0.7782172263359662), ("('bt_device_count', 3)", 0.6968934278066079), ("('bt_device_count', 2)", 0.5160368997885905), ("('signal_strength', 3, 5)", 0.46825119129154646), ("('bbox_count', 3)", 0.3930520239951808), ("('bbox_count', 0)", 0.35154403020247144), ("('signal_strength', 2, 5)", 0.2916738289043958), ("('signal_strength', 0, 4)", 0.2887710767541241), ("('bbox_count', 1)", 0.28760461589396263), ("('signal_strength', 1, 2)", 0.2766073354134049), ("('signal_strength', 3, 3)", 0.27046630323784654), ("('signal_strength', 2, 2)", 0.14044330029235746), ("('signal_strength', 1, 1)", 0.12954485260739498), ("('signal_strength', 2, 4)", 0.05138446207658409), ("('signal_strength', 1, 5)", 0.03329263226732729), ("('bbox_count', 2)", 0.0055010366774178344), ("('signal_s

In [14]:
# Third fold (index 2): [predictions, ground truth] as stored by the CV loop.
results[2]

[array([ 1.05468175, 42.45575997, 39.61887437, 47.84671164, 49.38998772,
        27.87475325]),
 array([23., 34., 21., 38., 31., 23.])]

## With Koufu data as test set

In [15]:
# Positional hold-out: the first 14 rows train the model, the rest are scored.
# NOTE(review): this assumes the rows from index 14 onwards are exactly the
# Koufu samples, i.e. it depends on the row order of combined_tabular -- confirm.
split_at = 14
X_train, y_train = X[:split_at], y[:split_at]
X_test, y_test = X[split_at:], y[split_at:]

reg = LinearRegression()
reg.fit(X_train, y_train)

print(reg.score(X_test, y_test))
print(reg.predict(X_test), y_test)

0.741699457671144
[ 90.98560711  73.6952661  108.54242302 278.6231129 ] [ 23.  40.  62. 251.]


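Using one sample of the Koufu set for calibration: each Koufu sample in turn provides a scaling factor (true count divided by predicted count), which is applied to the predictions for the remaining Koufu samples before computing their R² score.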

In [16]:
from sklearn.metrics import r2_score

In [17]:
# One-sample calibration on the held-out set: each sample in turn supplies a
# multiplicative correction (true count / predicted count) that is applied to
# the predictions for all the other held-out samples, which are then scored
# with R^2. One score is printed per choice of calibration sample.
n_holdout = len(y_test)  # follow the actual split size instead of a literal 4
for idx in range(n_holdout):
    # predict() returns a length-1 array for a single row; take its only
    # element so alpha is a plain scalar rather than relying on broadcasting.
    alpha = y_test[idx] / reg.predict(X_test[idx].reshape(1, -1))[0]
    # Every held-out position except the calibration sample, in ascending
    # order (no dependence on set iteration order).
    indices = [j for j in range(n_holdout) if j != idx]
    preds = reg.predict(X_test[indices]) * alpha
    print(r2_score(y_test[indices], preds))

-0.2730434837284408
0.6415923536617594
0.712379573123959
-6.131964552892656
