# Step 1: data engineering

In [32]:
# import the data into a pandas dataframe
import pandas as pd
from datetime import datetime
df = pd.read_csv("dataset.csv")
df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')
df

Unnamed: 0,timestamp,station,device,rssi
0,2025-03-17 20:38:21.936000+00:00,station1,Ton-M5StickC-0,-63
1,2025-03-17 20:38:22.962000+00:00,station2,Ton-M5StickC-0,-76
2,2025-03-17 20:38:24.950000+00:00,station1,Ton-M5StickC-0,-73
3,2025-03-17 20:38:26.044000+00:00,station2,Ton-M5StickC-0,-77
4,2025-03-17 20:38:28.021000+00:00,station1,Ton-M5StickC-0,-54
...,...,...,...,...
7843,2025-03-19 19:17:02.107000+00:00,station1,Asset-T1,-47
7844,2025-03-19 19:17:04.660000+00:00,station2,Asset-T1,-82
7845,2025-03-19 19:17:07.627000+00:00,station2,Asset-T1,-81
7846,2025-03-19 19:17:13.798000+00:00,station2,Asset-T1,-85


In [54]:
# query data from the API and store it in a dictionary
import requests
import json

# query data from asset API
devices = ['Asset-T0', 'Asset-T1', 'Asset-T2', 'Ton-M5StickC-0', 'Ton-M5StickC-1', 'Ton-M5StickC-2', 'Ton-M5StickC-3']
base_url = 'http://localhost:5000/api/asset/'
raw_data = {}
for device in devices:
    url = base_url + device + '?mins=100000&rssi=-100'
    resp = requests.get(url)
    data = json.loads(resp.text)
    raw_data[device] = data['data']
print(raw_data)
# Remove devices with no data
not_found_devices = []
for device in raw_data.keys():
    if len(raw_data[device]) == 0:
        not_found_devices.append(device)
for not_found_device in not_found_devices:
    print('Removing ' + not_found_device)
    del raw_data[not_found_device]
print(raw_data)

{'Asset-T0': [{'rssi': -86, 'station': 'station2', 'timestamp': '2025-03-19T19:12:43.525000'}, {'rssi': -69, 'station': 'station1', 'timestamp': '2025-03-19T19:12:40.499000'}, {'rssi': -30, 'station': 'station2', 'timestamp': '2025-03-19T19:12:40.443000'}, {'rssi': -30, 'station': 'station2', 'timestamp': '2025-03-19T19:12:37.488000'}, {'rssi': -65, 'station': 'station1', 'timestamp': '2025-03-19T19:12:34.503000'}, {'rssi': -30, 'station': 'station2', 'timestamp': '2025-03-19T19:12:34.307000'}, {'rssi': -30, 'station': 'station2', 'timestamp': '2025-03-19T19:12:31.237000'}, {'rssi': -29, 'station': 'station2', 'timestamp': '2025-03-19T19:12:28.170000'}, {'rssi': -67, 'station': 'station1', 'timestamp': '2025-03-19T19:12:25.439000'}, {'rssi': -29, 'station': 'station2', 'timestamp': '2025-03-19T19:12:25.206000'}, {'rssi': -65, 'station': 'station1', 'timestamp': '2025-03-19T19:12:22.421000'}, {'rssi': -30, 'station': 'station2', 'timestamp': '2025-03-19T19:12:22.021000'}, {'rssi': -30, 

In [55]:
# re-organize the data into table format
rssi_data = []
idx = 0
for device in raw_data.keys():
    for data in raw_data[device]:
        record = []
        record.append(data['timestamp'])
        record.append(device)
        record.append(data['station'])
        record.append(data['rssi'])
        record.append(0)
        rssi_data.append(record)
print(len(rssi_data), rssi_data)

7848 [['2025-03-19T19:12:43.525000', 'Asset-T0', 'station2', -86, 0], ['2025-03-19T19:12:40.499000', 'Asset-T0', 'station1', -69, 0], ['2025-03-19T19:12:40.443000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:37.488000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:34.503000', 'Asset-T0', 'station1', -65, 0], ['2025-03-19T19:12:34.307000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:31.237000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:28.170000', 'Asset-T0', 'station2', -29, 0], ['2025-03-19T19:12:25.439000', 'Asset-T0', 'station1', -67, 0], ['2025-03-19T19:12:25.206000', 'Asset-T0', 'station2', -29, 0], ['2025-03-19T19:12:22.421000', 'Asset-T0', 'station1', -65, 0], ['2025-03-19T19:12:22.021000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:19.071000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:15.883000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:12.816000', 'Asset-T0', 'station2', -30, 0], ['2025-03-19T19:12:09.843000', 'As

In [56]:
# import the data into a pandas dataframe
import pandas as pd
from datetime import datetime
df = pd.DataFrame(rssi_data, columns=['timestamp', 'device', 'station', 'rssi', 'label'])
df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')
df.shape

(7848, 5)

In [57]:
# group data by timestamp
station_list = df['station'].unique()
device_list = df['device'].unique()
df_dicts = {}
for station in station_list:
    for device in device_list:
        sub_df = df[(df['station'] == station) & (df['device'] == device)]
        if sub_df.empty:
            continue
        rssi_values = sub_df[['timestamp','rssi']].resample('1min', on='timestamp').mean()      # Mean -> Low Pass Filter
        rssi_values = rssi_values.dropna().reset_index()
        idx = station + '+' + device
        df_dicts[idx] = rssi_values
print(df_dicts)

{'station2+Asset-T0':              timestamp       rssi
0  2025-03-19 17:42:00 -81.666667
1  2025-03-19 17:43:00 -88.312500
2  2025-03-19 17:44:00 -89.714286
3  2025-03-19 17:45:00 -88.133333
4  2025-03-19 17:46:00 -88.176471
..                 ...        ...
86 2025-03-19 19:08:00 -29.263158
87 2025-03-19 19:09:00 -29.210526
88 2025-03-19 19:10:00 -29.333333
89 2025-03-19 19:11:00 -29.631579
90 2025-03-19 19:12:00 -33.400000

[91 rows x 2 columns], 'station2+Asset-T1':              timestamp       rssi
0  2025-03-19 13:53:00 -93.250000
1  2025-03-19 13:54:00 -89.882353
2  2025-03-19 13:55:00 -86.687500
3  2025-03-19 13:56:00 -86.583333
4  2025-03-19 13:57:00 -87.437500
5  2025-03-19 13:58:00 -87.000000
6  2025-03-19 13:59:00 -87.785714
7  2025-03-19 14:00:00 -88.333333
8  2025-03-19 14:01:00 -88.357143
9  2025-03-19 14:02:00 -88.071429
10 2025-03-19 14:03:00 -88.600000
11 2025-03-19 14:04:00 -87.214286
12 2025-03-19 14:05:00 -87.083333
13 2025-03-19 14:06:00 -87.333333
14 2025-03-19 1

In [58]:
# export the data to a CSV file with columns station, device, timestamp, rssi
import csv
import random

with open('dataset.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['station', 'device', 'timestamp', 'rssi', 'label'])
    for key in df_dicts.keys():
        station, device = key.split('+')
        for idx, row in df_dicts[key].iterrows():
            writer.writerow([station, device, row['timestamp'], row['rssi'], random.choice([0, 1, 2])])

# Step 2: ML engineering

In [59]:
import csv

# import label data from CSV file
label_data = []
with open('dataset.csv', mode='r') as file:
    reader = csv.reader(file)
    for idx, row in enumerate(reader):
        if idx == 0:
            continue
        label_data.append(row)
print(label_data)

[['station2', 'Asset-T0', '2025-03-19 17:42:00', '-81.66666666666667', '1'], ['station2', 'Asset-T0', '2025-03-19 17:43:00', '-88.3125', '1'], ['station2', 'Asset-T0', '2025-03-19 17:44:00', '-89.71428571428571', '2'], ['station2', 'Asset-T0', '2025-03-19 17:45:00', '-88.13333333333334', '1'], ['station2', 'Asset-T0', '2025-03-19 17:46:00', '-88.17647058823529', '2'], ['station2', 'Asset-T0', '2025-03-19 17:47:00', '-85.5', '1'], ['station2', 'Asset-T0', '2025-03-19 17:48:00', '-85.0', '0'], ['station2', 'Asset-T0', '2025-03-19 17:49:00', '-89.13333333333334', '1'], ['station2', 'Asset-T0', '2025-03-19 17:50:00', '-89.375', '0'], ['station2', 'Asset-T0', '2025-03-19 17:51:00', '-86.9090909090909', '0'], ['station2', 'Asset-T0', '2025-03-19 17:52:00', '-87.54545454545455', '2'], ['station2', 'Asset-T0', '2025-03-19 17:53:00', '-88.47368421052632', '1'], ['station2', 'Asset-T0', '2025-03-19 17:54:00', '-89.94736842105263', '1'], ['station2', 'Asset-T0', '2025-03-19 17:55:00', '-87.166666

In [60]:
# prepare data into ML training format
import pandas as pd

feature_df = pd.DataFrame(label_data, columns=["station", "device", "timestamp", "rssi", "label"])
feature_df["label"] = feature_df["label"].astype(int)
feature_df["rssi"] = feature_df["rssi"].astype(float)
feature_station1_df = feature_df[feature_df["station"] == "station1"].reset_index(drop=True)
feature_station2_df = feature_df[feature_df["station"] == "station2"].reset_index(drop=True)
feature_station3_df = feature_df[feature_df["station"] == "station3"].reset_index(drop=True)
data_len = min(feature_station1_df.shape[0], feature_station2_df.shape[0], feature_station3_df.shape[0])
rows = []
for index in range(data_len):
    row = [feature_station1_df.loc[index, "rssi"], feature_station1_df.loc[index, "rssi"], feature_station3_df.loc[index, "rssi"], feature_df.loc[index, "label"]]
    rows.append(row)

train_df = pd.DataFrame(rows, columns=["station1", "station2", "station3", "label"])
train_df

Unnamed: 0,station1,station2,station3,label
0,-67.714286,-67.714286,-92.000000,1
1,-70.000000,-70.000000,-30.812500,1
2,-63.500000,-63.500000,-31.000000,2
3,-63.500000,-63.500000,-30.944444,1
4,-64.000000,-64.000000,-30.833333,2
...,...,...,...,...
153,-68.500000,-68.500000,-83.000000,2
154,-73.500000,-73.500000,-85.937500,2
155,-72.000000,-72.000000,-55.800000,1
156,-67.000000,-67.000000,-86.272727,2


In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

x = train_df[["station1", "station2", "station3"]]
y = train_df["label"]

model = DecisionTreeClassifier(max_depth=8)
model.fit(x, y)
pred = model.predict(x)
report = classification_report(y, pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.46      0.63        54
           1       0.60      0.75      0.67        57
           2       0.59      0.77      0.67        47

    accuracy                           0.66       158
   macro avg       0.73      0.66      0.66       158
weighted avg       0.73      0.66      0.66       158



## Assignment

In [79]:
# import the data into a pandas dataframe
import pandas as pd
from datetime import datetime
df = pd.read_csv("dataset.csv")
df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')
df.head()

Unnamed: 0,station,device,timestamp,rssi,label
0,station2,Asset-T0,2025-03-19 17:42:00,-81.666667,1
1,station2,Asset-T0,2025-03-19 17:43:00,-88.3125,1
2,station2,Asset-T0,2025-03-19 17:44:00,-89.714286,2
3,station2,Asset-T0,2025-03-19 17:45:00,-88.133333,1
4,station2,Asset-T0,2025-03-19 17:46:00,-88.176471,2


In [80]:
station_dfs = {}
for station in df["station"].unique():
    station_dfs[station] = df[df["station"] == station]
for station_id in station_dfs.keys():
    station_dfs[station_id] = station_dfs[station_id][["timestamp", "rssi"]].resample("1min", on="timestamp").mean()

station_df = pd.concat(station_dfs.values(), axis=1)
station_df.columns = station_dfs.keys()
station_df.dropna(inplace=True)
station_df["label"] = 0
station_df.head()

Unnamed: 0_level_0,station2,station1,station3,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-03-17 20:38:00,-75.818182,-61.222222,-88.111111,0
2025-03-17 20:39:00,-83.75,-68.4,-77.2,0
2025-03-17 20:40:00,-78.964286,-59.944444,-83.5,0
2025-03-17 20:41:00,-72.888889,-70.0,-74.416667,0
2025-03-17 20:42:00,-72.846154,-61.266667,-87.733333,0


In [81]:
def label_create(x):
    return x.idxmax()

column_mapping = {"station1": 0, "station2": 1, "station3": 2}
station_df["label"] = station_df[["station1", "station2", "station3"]].apply(label_create, axis=1).map(column_mapping)
print(station_df)

                      station2   station1   station3  label
timestamp                                                  
2025-03-17 20:38:00 -75.818182 -61.222222 -88.111111      0
2025-03-17 20:39:00 -83.750000 -68.400000 -77.200000      0
2025-03-17 20:40:00 -78.964286 -59.944444 -83.500000      0
2025-03-17 20:41:00 -72.888889 -70.000000 -74.416667      0
2025-03-17 20:42:00 -72.846154 -61.266667 -87.733333      0
...                        ...        ...        ...    ...
2025-03-19 15:46:00 -61.157895 -64.000000 -83.937500      1
2025-03-19 15:47:00 -61.052632 -63.500000 -83.894737      1
2025-03-19 15:48:00 -61.833333 -64.750000 -83.555556      1
2025-03-19 15:49:00 -61.947368 -64.333333 -84.058824      1
2025-03-19 15:50:00 -61.650000 -64.500000 -84.363636      1

[107 rows x 4 columns]


In [82]:
station_df.to_csv("station_df.csv")

In [84]:
import csv

label_data = []
with open("station_df.csv", mode="r") as file:
    reader = csv.reader(file)
    for idx, row in enumerate(reader):
        if idx == 0:
            continue
        label_data.append(row)
train_df = pd.DataFrame(label_data, columns=["timestamp", "station1", "station2", "station3", "label"])
train_df["station1"] = train_df["station1"].astype(float)
train_df["station2"] = train_df["station2"].astype(float)
train_df["station3"] = train_df["station3"].astype(float)
train_df["label"] = train_df["label"].astype(int)

In [96]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

x = train_df[["station1", "station2", "station3"]]
y = train_df["label"]

model = DecisionTreeClassifier(max_depth=8)
model.fit(x, y)
pred = model.predict([[-60, -80, -90]])
result = [key for key, val in column_mapping.items() if val == pred]
result



['station2']