In [1]:
#################################################
# 0. Libraries

from dataclasses import dataclass
import numpy as np 
import pickle
import pandas as pd
import os
import glob
from tqdm.notebook import tqdm
from datetime import datetime
import json
import math
from functools import reduce
from joblib import parallel_backend, Parallel, delayed
import scipy.signal as signal

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})
from PIL import Image
import plotly.graph_objs as go
import cv2

np.random.seed(12)
np.set_printoptions(suppress=True)
pd.set_option('display.max_colwidth', 1000)

#################################################

In [16]:
#################################################
# 1. Global Variables & Paths

# Paths
# All locations are relative to the notebook's working directory.

PATH_DATA = '../../01_Data/'
path_data_train = PATH_DATA + 'train/'
path_data_test = PATH_DATA + 'test/'
path_metadata = PATH_DATA + 'metadata/'


# Global Variables

# Maps a floor folder name to an integer floor index (later cells look up the
# parent directory name of each training file in this table).
DICT_FLOOR_MAP = {'1F' :  0, '2F' : 1, '3F' : 2, '4F' : 3, '5F' : 4, 
                     '6F' : 5, '7F' : 6, '8F' : 7, '9F' : 8,
                     'B'  : -1, 'B1' : -1, 'B2' : -2, 'B3' : -3, 
                     'BF' : -1, 'BM' : -1, 
                     'F1' : 0, 'F2' : 1, 'F3' : 2, 'F4' : 3, 'F5' : 4, 
                     'F6' : 5, 'F7' : 6, 'F8' : 7, 'F9' : 8, 'F10': 9,
                     'L1' : 0, 'L2' : 1, 'L3' : 2, 'L4' : 3, 'L5' : 4, 
                     'L6' : 5, 'L7' : 6, 'L8' : 7, 'L9' : 8, 'L10': 9, 
                     'L11': 10,
                     'G'  : 0, 'LG1': 0, 'LG2': 1, 'LM' : 0, 'M'  : 0, 
                     'P1' : 0, 'P2' : 1,}

# Fixed number of rows each per-waypoint sequence is truncated/zero-padded to
# by getSequencesImu / getSequencesWifi / getSequencesBeacon.
SEQ_LEN_IMU = 128
SEQ_LEN_WIFI = 150
SEQ_LEN_BEACON = 100

# Row count of the per-trace wifi/beacon arrays built by getAllFloorPrediction.
NETWORK_SIZE = 550
# NOTE(review): VERSION is only referenced from commented-out code in the visible cells.
VERSION = '010'

# Training files sit three directory levels below train/; later cells read the
# site, floor and trace name from those three path components.
list_train_paths = glob.glob(path_data_train + '*/*/*')
list_test_paths = glob.glob(path_data_test + '*')
list_metadata_paths = glob.glob(path_metadata + '*')

#################################################

In [17]:
#################################################
# 2. Classes

# copy from https://github.com/location-competition/indoor-location-competition-20/blob/master/io_f.py
@dataclass
class ReadData:
    """Per-sensor arrays parsed from a single trace file by readDataFile.

    Any sensor with no rows in the file is stored as an empty array.
    """
    # Motion sensors: one row per reading, [timestamp, x, y, z].
    acce: np.ndarray
    acce_uncali: np.ndarray
    gyro: np.ndarray
    gyro_uncali: np.ndarray
    magn: np.ndarray
    magn_uncali: np.ndarray
    # Rotation vector readings, same [timestamp, x, y, z] layout.
    ahrs: np.ndarray
    # String rows: [timestamp, ssid, bssid, rssi, last_seen_timestamp].
    wifi: np.ndarray
    # String rows: [timestamp, 'uuid_major_minor', rssi, last field of the line].
    ibeacon: np.ndarray
    # Ground-truth rows: [timestamp, x, y].
    waypoint: np.ndarray
        
    
class Experiment(object):
    """Attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # The keyword dict itself is adopted as the instance namespace.
        self.__dict__ = kwargs

    def __repr__(self):
        # Rendered exactly like the underlying attribute dict.
        return repr(vars(self))

#################################################

In [18]:
#################################################
# 3. Functions

def getMeanStd(df, cols):
    """Return (means, stds): two dicts mapping each column in `cols` to its mean / std in `df`."""
    means = dict((col, df[col].mean()) for col in cols)
    stds = dict((col, df[col].std()) for col in cols)
    return means, stds


def getCategories(df, cols):
    """Convert the given columns of `df` to categoricals (in place) and collect their categories.

    Columns that are already categorical are left untouched (a message is printed).
    Returns a dict mapping column name -> array of that column's categories.
    """
    categories_by_col = {}
    for col in cols:
        is_categorical = df[col].dtype.name == 'category'
        if not is_categorical:
            df[col] = pd.Categorical(df[col])
        else:
            print(f'{col} already categorized')
        categories_by_col[col] = df[col].cat.categories.values
    return categories_by_col


# https://github.com/location-competition/indoor-location-competition-20/blob/master/io_f.py
def readDataFile(path_input):
    """Parse one tab-separated trace file into a ReadData record.

    Blank lines and lines starting with '#' are ignored. The second field of
    every remaining line names the record type; unknown types are skipped.
    Motion-sensor and waypoint rows are converted to numbers, wifi and beacon
    rows are kept as strings.
    """
    # Record types stored as [timestamp, x, y, z]; the insertion order matches
    # the positional order of the ReadData fields.
    xyz_rows = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    wifi, ibeacon, waypoint = [], [], []

    with open(path_input, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped or stripped[0] == '#':
            continue

        fields = stripped.split('\t')
        record_type = fields[1]

        if record_type in xyz_rows:
            # Rotation-vector lines with fewer than five fields are dropped;
            # every other sensor line is expected to be complete.
            if record_type != 'TYPE_ROTATION_VECTOR' or len(fields) >= 5:
                xyz_rows[record_type].append(
                    [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])])
        elif record_type == 'TYPE_WIFI':
            # [timestamp, ssid, bssid, rssi, last seen timestamp]
            wifi.append([fields[0], fields[2], fields[3], fields[4], fields[6]])
        elif record_type == 'TYPE_BEACON':
            # uuid, major and minor are fused into one identifier string
            beacon_id = '_'.join([fields[2], fields[3], fields[4]])
            ibeacon.append([fields[0], beacon_id, fields[6], fields[-1]])
        elif record_type == 'TYPE_WAYPOINT':
            waypoint.append([int(fields[0]), float(fields[2]), float(fields[3])])

    sensor_arrays = [np.array(rows) for rows in xyz_rows.values()]
    return ReadData(*sensor_arrays, np.array(wifi), np.array(ibeacon), np.array(waypoint))

    
# Prepare IMU data
    
def getImuData(sample_file):
    """Join all seven motion-sensor streams of a trace into one DataFrame.

    Returns an empty DataFrame if any of the seven streams has no rows.
    Otherwise the streams are inner-joined on 'timestamp', sorted by it, and
    the accelerometer and gyroscope columns (calibrated and uncalibrated) are
    replaced by their running sums. Magnetometer and rotation-vector columns
    are left as raw readings.
    """
    sensor_names = ['acce', 'acce_uncali', 'gyro', 'gyro_uncali',
                    'magn', 'magn_uncali', 'ahrs']

    if any(getattr(sample_file, name).shape[0] == 0 for name in sensor_names):
        return pd.DataFrame()

    # One frame per sensor: timestamp plus <sensor>_x / _y / _z.
    frames = []
    for name in sensor_names:
        readings = getattr(sample_file, name)
        frames.append(pd.DataFrame({
            'timestamp': readings[:, 0],
            f'{name}_x': readings[:, 1],
            f'{name}_y': readings[:, 2],
            f'{name}_z': readings[:, 3],
        }))

    # Only timestamps present in every stream survive the successive inner joins.
    df_imu = frames[0]
    for frame in frames[1:]:
        df_imu = pd.merge(df_imu, frame, on=['timestamp'], how='inner')
    df_imu = df_imu.sort_values(['timestamp']).reset_index(drop=True)

    cumulative_cols = [f'{name}_{axis}'
                       for name in ('acce', 'acce_uncali', 'gyro', 'gyro_uncali')
                       for axis in ('x', 'y', 'z')]
    df_imu[cumulative_cols] = df_imu[cumulative_cols].cumsum()

    return df_imu


def getWifiData(sample_file):
    """Build the wifi DataFrame of a trace, sorted by scan timestamp.

    Columns: timestamp, ssid, bssid, rssi, last_seen_timestamp. Both timestamp
    columns are cast to int64; the other columns keep the values parsed from
    the file. Returns an empty DataFrame when the trace has no wifi rows.
    """
    if sample_file.wifi.shape[0] <= 0:
        return pd.DataFrame()

    wifi_columns = ['timestamp', 'ssid', 'bssid', 'rssi', 'last_seen_timestamp']
    df_wifi = pd.DataFrame(sample_file.wifi, columns=wifi_columns)
    for ts_col in ('timestamp', 'last_seen_timestamp'):
        df_wifi[ts_col] = df_wifi[ts_col].astype(np.int64)

    return df_wifi.sort_values(['timestamp']).reset_index(drop=True)


def getBeaconData(sample_file, test=False):
    """Build the beacon DataFrame of a trace.

    With test=False the last parsed column is discarded and the result has
    columns timestamp, uuid, rssi. With test=True that column is kept as
    'last_timestamp'. Timestamp columns are cast to int64. Returns an empty
    DataFrame when the trace has no beacon rows.
    """
    if sample_file.ibeacon.shape[0] <= 0:
        return pd.DataFrame()

    df_beacon = pd.DataFrame(sample_file.ibeacon)
    if test:
        df_beacon.columns = ['timestamp', 'uuid', 'rssi', 'last_timestamp']
        int_columns = ['timestamp', 'last_timestamp']
    else:
        df_beacon = df_beacon.iloc[:, :-1]
        df_beacon.columns = ['timestamp', 'uuid', 'rssi']
        int_columns = ['timestamp']

    for col in int_columns:
        df_beacon[col] = df_beacon[col].astype(np.int64)

    return df_beacon


def getWaypointData(sample_file):
    """Build the waypoint DataFrame of a trace.

    Columns:
        timestamp, waypoint_x, waypoint_y -- as parsed from the file
        ts_diff_start -- time elapsed since the first waypoint
        ts_diff_last  -- time elapsed since the previous waypoint (0 for the first)

    A trace without waypoint rows yields an empty DataFrame that still has
    these five columns, so callers can test `shape[0]` instead of getting a
    column-count error.
    """
    # reshape(-1, 3) turns the 1-D empty array of a waypoint-less trace into a
    # (0, 3) array; a populated (n, 3) array passes through unchanged.
    waypoints = np.asarray(sample_file.waypoint).reshape(-1, 3)
    df_waypoint = pd.DataFrame(waypoints, columns=['timestamp', 'waypoint_x', 'waypoint_y'])

    # Gap to the previous waypoint; the first row has no predecessor -> 0.
    step = df_waypoint['timestamp'].diff().fillna(0.)
    df_waypoint['ts_diff_start'] = step.cumsum()
    df_waypoint['ts_diff_last'] = step

    return df_waypoint


## Sequences

def getSequencesImu(df, df_w, window_mean=5):
    """Build one fixed-size IMU feature window per waypoint.

    df          -- IMU frame (first column 'timestamp', then the sensor columns)
    df_w        -- waypoint frame; only its 'timestamp' column is read
    window_mean -- number of consecutive IMU rows averaged into one output row

    Returns a list with one (1, SEQ_LEN_IMU, 24) array per waypoint. The first
    waypoint always gets an all-zero window; every later waypoint gets the
    readings recorded since the previous waypoint. An empty `df` yields an
    empty list.
    """
    timestamps = df_w['timestamp'].values
    min_ts, max_ts = df_w['timestamp'].min(), df_w['timestamp'].max()
    if df.shape[0] < 1:
        return []

    inside_trace = (df['timestamp'] >= min_ts) & (df['timestamp'] <= max_ts)
    df = df[inside_trace].reset_index(drop=True)

    list_seqs = []
    for idx in range(len(timestamps)):
        if idx == 0:
            window = np.zeros((SEQ_LEN_IMU, 24))
        else:
            seg_start, seg_end = timestamps[idx-1], timestamps[idx]
            in_segment = (df['timestamp'] >= seg_start) & (df['timestamp'] < seg_end)
            segment = df[in_segment].reset_index(drop=True)
            # Average every `window_mean` consecutive rows (timestamp included).
            segment = segment.groupby(np.arange(len(segment)) // window_mean).mean()
            # The three time features are appended after the sensor columns.
            segment['ts_diff_w'] = np.abs(seg_start - segment['timestamp'])
            segment['ts_diff_w0'] = np.abs(min_ts - segment['timestamp'])
            segment['timestamp'] = pd.to_datetime(segment['timestamp']/1000.0, unit='s')
            previous_ts = segment['timestamp'].shift(periods=1)
            zero_delta = segment['timestamp'] - segment['timestamp']
            step = (segment['timestamp'] - previous_ts).fillna(zero_delta)
            segment['ts_diff_last'] = step.apply(lambda delta: delta.total_seconds())
            # Keep the most recent rows and drop the leading timestamp column.
            window = segment.values[-SEQ_LEN_IMU:, 1:].astype(np.float32)

        missing_rows = SEQ_LEN_IMU - window.shape[0]
        window = np.pad(window, ((0, missing_rows), (0, 0)), constant_values=(0))
        list_seqs.append(np.expand_dims(window, 0))

    return list_seqs


def getSequencesWifi(df, df_w, dict_unique_ssid, dict_unique_bssid):
    """Build one fixed-size wifi feature window per waypoint.

    df                -- wifi frame with columns timestamp, ssid, bssid, rssi,
                         last_seen_timestamp
    df_w              -- waypoint frame; only its 'timestamp' column is read
    dict_unique_ssid  -- ssid -> integer code; must contain the key '<NA>'
    dict_unique_bssid -- bssid -> integer code; must contain the key '<NA>'

    Returns a list with one (1, SEQ_LEN_WIFI, 6) array per waypoint. The six
    features are: ssid code, bssid code, rssi / 100, ts_diff_sec, ts_diff_w,
    ts_diff_w0. Rows are ordered strongest signal first and zero-padded at the
    end. An empty `df` yields all-zero windows of the same shape.
    """
    list_seqs = []
    timestamps = df_w['timestamp'].values
    min_ts, max_ts = df_w['timestamp'].min(), df_w['timestamp'].max()
    if df.shape[0]>=1:
        df = df[df['timestamp'] <= max_ts].reset_index(drop=True)
        for i in range(len(timestamps)):
            if i==0:
                # First waypoint: everything last seen before it.
                ts_actual = timestamps[i]
                df_tmp_ = df[df['last_seen_timestamp'] < ts_actual].reset_index(drop=True)
            else:
                # Later waypoints: signals last seen since the previous waypoint.
                ts_actual, ts_next = timestamps[i-1], timestamps[i]
                df_tmp_ = df[(df['last_seen_timestamp'] >= ts_actual) & (df['last_seen_timestamp'] < ts_next)].reset_index(drop=True)

            # Most relevant signals
            df_tmp_['rssi'] = df_tmp_['rssi'].astype(np.int32) / 100.0
            df_tmp_ = df_tmp_.sort_values(['rssi'], ascending=False).reset_index(drop=True)
            df_tmp_['ts_diff_sec'] = df_tmp_['timestamp'] - df_tmp_['last_seen_timestamp']
            df_tmp_['ts_diff_w'] = ts_actual - df_tmp_['last_seen_timestamp']
            df_tmp_['ts_diff_w0'] = np.abs(min_ts - df_tmp_['last_seen_timestamp'])
            # Identifiers missing from the dictionaries fall back to the '<NA>' code.
            df_tmp_['ssid'] = df_tmp_['ssid'].apply(lambda x: dict_unique_ssid[x] if x in dict_unique_ssid else dict_unique_ssid['<NA>'])
            df_tmp_['bssid'] = df_tmp_['bssid'].apply(lambda x: dict_unique_bssid[x] if x in dict_unique_bssid else dict_unique_bssid['<NA>'])
            df_tmp_ = df_tmp_.drop(['last_seen_timestamp'], axis=1)
            # Drop the leading timestamp column -> 6 feature columns.
            arr_values = df_tmp_.values[:SEQ_LEN_WIFI, 1:].astype(np.float32)
            padt = (0, (SEQ_LEN_WIFI-arr_values.shape[0]))
            arr_values = np.pad(arr_values, (padt, (0, 0)), constant_values=(0))
            list_seqs.append(np.expand_dims(arr_values, 0))
    else:
        # Same (SEQ_LEN_WIFI, 6) layout as the populated branch; the previous
        # (SEQ_LEN_BEACON, 7) placeholder did not match it.
        arr_values = np.zeros((SEQ_LEN_WIFI, 6))
        list_seqs = [np.expand_dims(arr_values, 0) for _ in range(timestamps.shape[0])]
        
    return list_seqs


def getSequencesBeacon(df, df_w, dict_unique_uuid):
    """Build one fixed-size beacon feature window per waypoint.

    df               -- beacon frame with columns timestamp, uuid, rssi
    df_w             -- waypoint frame; only its 'timestamp' column is read
    dict_unique_uuid -- uuid -> integer code; must contain the key '<NA>'

    Returns a list with one (1, SEQ_LEN_BEACON, 4) array per waypoint. The four
    features are: uuid code, rssi / 100, ts_diff_w, ts_diff_w0. Rows are ordered
    strongest signal first and zero-padded at the end. An empty `df` yields
    all-zero windows.
    """
    timestamps = df_w['timestamp'].values
    min_ts, max_ts = df_w['timestamp'].min(), df_w['timestamp'].max()

    if df.shape[0] < 1:
        blank = np.zeros((SEQ_LEN_BEACON, 4))
        return [np.expand_dims(blank, 0) for _ in range(timestamps.shape[0])]

    def encode_uuid(value):
        # Unknown beacons share the '<NA>' code.
        if value in dict_unique_uuid:
            return dict_unique_uuid[value]
        return dict_unique_uuid['<NA>']

    df = df[df['timestamp'] <= max_ts].reset_index(drop=True)

    list_seqs = []
    for idx in range(len(timestamps)):
        if idx == 0:
            # First waypoint: every reading up to and including it.
            ts_actual = timestamps[idx]
            selected = df['timestamp'] <= ts_actual
        else:
            # Later waypoints: readings since the previous waypoint.
            ts_actual, ts_next = timestamps[idx-1], timestamps[idx]
            selected = (df['timestamp'] >= ts_actual) & (df['timestamp'] < ts_next)
        window = df[selected].reset_index(drop=True)

        # Most relevant signals
        window['rssi'] = window['rssi'].astype(np.int32) / 100.0
        window = window.sort_values(['rssi'], ascending=False).reset_index(drop=True)
        window['ts_diff_w'] = ts_actual - window['timestamp']
        window['ts_diff_w0'] = np.abs(min_ts - window['timestamp'])
        window['uuid'] = window['uuid'].apply(encode_uuid)

        # Drop the leading timestamp column, cap the length, pad with zeros.
        features = window.values[:SEQ_LEN_BEACON, 1:].astype(np.float32)
        missing_rows = SEQ_LEN_BEACON - features.shape[0]
        features = np.pad(features, ((0, missing_rows), (0, 0)), constant_values=(0))
        list_seqs.append(np.expand_dims(features, 0))

    return list_seqs


def getWaypointDataSampleSubmission(df_ss, path):
    """Select the sample-submission rows of one trace and add elapsed-time columns.

    Returns the rows of `df_ss` whose 'path' equals `path` (index reset), with
    'ts_diff_start' (time since the first row) and 'ts_diff_last' (time since
    the previous row, 0 for the first row) added.
    """
    rows = df_ss.loc[df_ss['path'] == path].reset_index(drop=True)
    ts = rows['timestamp']
    rows['ts_diff_start'] = ts.diff().fillna(0.).cumsum()
    since_previous = ts - ts.shift(periods=1)
    rows['ts_diff_last'] = since_previous.fillna(ts - ts)
    return rows

# AllSequences for floor prediction 

def getAllFloorPrediction(df_w, df_imu_, df_wifi_, df_beacon_, window_mean=20):
    """Condense a whole trace into three fixed-size arrays.

    df_w        -- waypoint frame; only the min/max of 'timestamp' is used
    df_imu_     -- IMU frame (needs 'timestamp' and the six magnetometer columns)
    df_wifi_    -- wifi frame (needs timestamp, ssid, bssid, rssi)
    df_beacon_  -- beacon frame (timestamp, uuid, rssi) or an empty frame
    window_mean -- number of consecutive IMU rows averaged into one output row

    Returns (imu_data, wifi_data, beacon_data): 100 rows of 6 magnetometer
    values, NETWORK_SIZE rows of [ssid code, bssid code, rssi], and
    NETWORK_SIZE rows of [uuid code, rssi], each zero-padded at the end.

    NOTE: dict_unique_ssid, dict_unique_bssid, dict_unique_uuid and
    NETWORK_SIZE are not parameters; they are read from module-level names
    and must exist before this function is called.
    """
    min_ts, max_ts = df_w['timestamp'].min(), df_w['timestamp'].max()
    # Keep only IMU / wifi rows recorded between the first and last waypoint
    df_imu_ = df_imu_[(df_imu_['timestamp'] >= min_ts) & (df_imu_['timestamp'] <= max_ts)].reset_index(drop=True)
    df_wifi_ = df_wifi_[(df_wifi_['timestamp'] >= min_ts) & (df_wifi_['timestamp'] <= max_ts)].reset_index(drop=True)
    
    # Imu: magnetometer columns only, averaged in blocks of `window_mean` rows
    df_imu_ = df_imu_[['timestamp', 'magn_x', 'magn_y', 'magn_z', 'magn_uncali_x', 'magn_uncali_y', 'magn_uncali_z']]
    df_imu_ = df_imu_.groupby(np.arange(len(df_imu_)) // window_mean).mean().reset_index(drop=True)
    # First 100 averaged rows, timestamp column dropped, zero-padded to 100 rows
    imu_data = df_imu_.values[:100, 1:]
    padt = (0, (100-imu_data.shape[0]))
    imu_data = np.pad(imu_data, (padt, (0, 0)), constant_values=(0))
    
    # Wifi: strongest signals first, identifiers replaced by integer codes
    df_wifi_['rssi'] = df_wifi_['rssi'].astype(np.int32) / 100.0
    df_wifi_ = df_wifi_.sort_values(['rssi'], ascending=False).reset_index(drop=True)
    df_wifi_ = df_wifi_[['timestamp', 'ssid', 'bssid', 'rssi']]
    # Identifiers missing from the dictionaries fall back to the '<NA>' code
    df_wifi_['ssid'] = df_wifi_['ssid'].apply(lambda x: dict_unique_ssid[x] if x in dict_unique_ssid else dict_unique_ssid['<NA>'])
    df_wifi_['bssid'] = df_wifi_['bssid'].apply(lambda x: dict_unique_bssid[x] if x in dict_unique_bssid else dict_unique_bssid['<NA>'])
    wifi_data = df_wifi_.values[:NETWORK_SIZE, 1:]
    padt = (0, (NETWORK_SIZE-wifi_data.shape[0]))
    wifi_data = np.pad(wifi_data, (padt, (0, 0)), constant_values=(0))
    
    # Beacon: same treatment as wifi; all zeros when the trace has no beacon rows
    if len(df_beacon_)>0:
        df_beacon_ = df_beacon_[['timestamp', 'uuid', 'rssi']]
        df_beacon_ = df_beacon_[(df_beacon_['timestamp'] >= min_ts) & (df_beacon_['timestamp'] <= max_ts)].reset_index(drop=True)
        df_beacon_['rssi'] = df_beacon_['rssi'].astype(np.int32) / 100.0
        df_beacon_ = df_beacon_.sort_values(['rssi'], ascending=False).reset_index(drop=True)
        df_beacon_['uuid'] = df_beacon_['uuid'].apply(lambda x: dict_unique_uuid[x] if x in dict_unique_uuid else dict_unique_uuid['<NA>'])
        beacon_data = df_beacon_.values[:NETWORK_SIZE, 1:]
        padt = (0, (NETWORK_SIZE-beacon_data.shape[0]))
        beacon_data = np.pad(beacon_data, (padt, (0, 0)), constant_values=(0))
    else:
        beacon_data = np.zeros((NETWORK_SIZE, 2))
    return imu_data, wifi_data, beacon_data


def getTestGapTimeStamp(df_beacon_, df_wifi_):
    """Return the offset between the two clocks recorded in a test trace.

    With beacon rows available, the offset is `last_timestamp - timestamp`,
    which must be the same for every beacon row (AssertionError otherwise).
    Without beacon rows, it is the largest `last_seen_timestamp - timestamp`
    over all wifi scans.

    df_beacon_ -- beacon frame with 'timestamp' and 'last_timestamp', or empty
    df_wifi_   -- wifi frame with 'timestamp' and 'last_seen_timestamp'; only
                  read when df_beacon_ is empty
    """
    if len(df_beacon_)>=1:
        gap = df_beacon_['last_timestamp'] - df_beacon_['timestamp']
        assert gap.unique().shape[0]==1
        return gap.values[0]

    wifi_groups = df_wifi_.groupby('timestamp')
    # Both sides are cast to int64 explicitly: plain `int` is 32 bits wide on
    # some platforms, which cannot hold millisecond epoch timestamps.
    latest_seen = wifi_groups['last_seen_timestamp'].max().astype(np.int64)
    scan_ts = wifi_groups['timestamp'].max().astype(np.int64)
    return (latest_seen - scan_ts).max()
    
#################################################

In [19]:
#################################################
# 4. Get Wifi & Beacon Categories

# Accumulators for the per-file unique identifiers returned by readCategories;
# they are filled file by file and then collapsed into sorted unique arrays.
list_unique_ssid = []
list_unique_bssid = []
list_unique_uuid = []
# NOTE(review): list_features is not used in the visible cells -- confirm before removing.
list_features = []


def readCategories(file):
    """Return the unique wifi ssids, wifi bssids and beacon uuids found in one trace file.

    The result is a 3-tuple of arrays (ssids, bssids, uuids); a sensor with no
    rows in the file contributes an empty array.
    """
    sample_file = readDataFile(path_input=file)
    df_wifi = getWifiData(sample_file)
    df_beacon = getBeaconData(sample_file)

    has_wifi = df_wifi.shape[0] > 0
    has_beacon = df_beacon.shape[0] > 0

    ssids = df_wifi['ssid'].unique() if has_wifi else np.array([])
    bssids = df_wifi['bssid'].unique() if has_wifi else np.array([])
    uuids = df_beacon['uuid'].unique() if has_beacon else np.array([])
    return (ssids, bssids, uuids)
    
# Scan every training file in parallel for its wifi / beacon identifiers.
num_processors = 16
# with parallel_backend('threading', n_jobs=num_processors):
results = Parallel(n_jobs=num_processors)(delayed(readCategories)(file) for file in tqdm(list_train_paths))
    
for feat_ in tqdm(results):
    ssid, bssid, uuid  = feat_
    list_unique_ssid.append(ssid)
    list_unique_bssid.append(bssid)
    list_unique_uuid.append(uuid)
    
# np.unique returns sorted values, so the codes below are reproducible.
list_unique_ssid = np.unique(np.concatenate(list_unique_ssid))
list_unique_bssid = np.unique(np.concatenate(list_unique_bssid))
list_unique_uuid = np.unique(np.concatenate(list_unique_uuid))

# Codes start at 1; code 0 is reserved for identifiers never seen in training.
dict_unique_ssid = {value : i+1 for i, value in enumerate(list_unique_ssid)}
dict_unique_bssid = {value : i+1 for i, value in enumerate(list_unique_bssid)}
dict_unique_uuid = {value : i+1 for i, value in enumerate(list_unique_uuid)}

dict_unique_ssid['<NA>'] = 0
dict_unique_bssid['<NA>'] = 0
dict_unique_uuid['<NA>'] = 0

# Site name = last path component of each metadata entry (os.path.basename
# works with either path separator on Windows and with '/' elsewhere).
# The names are sorted before being numbered: iterating a set of strings
# follows hash order, which changes between interpreter sessions, so site
# codes would otherwise differ from one run to the next.
list_all_sites = sorted(set(os.path.basename(path) for path in list_metadata_paths))
dict_all_sites = {site_ : i for i, site_ in enumerate(list_all_sites)}

# #################################################

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26925.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
#################################################
# 5. Get num waypoints distributions on both sets

# df_sample_submission = pd.read_csv(PATH_DATA + 'sample_submission.csv')
# df_sample_submission['site'] = df_sample_submission['site_path_timestamp'].\
#                                               apply(lambda x: x.split('_'))
# df_sample_submission['path'] = df_sample_submission['site_path_timestamp'].\
#                                 apply(lambda x: x.split('_')[1])
# df_sample_submission['timestamp'] = df_sample_submission['site_path_timestamp'].\
#                                 apply(lambda x: x.split('_')[2]).astype(np.int64)/1000.0

# list_columns = ['site_path_timestamp', 'site', 'path', 'timestamp']
# df_sample_submission = df_sample_submission[list_columns]

# list_num_waypoints = []
# for file_ in tqdm(list_test_paths):
#     # Load data
#     sample_file = readDataFile(path_input=file_)
#     path = file_.split('\\')[-1].replace('.txt', '')
#     ## Preprocess data
#     df = getWaypointDataSampleSubmission(df_sample_submission, path)
#     if df.shape[0]>=1:
#         list_num_waypoints.append(df.shape[0])
        
# unique, counts = np.unique(list_num_waypoints, return_counts=True)
# print(pd.Series(list_num_waypoints).describe())

#Train
# count    26925.000000
# mean         6.190641
# std          5.814579
# min          1.000000
# 25%          3.000000
# 50%          4.000000
# 75%          7.000000
# max        107.000000

#Test
# count    626.000000
# mean      16.186901
# std        9.459229
# min        6.000000
# 25%       10.000000
# 50%       14.000000
# 75%       18.750000
# max      107.000000

# with open(f'{path_generated_data}dict_training_waypoints_{VERSION}.pkl', 'rb') as f:
#     dict_training_waypoints = pickle.load(f)

# list_num_waypoints = []
# for site in dict_training_waypoints:
#     for floor in dict_training_waypoints[site]:
#         assert len(dict_training_waypoints[site][floor]['x'])==len(dict_training_waypoints[site][floor]['y'])
#         if len(dict_training_waypoints[site][floor]['x']) >= 1:
#             list_num_waypoints.append(len(dict_training_waypoints[site][floor]['x']))   
        
# pd.Series(list_num_waypoints).describe()

# count     967.000000
# mean      172.371251
# std       247.254857
# min         3.000000
# 25%        43.000000
# 50%        81.000000
# 75%       168.500000
# max      2027.000000
# dtype: float64

#################################################

In [13]:
#################################################
# 5b. Get all training waypoints for each site (cell currently commented out)
# list_all_floors = np.unique(list(DICT_FLOOR_MAP.values()))
# list_all_sites = set([dict_all_sites[path.split("metadata\\")[-1]] for path in list_metadata_paths])

# dict_training_waypoints = {}
# for site in list_all_sites:
#     dict_training_waypoints[site] = {}
#     for floor in list_all_floors:
#         dict_training_waypoints[site][floor] = {
#             'x' : [],
#             'y' : []
#         }

# for train_file_ in tqdm(list_train_paths):
#     sample_file = readDataFile(path_input=train_file_)
#     floor = DICT_FLOOR_MAP[train_file_.split('\\')[-2]]
#     site = dict_all_sites[train_file_.split('\\')[1]]
#     df_waypoint = getWaypointData(sample_file)
#     if df_waypoint.shape[0]>=1:
#         df_waypoint['floor'] = floor
#         df_waypoint['site'] = site
#         dict_training_waypoints[site][floor]['x'].extend(df_waypoint['waypoint_x'].values/100.)
#         dict_training_waypoints[site][floor]['y'].extend(df_waypoint['waypoint_y'].values/100.)
        
# with open(f'{path_generated_data}/dict_training_waypoints_{VERSION}.pkl', 'wb') as f:
#     pickle.dump(dict_training_waypoints, f, protocol=pickle.HIGHEST_PROTOCOL)



#################################################

In [23]:
#################################################
# 6. Build Train sequences
# ~1h
# We don't have records before the starting point in training, so it's useless trying to predict it.
# For each path point (f, x, y) we will use the data from later timestamps to predict the actual waypoint.

path_output_data = PATH_DATA + 'GeneratedData/v0.3/train/'
list_all_sites = set([os.path.basename(path) for path in list_metadata_paths])
list_traces_train_filtered, list_floors_train_filtered = [], []
print('->Preparing train files...')
for train_file_ in tqdm(list_train_paths):
    # Load data
    sample_file = readDataFile(path_input=train_file_)
    # Training files are laid out as .../<site>/<floor>/<trace>.txt; normalising
    # the path first makes the split independent of the OS path separator.
    path_parts = os.path.normpath(train_file_).split(os.sep)
    floor = DICT_FLOOR_MAP[path_parts[-2]]
    site = dict_all_sites[path_parts[-3]]
    path = path_parts[-1].replace('.txt', '')
    ## Preprocess data
    df_waypoint = getWaypointData(sample_file)
    # df_waypoint = df_waypoint.iloc[1:, :]
    if df_waypoint.shape[0]>=1:
        df_waypoint['floor'] = floor
        df_waypoint['site'] = site
        df_imu = getImuData(sample_file)
        df_wifi = getWifiData(sample_file)
        # Traces without IMU or wifi data are skipped entirely.
        if df_imu.shape[0] >= 1 and df_wifi.shape[0] >= 1:
            df_beacon = getBeaconData(sample_file)
            
            ## Prepare sequences
            # Waypoint
            data_waypoint = df_waypoint[['site', 'ts_diff_start', 'ts_diff_last']]
            data_waypoint_y = df_waypoint[['floor', 'waypoint_x', 'waypoint_y']]
            # Imu
            seq_imu = getSequencesImu(df_imu, df_waypoint, window_mean=8)#0.02sec->0.18sec
            seq_imu = np.concatenate(seq_imu)
            # Wifi
            seq_wifi = getSequencesWifi(df_wifi, df_waypoint, dict_unique_ssid, dict_unique_bssid)
            seq_wifi = np.concatenate(seq_wifi)
            # Beacon
            seq_beacon = getSequencesBeacon(df_beacon, df_waypoint, dict_unique_uuid)
            seq_beacon = np.concatenate(seq_beacon)
            
            # Final assertions
            assert seq_imu.shape==(df_waypoint.shape[0], SEQ_LEN_IMU, 24)
            assert seq_wifi.shape==(df_waypoint.shape[0], SEQ_LEN_WIFI, 6)
            assert seq_beacon.shape==(df_waypoint.shape[0], SEQ_LEN_BEACON, 4)
            assert data_waypoint.shape==(df_waypoint.shape[0], 3)
            assert data_waypoint_y.shape==(df_waypoint.shape[0], 3)
            list_traces_train_filtered.append(path)
            list_floors_train_filtered.append(floor)
            
            # Get condensed data 
            all_imu, all_wifi, all_beacon = getAllFloorPrediction(df_waypoint, df_imu, df_wifi, df_beacon, window_mean=20)
            assert all_imu.shape == (100, 6)
            assert all_wifi.shape == (NETWORK_SIZE, 3)
            assert all_beacon.shape == (NETWORK_SIZE, 2)
            ## Save data
            # Seq

            path_out = path_output_data + path
            # makedirs also creates the missing parent folders on a first run
            # and is a no-op when the folder already exists.
            os.makedirs(path_out, exist_ok=True)
            np.save(f'{path_out}/seq_imu.npy', seq_imu)
            np.save(f'{path_out}/seq_wifi.npy', seq_wifi)
            np.save(f'{path_out}/seq_beacon.npy', seq_beacon)
            np.save(f'{path_out}/waypoint_data.npy', data_waypoint)
            np.save(f'{path_out}/waypoint_predict.npy', data_waypoint_y)
            # All
            np.save(f'{path_out}/all_imu.npy', all_imu)
            np.save(f'{path_out}/all_wifi.npy', all_wifi)
            np.save(f'{path_out}/all_beacon.npy', all_beacon)
            
# print(seq_imu.shape, seq_wifi.shape, seq_beacon.shape, data_waypoint.shape, data_waypoint_y.shape)

#################################################

->Preparing train files...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26925.0), HTML(value='')))





In [24]:
#################################################
# 7. Build Test sequences

path_output_data = PATH_DATA + 'GeneratedData/v0.3/test/'
list_all_sites = set([os.path.basename(path) for path in list_metadata_paths])
list_paths_test_filtered = []
print('->Preparing test files...')

# 7.1 Parse sample submission
# Each 'site_path_timestamp' id is split on '_' into its three parts.
df_sample_submission = pd.read_csv(PATH_DATA + 'sample_submission.csv')
df_sample_submission['site'] = df_sample_submission['site_path_timestamp'].\
                                              apply(lambda x: dict_all_sites[x.split('_')[0]])
df_sample_submission['path'] = df_sample_submission['site_path_timestamp'].\
                                apply(lambda x: x.split('_')[1])
df_sample_submission['timestamp'] = df_sample_submission['site_path_timestamp'].\
                                apply(lambda x: x.split('_')[2]).astype(np.int64)

list_columns = ['site_path_timestamp', 'site', 'path', 'timestamp']
df_sample_submission = df_sample_submission[list_columns]

# 7.2 Get Sequences
for i, test_file_ in enumerate(tqdm(list_test_paths)):
    # Load data
    sample_file = readDataFile(path_input=test_file_)
    # basename instead of splitting on '\\' so the trace name is found on any OS
    path = os.path.basename(test_file_).replace('.txt', '')
    ## Preprocess data
    df_waypoint = getWaypointDataSampleSubmission(df_sample_submission, path)
    df_beacon = getBeaconData(sample_file, test=True)
    df_wifi = getWifiData(sample_file)
    df_imu = getImuData(sample_file)
    # Skip traces without IMU or wifi data *before* touching their columns:
    # an empty frame has no 'timestamp' column to shift.
    if df_imu.shape[0] < 1 or df_wifi.shape[0] < 1:
        continue

    # Shift every timestamp by the trace's clock offset.
    gap = getTestGapTimeStamp(df_beacon, df_wifi)
    df_waypoint['timestamp'] = df_waypoint['timestamp'] + gap
    df_wifi['timestamp'] = df_wifi['timestamp'] + gap
    # Sanity check: shifted waypoint timestamps still look like millisecond epochs.
    assert np.sum((df_waypoint['timestamp']>=1e12) & (df_waypoint['timestamp']<=1e15))==len(df_waypoint)
    if len(df_beacon)>=1:
        df_beacon['timestamp'] = df_beacon['timestamp'] + gap
        df_beacon = df_beacon.drop(['last_timestamp'], axis=1)
    df_imu['timestamp'] = df_imu['timestamp'] + gap

    ## Prepare sequences
    # Waypoint
    data_waypoint = df_waypoint[['site', 'ts_diff_start', 'ts_diff_last']]
    # Imu
    seq_imu = getSequencesImu(df_imu, df_waypoint, window_mean=8)#0.02sec->0.18sec
    seq_imu = np.concatenate(seq_imu)
    # Wifi
    seq_wifi = getSequencesWifi(df_wifi, df_waypoint, dict_unique_ssid, dict_unique_bssid)
    seq_wifi = np.concatenate(seq_wifi)
    # Beacon
    seq_beacon = getSequencesBeacon(df_beacon, df_waypoint, dict_unique_uuid)
    seq_beacon = np.concatenate(seq_beacon)
    # Final assertions
    assert seq_imu.shape==(df_waypoint.shape[0], SEQ_LEN_IMU, 24)
    assert seq_wifi.shape==(df_waypoint.shape[0], SEQ_LEN_WIFI, 6)
    assert seq_beacon.shape==(df_waypoint.shape[0], SEQ_LEN_BEACON, 4)
    assert data_waypoint.shape==(df_waypoint.shape[0], 3)

    # Get condensed data 
    all_imu, all_wifi, all_beacon = getAllFloorPrediction(df_waypoint, df_imu, df_wifi, df_beacon, window_mean=20)
    ## Save data
    # Seq

    path_out = path_output_data + path
    # makedirs also creates the missing parent folders on a first run
    # and is a no-op when the folder already exists.
    os.makedirs(path_out, exist_ok=True)
    np.save(f'{path_out}/seq_imu.npy', seq_imu)
    np.save(f'{path_out}/seq_wifi.npy', seq_wifi)
    np.save(f'{path_out}/seq_beacon.npy', seq_beacon)
    np.save(f'{path_out}/waypoint_data.npy', data_waypoint)
    # All
    np.save(f'{path_out}/all_imu.npy', all_imu)
    np.save(f'{path_out}/all_wifi.npy', all_wifi)
    np.save(f'{path_out}/all_beacon.npy', all_beacon)

#################################################

->Preparing test files...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=626.0), HTML(value='')))




In [25]:
#################################################
# 8. Get stats

# Folders holding the generated arrays, one sub-folder per trace.
path_generated_data = PATH_DATA + 'GeneratedData/v0.3/'
list_paths_generated_data_train = glob.glob(path_generated_data + 'train/*')
list_paths_generated_data_test = glob.glob(path_generated_data + 'test/*')

# Per-trace statistics: each list receives one entry per training folder.
list_mean_imu, list_std_imu = [], []
list_mean_w, list_std_w = [], []
list_mean_wifi, list_std_wifi = [], []
list_mean_beacon, list_std_beacon = [], []

for trace_dir in tqdm(list_paths_generated_data_train):
    # Load the four arrays of the trace, each with a leading axis of length 1.
    batch_imu = np.expand_dims(np.load(trace_dir + '/seq_imu.npy'), axis=0)
    batch_waypoint = np.expand_dims(np.load(trace_dir + '/waypoint_data.npy'), axis=0)
    batch_wifi = np.expand_dims(np.load(trace_dir + '/seq_wifi.npy'), axis=0)
    batch_beacon = np.expand_dims(np.load(trace_dir + '/seq_beacon.npy'), axis=0)

    # IMU: every last-axis column is used, reduced over axes 1 and 2.
    list_mean_imu.append(batch_imu.mean(axis=(1, 2)).squeeze())
    list_std_imu.append(batch_imu.std(axis=(1, 2)).squeeze())

    # Waypoint: column 0 is left out (presumably the site id - TODO confirm).
    feat_waypoint = batch_waypoint[:, :, 1:]
    list_mean_w.append(feat_waypoint.mean(axis=1).squeeze())
    list_std_w.append(feat_waypoint.std(axis=1).squeeze())

    # Wifi: the first two columns are left out
    # (presumably the ssid / bssid codes - TODO confirm).
    feat_wifi = batch_wifi[:, :, :, 2:]
    list_mean_wifi.append(feat_wifi.mean(axis=(1, 2)).squeeze())
    list_std_wifi.append(feat_wifi.std(axis=(1, 2)).squeeze())

    # Beacon: the first column is left out (presumably the uuid code - TODO confirm).
    feat_beacon = batch_beacon[:, :, :, 1:]
    list_mean_beacon.append(feat_beacon.mean(axis=(1, 2)).squeeze())
    list_std_beacon.append(feat_beacon.std(axis=(1, 2)).squeeze())

# Final statistics are the unweighted average over traces of the per-trace
# values; note that std_* is an average of standard deviations, not a pooled one.
mean_imu = np.asarray(list_mean_imu).mean(axis=0)
std_imu = np.asarray(list_std_imu).mean(axis=0)
mean_w = np.asarray(list_mean_w).mean(axis=0)
std_w = np.asarray(list_std_w).mean(axis=0)
mean_wifi = np.asarray(list_mean_wifi).mean(axis=0)
std_wifi = np.asarray(list_std_wifi).mean(axis=0)
mean_beacon = np.asarray(list_mean_beacon).mean(axis=0)
std_beacon = np.asarray(list_std_beacon).mean(axis=0)

#################################################

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=24464.0), HTML(value='')))




In [26]:
#################################################
# 8. Get all waypoints

# path_generated_data = PATH_DATA + 'GeneratedData/'
# list_paths_generated_data_train = glob.glob(path_generated_data + 'train/*')
# list_paths_generated_data_test = glob.glob(path_generated_data + 'test/*')
# list_mean_imu, list_std_imu = [], []
# list_mean_wx, list_std_wx, list_mean_wy, list_std_wy = [], [], [], []
# df_all_waypoints = pd.DataFrame()
# for train_file_ in tqdm(list_train_paths):
#     # Load data
#     site = train_file_.split('\\')[1]
#     sample_file = readDataFile(path_input=train_file_)
#     df_waypoint = getWaypointData(sample_file)
#     df_waypoint['site'] = site
#     df_all_waypoints = pd.concat([df_all_waypoints, df_waypoint], axis=0).reset_index(drop=True)

# df_all_waypoints = df_all_waypoints.sort_values(['site', 'timestamp']). reset_index(drop=True)

#################################################

In [28]:
#################################################
# 9. Save experiment

# Collect every value stored in the Experiment container, grouped by kind.
experiment_fields = dict(
    # Normalisation statistics computed in the "Get stats" section
    mean_imu=mean_imu,
    std_imu=std_imu,
    mean_w=mean_w,
    std_w=std_w,
    mean_wifi=mean_wifi,
    std_wifi=std_wifi,
    mean_beacon=mean_beacon,
    std_beacon=std_beacon,
    # Filtered training traces / floors; list(set(...)) removes duplicated
    # floors and the order of the resulting list is not guaranteed.
    list_traces_train_filtered=list_traces_train_filtered,
    list_floors_train_filtered=list(set(list_floors_train_filtered)),
    # Lookup dictionaries
    dict_all_sites=dict_all_sites,
    dict_unique_uuid=dict_unique_uuid,
    dict_unique_bssid=dict_unique_bssid,
    dict_unique_ssid=dict_unique_ssid,
    dict_floor_map=DICT_FLOOR_MAP,
    # Settings
    seq_len_imu=SEQ_LEN_IMU,
    version=VERSION,
)
experiment = Experiment(**experiment_fields)

# Serialise the experiment next to the generated data, tagged with VERSION.
path_experiment = f'{path_generated_data}/Experiment_{VERSION}.pkl'
with open(path_experiment, 'wb') as file_out:
    pickle.dump(experiment, file_out, protocol=pickle.HIGHEST_PROTOCOL)
    
#################################################