###  Overview

* predict the`noise = ground truth - baseline` for latDeg & lngDeg with Phone IMU data.

* use and modify this stop prediction notebook -> https://www.kaggle.com/katomash/a-car-is-moving-or-not-accuracy-94

## Light GBM with IMU data


In [1]:
import pandas as pd
import pathlib
import numpy as np
import os

### Method:Make train data

In [1]:
#  making ground truth file
def make_gt(path, collectionName, phoneName, is_test=False):
    # ground_truth
    p = pathlib.Path(path)
    top_dir = "test" if is_test else "train"
    gt_files = list(p.glob(f'{top_dir}/*/*/ground_truth.csv'))
    
    
    gts = []
    for gt_file in gt_files:
        gts.append(pd.read_csv(gt_file))
    
    # baseline
    cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']
    
    if not is_test:
        baseline = pd.read_csv(path + '/baseline_locations_train.csv', usecols=cols)
        ground_truth = pd.concat(gts)
        ground_truth = ground_truth.merge(baseline, how='inner', on=cols[:3], suffixes=('_gt', '_bs'))

    else:
        baseline = pd.read_csv(path + '/baseline_locations_test.csv', usecols=cols)
        ground_truth = baseline

    ground_truth["millisSinceGpsEpoch_orig"] = ground_truth["millisSinceGpsEpoch"].to_numpy()
    ground_truth["millisSinceGpsEpoch"] = ground_truth["millisSinceGpsEpoch"]//1000
    if (collectionName is None) or (phoneName is None):
        return ground_truth
    else:
        return ground_truth[(ground_truth['collectionName'] == collectionName) & (ground_truth['phoneName'] == phoneName)]
    

def make_tag(df, tag_v):
    df.loc[df['speedMps'] < tag_v, 'tag'] = 1
    df.loc[df['speedMps'] >= tag_v, 'tag'] = 0
    return df


# loading gnss file
def gnss_log_to_dataframes(path):
    print('Loading ' + path, flush=True)
    gnss_section_names = {'Raw', 'UncalAccel', 'UncalGyro', 'UncalMag', 'Fix', 'Status', 'OrientationDeg'}
    with open(path) as f_open:
        datalines = f_open.readlines()

    datas = {k: [] for k in gnss_section_names}
    gnss_map = {k: [] for k in gnss_section_names}
    for dataline in datalines:
        is_header = dataline.startswith('#')
        dataline = dataline.strip('#').strip().split(',')
        # skip over notes, version numbers, etc
        if is_header and dataline[0] in gnss_section_names:
            try:
                gnss_map[dataline[0]] = dataline[1:]
            except:
                pass
        elif not is_header:
            try:
                datas[dataline[0]].append(dataline[1:])
            except:
                pass
    results = dict()
    for k, v in datas.items():
        results[k] = pd.DataFrame(v, columns=gnss_map[k])
    # pandas doesn't properly infer types from these lists by default
    for k, df in results.items():
        for col in df.columns:
            if col == 'CodeType':
                continue
            try:
                results[k][col] = pd.to_numeric(results[k][col])
            except:
                pass
    return results

def add_IMU(df, INPUT, cname, pname, is_test=False):
    phone = cname + "_"  + pname

    top_dir = "train" if not is_test else "test"
    path = INPUT + f"/{top_dir}/" + cname + "/" + pname + "/" + pname + "_GnssLog.txt"
    cache_path = str(Path("../data/raw_csvs/", cname + "_" + pname))
    print(cache_path)
    targets = ["UncalAccel", "UncalMag", "UncalGyro"]
    try:
        acce_df = pd.read_csv(cache_path + "_UncalAccel" + ".csv")
        magn_df = pd.read_csv(cache_path + "_UncalMag" + ".csv")
        gyro_df = pd.read_csv(cache_path + "_UncalGyro" + ".csv")

    except FileNotFoundError as e:
        gnss_dfs = gnss_log_to_dataframes(path)
        for target in targets:
            gnss_dfs[target].to_csv(cache_path + "_" + target + ".csv")
        acce_df = pd.read_csv(cache_path + "_UncalAccel" + ".csv")
        magn_df = pd.read_csv(cache_path + "_UncalMag" + ".csv")
        gyro_df = pd.read_csv(cache_path + "_UncalGyro" + ".csv")
    if len(acce_df) == 0:
        return df

    acce_df["millisSinceGpsEpoch"] = acce_df["utcTimeMillis"] - 315964800000
    acce_df["millisSinceGpsEpoch"] = acce_df["millisSinceGpsEpoch"] // 1000 + 18
    magn_df["millisSinceGpsEpoch"] = magn_df["utcTimeMillis"] - 315964800000
    magn_df["millisSinceGpsEpoch"] = magn_df["millisSinceGpsEpoch"] // 1000 + 18
    gyro_df["millisSinceGpsEpoch"] = gyro_df["utcTimeMillis"] - 315964800000
    gyro_df["millisSinceGpsEpoch"] = gyro_df["millisSinceGpsEpoch"] // 1000 + 18

    acce_df["x_f_acce"] = acce_df["UncalAccelZMps2"]
    acce_df["y_f_acce"] = acce_df["UncalAccelXMps2"]
    acce_df["z_f_acce"] = acce_df["UncalAccelYMps2"]
    # magn
    magn_df["x_f_magn"] = magn_df["UncalMagZMicroT"]
    magn_df["y_f_magn"] = magn_df["UncalMagYMicroT"]
    magn_df["z_f_magn"] = magn_df["UncalMagXMicroT"]
    # gyro
    gyro_df["x_f_gyro"] = gyro_df["UncalGyroXRadPerSec"]
    gyro_df["y_f_gyro"] = gyro_df["UncalGyroYRadPerSec"]
    gyro_df["z_f_gyro"] = gyro_df["UncalGyroZRadPerSec"]

    target_cols = [
        "collectionName",
        "phoneName",
        "millisSinceGpsEpoch",
        "millisSinceGpsEpoch_orig",
    ]

    if not is_test:
        target_cols.extend(
            [
                "heightAboveWgs84EllipsoidM",
                "speedMps",
                "latDeg_gt",
                "lngDeg_gt",
                "latDeg_bs",
                "lngDeg_bs",
            ]
        )

    accs = ["x_f_acce", "y_f_acce", "z_f_acce"]
    df = pd.merge_asof(
        df[target_cols].sort_values("millisSinceGpsEpoch"),
        acce_df[["millisSinceGpsEpoch"] + accs].sort_values("millisSinceGpsEpoch"),
        on="millisSinceGpsEpoch",
        direction="nearest",
    )
    target_cols.extend(accs)
    magns = ["x_f_magn", "y_f_magn", "z_f_magn"]
    df = pd.merge_asof(
        df[target_cols].sort_values("millisSinceGpsEpoch"),
        magn_df[["millisSinceGpsEpoch"] + magns].sort_values("millisSinceGpsEpoch"),
        on="millisSinceGpsEpoch",
        direction="nearest",
    )
    target_cols.extend(magns)
    gyros = ["x_f_gyro", "y_f_gyro", "z_f_gyro"]
    df = pd.merge_asof(
        df[target_cols].sort_values("millisSinceGpsEpoch"),
        gyro_df[["millisSinceGpsEpoch"] + gyros].sort_values("millisSinceGpsEpoch"),
        on="millisSinceGpsEpoch",
        direction="nearest",
    )
    return df


def make_train(INPUT, train_cname, tag_v, is_test=False):
    # make ground_truth file
    os.makedirs(Path("../data"), exist_ok=True)
    os.makedirs(Path("../data/raw_csvs/"), exist_ok=True)
    gt = make_gt(INPUT, None, None, is_test=is_test)
    train_df = pd.DataFrame()
    if is_test:
        train_cname = gt.collectionName.unique()
    for cname in train_cname:
        phone_list = gt[gt["collectionName"] == cname]["phoneName"].drop_duplicates()
        for pname in phone_list:
            df = gt[(gt["collectionName"] == cname) & (gt["phoneName"] == pname)]
            df = add_IMU(df, INPUT, cname, pname, is_test=is_test)
            train_df = pd.concat([train_df, df])
    if not is_test:
        # make tag
        train_df = make_tag(train_df, tag_v)
    return train_df

###  Method:Model(Light GBM)

In [3]:
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


def lgbm(train, test, col, lgb_params, target:str="tag", val=None):
    if target == "tag":
        model = lgb.LGBMClassifier(**lgb_params)
        mode = "class"
    else:
        model = lgb.LGBMRegressor(**lgb_params)
        mode = "reg"
        
    if val is not None:
        eval_set = [(val[col], val.loc[:, target].to_numpy())]
    else:
        None
    model.fit(train[col], train.loc[:, target].to_numpy(), eval_set=eval_set, early_stopping_rounds=20, verbose=False)
    preds = model.predict(test[col])
    if mode == "class":
        print('confusion matrix :  \n', confusion_matrix(preds, test[target]))
        print('accuracy score : ', accuracy_score(preds, test[target]))
        print('f1 score : ', f1_score(preds, test[target]))

    elif mode == "reg":
        test["phone"] = test.collectionName + "_" + test.phoneName
        test[target + "_pred"] = preds
        phone_mae = []
        # the fisrt step, there is no velocity info
        for phone, df_ in test.groupby("phone"):
            phone_mae.append(
                np.mean(
                    np.abs(df_[target + "_pred"].values - df_[target].values)
                )
            )
        print('mae score : ', np.mean(np.abs(preds - test[target])))
        print('phone mae score : ', np.mean(phone_mae))

    return model, preds

### Method:Confirm Score

In [4]:
def get_train_score(df):
    # calc_distance_error
    df['err'] =  calc_haversine(df.latDeg_bs, df.lngDeg_bs, 
    df.latDeg_gt, df.lngDeg_gt)
    # calc_evaluate_score
    df['phone'] = df['collectionName'] + '_' + df['phoneName']
    res = df.groupby('phone')['err'].agg([percentile50, percentile95])
    res['p50_p90_mean'] = (res['percentile50'] + res['percentile95']) / 2 
    score = res['p50_p90_mean'].mean()
    return score


def percentile50(x):
    return np.percentile(x, 50)


def percentile95(x):
    return np.percentile(x, 95)


def calc_haversine(lat1, lon1, lat2, lon2):
    """Calculates the great circle distance between two points
    on the earth. Inputs are array-like and specified in decimal degrees.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(a**0.5)
    dist = 6_367_000 * c
    return dist

In [5]:
from typing import List
def calc_avg_vel(df: pd.DataFrame, add_future: bool = False, global_targets:List[str] = ["latDeg_gt"]):
#     if is_database:
#         global_targets = ["latDeg_gt", "lngDeg_gt"]
#     else:
#         global_targets = ["latDeg", "lngDeg"]

#     local_targets = [key + "_diff_prev" for key in global_targets]
    local_targets = [key for key in global_targets]
    window_sizes = [5, 15, 45]
    new_targets = []
    dfs = []
    for phone, df_ in df.groupby("phone"):
        for window_size in window_sizes:
            tri_center = (
                df_[local_targets]
                .rolling(window=window_size, min_periods=1, center=False)
                .mean()
            ).fillna(0.0)
            for target in local_targets:
                new_targets.append(target + "_prev_" + str(window_size))
                df_[new_targets[-1]] = tri_center[target]
                if add_future:
                    next_target = new_targets[-1].replace("prev", "next")
                    df_[next_target] = (
                        df_[new_targets[-1]].shift(-window_size).fillna(0.0)
                    )
                    new_targets.append(next_target)
        dfs.append(df_)

    new_targets = sorted(list(set(new_targets)))
    df = pd.concat(dfs, axis=0)
    local_targets.extend(new_targets)
#     local_targets = local_targets + ["latDeg_diff_next", "lngDeg_diff_next"]
    df.reset_index(drop=False, inplace=True)
    return df, global_targets, local_targets

### Initial value

Please adjust the parameters as you like.

## split

In [6]:
#
from src.dataset.datamodule import make_split, merge_split_info
from pathlib import Path
df_path = pd.read_csv(
    Path("./src/meta_data/path_meta_info.csv")
)
# train/val split
df_path = make_split(df=df_path, n_splits=3)

all_cname = df_path.collectionName.values.tolist()

In [7]:
INPUT = '../input/google-smartphone-decimeter-challenge'

In [8]:
tag_v = 5.0e-7/1.0e-5
col = ["x_f_acce",  "z_f_acce", "x_f_gyro", "y_f_gyro"]
# col = ["x_f_acce", "y_f_acce", "z_f_acce", "x_f_gyro", "y_f_gyro", "z_f_gyro"]
# col = ["x_f_acce", "y_f_acce", "z_f_acce", "x_f_magn", "y_f_magn", "z_f_magn", "x_f_gyro", "y_f_gyro", "z_f_gyro"]
# parameter
lgb_params = {
    'num_leaves': 90,
    'n_estimators': 125,

}

### Main

In [9]:
all_imu_path = "../data/imu_data.csv"
try: 
    train_df = pd.read_csv(all_imu_path)
except Exception as e:
    train_df = make_train(INPUT, all_cname, tag_v)
    train_df.to_csv(all_imu_path, index=False)

../data/raw_csvs/2020-05-14-US-MTV-1_Pixel4XLModded
../data/raw_csvs/2020-05-14-US-MTV-1_Pixel4
../data/raw_csvs/2020-05-14-US-MTV-2_Pixel4XLModded
../data/raw_csvs/2020-05-14-US-MTV-2_Pixel4
../data/raw_csvs/2020-05-21-US-MTV-1_Pixel4
../data/raw_csvs/2020-05-21-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-05-21-US-MTV-2_Pixel4
../data/raw_csvs/2020-05-29-US-MTV-1_Pixel4XLModded
../data/raw_csvs/2020-05-29-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-05-29-US-MTV-1_Pixel4
../data/raw_csvs/2020-05-29-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-05-29-US-MTV-2_Pixel4
../data/raw_csvs/2020-06-04-US-MTV-1_Pixel4XLModded
../data/raw_csvs/2020-06-04-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-06-04-US-MTV-1_Pixel4
../data/raw_csvs/2020-06-05-US-MTV-1_Pixel4XLModded
../data/raw_csvs/2020-06-05-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-06-05-US-MTV-1_Pixel4
../data/raw_csvs/2020-06-05-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-06-05-US-MTV-2_Pixel4
../data/raw_csvs/2020-06-11-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-

In [10]:
# na check
train_df["phone"] = train_df["collectionName"] + "_" + train_df["phoneName"]
na_mask = train_df[pd.isna(train_df[col]).sum(axis=1) != 0]
na_mask.phone.unique()

array(['2020-08-03-US-MTV-1_Mi8', '2020-08-03-US-MTV-1_Pixel4',
       '2020-08-06-US-MTV-2_Mi8', '2020-08-06-US-MTV-2_Pixel4XL',
       '2020-08-06-US-MTV-2_Pixel4'], dtype=object)

In [16]:
from src.dataset.utils import calc_triangle_center, get_groundtruth
from src.postprocess.postporcess import apply_kf_smoothing
from src.postprocess.visualize import add_distance_diff

from typing import Tuple
import os

def load_dataset(is_test: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
    data_dir = Path(
         "../input/google-smartphone-decimeter-challenge"
        )
    fname = "test" if is_test else "train"
    df = pd.read_csv(data_dir / f"baseline_locations_{fname}.csv")

    if not is_test:
        # merge graoundtruth
        df = df.merge(
            get_groundtruth(data_dir),
            on=["collectionName", "phoneName", "millisSinceGpsEpoch"],
        )

    # area_df from
    # https://www.kaggle.com/columbia2131/area-knn-prediction-train-hand-label
    area_df = pd.read_csv(
        Path(f"./src/meta_data/{fname}_area.csv")
    )

    df = apply_kf_smoothing(df=df)
    df = add_distance_diff(df=df, is_test=is_test)

    if is_test:
        area_df = area_df.rename(columns={"area_pred": "area_target"})

    df = pd.merge(df, area_df[["collectionName", "area_target"]], on=["collectionName"])
    return df, area_df

posi_pred_df, area_df = load_dataset(is_test=False)
posi_pred_df["millisSinceGpsEpoch_orig"] = posi_pred_df["millisSinceGpsEpoch"]
posi_pred_df = posi_pred_df[[
                "collectionName",
                "phoneName",
                "millisSinceGpsEpoch_orig",
                "latDeg", "lngDeg", "area_target"
            ]] 
train_df = pd.merge(
    train_df, posi_pred_df, on=[
                "collectionName",
                "phoneName",
                "millisSinceGpsEpoch_orig",
            ]
)
col = col + ["latDeg", "lngDeg"]

len(train_df)

100%|██████████| 73/73 [00:17<00:00,  4.07it/s]


131342

In [17]:
# remove nan data
print(len(train_df))
train_df = train_df.loc[~train_df.phone.isin(na_mask.phone.unique())]
print(len(train_df))


131342
122075


In [None]:
train_df, col, local_col = calc_avg_vel(df=train_df, add_future=True, global_targets=col)

In [20]:
import copy
train_df_orig = copy.deepcopy(train_df)

In [21]:
# prediction with light gbm

SEED = 4
def predict_fold(train_df, local_col, targets=["latDeg", "lngDeg"], surpress_deg=False):
    models = {target:[] for target in targets}
    pred_df = copy.deepcopy(train_df)
    pred_df["preds"] = 0.
    if surpress_deg:
        local_col = [col for col in local_col if col.find("Deg") == -1]
    for target in ["latDeg", "lngDeg"]:
        train_df[ target + "gt_diff"] = train_df[  target +"_gt"] - train_df[target] 
        for val_fold in [0, 1, 2]:
            train_cname = df_path.loc[df_path.fold != val_fold, :].collectionName.values.tolist()
            test_cname = df_path.loc[df_path.fold == val_fold, :].collectionName.values.tolist()
            print(len(train_df.loc[train_df.collectionName.isin(train_cname)]), len(train_df.loc[train_df.collectionName.isin(test_cname)]))
            model, pred_df.loc[train_df.collectionName.isin(test_cname),target+"_"+'preds'] = lgbm(
                train_df.loc[train_df.collectionName.isin(train_cname)],
                train_df.loc[train_df.collectionName.isin(test_cname)],
                local_col, 
                lgb_params, 
                target=target + "gt_diff",
                val=train_df.loc[train_df.collectionName.isin(test_cname)],
            )
            models[target].append(model)
    return models, pred_df

models_tree, pred_df_tree = predict_fold(train_df=train_df_orig.loc[train_df_orig.area_target == 1], local_col=local_col, surpress_deg=True)
models_down, pred_df_down = predict_fold(train_df=train_df_orig.loc[train_df_orig.area_target == 2], local_col=local_col)
pred_df = pd.concat([pred_df_tree, pred_df_down], axis=0)
for target in ["latDeg", "lngDeg"]:
    pred_df[target] = pred_df[ target+"_"+'preds']  + pred_df[target]
    
pred_df.to_csv("../data/light_gbm_noise_surpress_deg_val.csv", index=False)

13832 9011
mae score :  1.7477490101849436e-05
phone mae score :  1.7699385180178932e-05
15916 6927
mae score :  3.252598894953974e-05
phone mae score :  3.0794945602548165e-05
15938 6905
mae score :  1.196446656948378e-05
phone mae score :  1.1961263027422203e-05
13832 9011
mae score :  1.879570528379853e-05
phone mae score :  1.8913553170064998e-05
15916 6927
mae score :  2.945558585184073e-05
phone mae score :  2.7637106403749977e-05
15938 6905
mae score :  2.3488964905433836e-05
phone mae score :  2.349821609460388e-05
8797 5716
mae score :  5.793286488581495e-05
phone mae score :  5.787150528477016e-05
10416 4097
mae score :  4.515413716734235e-05
phone mae score :  4.511962031655828e-05
9813 4700
mae score :  5.5553017759709585e-05
phone mae score :  5.5598862488139805e-05
8797 5716
mae score :  7.088294506229839e-05
phone mae score :  7.088350665889765e-05
10416 4097
mae score :  5.5998516515878315e-05
phone mae score :  5.595847396636757e-05
9813 4700
mae score :  5.70055676183

In [22]:
from post_check import print_metric
met_df = print_metric(pred_df)
met_df["collectionName"] = met_df["phone"].apply(lambda x: x.split("_")[0])
met_df = met_df.merge(area_df[["collectionName", "area_target"]],on=["collectionName"])
met_df.groupby("area_target").mean()


Val evaluation details:
 8.397334935084832


Unnamed: 0_level_0,dist_50,dist_95,avg_dist_50_95
area_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.701101,7.490842,5.095971
2,5.546006,26.655027,16.100516


In [23]:
met_df = print_metric(train_df)
met_df["collectionName"] = met_df["phone"].apply(lambda x: x.split("_")[0])
met_df = met_df.merge(area_df[["collectionName", "area_target"]],on=["collectionName"])
met_df.groupby("area_target").mean()

Val evaluation details:
 4.769343850950051


Unnamed: 0_level_0,dist_50,dist_95,avg_dist_50_95
area_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.651009,4.170212,2.91061
1,2.873503,7.817413,5.345458
2,5.650465,30.939429,18.294947


In [24]:
# 8.50
# # 1 	2.783603 	7.722676 	5.253140
# # 2 	5.546006 	26.655027 	16.100516

# suppress deg on 1 
# Val evaluation details:
#  8.397334935133

# 	dist_50 	dist_95 	avg_dist_50_95
# area_target 			
# 1 	2.701101 	7.490842 	5.095971
# 2 	5.546006 	26.655027 	16.100516

In [25]:
all_imu_path_test = "../data/imu_data_test.csv"
test_df = make_train(INPUT, all_cname, tag_v, is_test=True)
try:
    test_df = pd.read_csv(all_imu_path_test)
except Exception as e:
    test_df = make_train(INPUT, all_cname, tag_v, is_test=True)
    test_df.to_csv(all_imu_path_test, index=False)
    
test_df["phone"] = test_df["collectionName"] +"_" + test_df["phoneName"]

../data/raw_csvs/2020-05-15-US-MTV-1_Pixel4
../data/raw_csvs/2020-05-15-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-05-28-US-MTV-1_Pixel4
../data/raw_csvs/2020-05-28-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-05-28-US-MTV-2_Pixel4
../data/raw_csvs/2020-05-28-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-05-28-US-MTV-2_Pixel4XLModded
../data/raw_csvs/2020-06-04-US-MTV-2_Pixel4
../data/raw_csvs/2020-06-04-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-06-04-US-MTV-2_Pixel4XLModded
../data/raw_csvs/2020-06-10-US-MTV-1_Pixel4
../data/raw_csvs/2020-06-10-US-MTV-1_Pixel4XL
../data/raw_csvs/2020-06-10-US-MTV-1_Pixel4XLModded
../data/raw_csvs/2020-06-10-US-MTV-2_Pixel4
../data/raw_csvs/2020-06-10-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-06-10-US-MTV-2_Pixel4XLModded
../data/raw_csvs/2020-08-03-US-MTV-2_Mi8
../data/raw_csvs/2020-08-03-US-MTV-2_Pixel4
../data/raw_csvs/2020-08-03-US-MTV-2_Pixel4XL
../data/raw_csvs/2020-08-13-US-MTV-1_Mi8
../data/raw_csvs/2020-08-13-US-MTV-1_Pixel4
../data/raw_csvs/2021-03-16-US-MTV-2

In [26]:
na_mask = test_df[pd.isna(test_df[col]).sum(axis=1) != 0]
na_mask.phone.unique()


array(['2020-05-15-US-MTV-1_Pixel4', '2020-05-15-US-MTV-1_Pixel4XL',
       '2020-05-28-US-MTV-1_Pixel4', '2020-05-28-US-MTV-1_Pixel4XL',
       '2020-05-28-US-MTV-2_Pixel4', '2020-05-28-US-MTV-2_Pixel4XL',
       '2020-05-28-US-MTV-2_Pixel4XLModded', '2020-06-04-US-MTV-2_Pixel4',
       '2020-06-04-US-MTV-2_Pixel4XL',
       '2020-06-04-US-MTV-2_Pixel4XLModded', '2020-06-10-US-MTV-1_Pixel4',
       '2020-06-10-US-MTV-1_Pixel4XL',
       '2020-06-10-US-MTV-1_Pixel4XLModded', '2020-06-10-US-MTV-2_Pixel4',
       '2020-06-10-US-MTV-2_Pixel4XL',
       '2020-06-10-US-MTV-2_Pixel4XLModded', '2020-08-03-US-MTV-2_Mi8',
       '2020-08-03-US-MTV-2_Pixel4', '2020-08-03-US-MTV-2_Pixel4XL',
       '2020-08-13-US-MTV-1_Mi8', '2020-08-13-US-MTV-1_Pixel4',
       '2021-03-16-US-MTV-2_Pixel4Modded',
       '2021-03-16-US-MTV-2_SamsungS20Ultra',
       '2021-03-16-US-RWC-2_Pixel4XL', '2021-03-16-US-RWC-2_Pixel5',
       '2021-03-16-US-RWC-2_SamsungS20Ultra', '2021-03-25-US-PAO-1_Mi8',
       '2021-03

In [27]:
del test_df["latDeg"], test_df["lngDeg"] 


In [28]:
# na check
col = ["x_f_acce",  "z_f_acce", "x_f_gyro", "y_f_gyro"]
test_df["phone"] = test_df["collectionName"] + "_" + test_df["phoneName"]
na_mask = test_df[pd.isna(test_df[col]).sum(axis=1) != 0]

posi_pred_df, area_df = load_dataset(is_test=True)

posi_pred_df["millisSinceGpsEpoch_orig"] = posi_pred_df["millisSinceGpsEpoch"]
posi_pred_df = posi_pred_df[[
                "collectionName",
                "phoneName",
                "millisSinceGpsEpoch_orig",
                "latDeg", "lngDeg", "area_target"
            ]] 
test_df = pd.merge(
    test_df, posi_pred_df, on=[
                "collectionName",
                "phoneName",
                "millisSinceGpsEpoch_orig",
            ]
)
col = col + ["latDeg", "lngDeg"]

100%|██████████| 48/48 [00:12<00:00,  3.90it/s]


In [29]:
# remove nan data
print(len(test_df))
test_df = test_df.loc[~test_df.phone.isin(na_mask.phone.unique())]
print(len(test_df))
test_df, col, local_col = calc_avg_vel(df=test_df, add_future=True, global_targets=col)

91486
79988


In [30]:
dfs = []
def make_submission(test_df, local_col, surpress_deg=True):
    deg_col = [col for col in local_col if col.find("Deg") > -1]
    non_deg_col = [col for col in local_col if col.find("Deg") == -1]
    
    for area, df_ in test_df.groupby("area_target"):
        local_col = deg_col + non_deg_col
        if area == 1:
            models = models_tree
            if surpress_deg:
                local_col = non_deg_col
        elif area == 2:
            models = models_down
        else:
            dfs.append(df_)
            continue
        for target, model in models.items():
            preds = np.zeros_like(df_[local_col].to_numpy()[:, 0])
            for fold_model in model:
                preds += fold_model.predict(df_[local_col])
            preds *= 1/3
            df_.loc[:, target] = preds + df_.loc[:, target]
        dfs.append(df_)
    pred_df = pd.concat(dfs)
    pred_df["millisSinceGpsEpoch"] = pred_df["millisSinceGpsEpoch_orig"] 
    save_path = "../data/light_gbm_noise_surpress_deg.csv"
    print(f"save prediction file on {save_path}")
    pred_df.to_csv(save_path, index=False)
    return pred_df
        
pred_df = make_submission(test_df=test_df, local_col=local_col)

In [31]:
pred_df["area_target"].value_counts()
test_df["pred_diff"] = test_df[target] - pred_df[target]

In [33]:
test_df.groupby("area_target").describe()

Unnamed: 0_level_0,index,index,index,index,index,index,index,index,millisSinceGpsEpoch,millisSinceGpsEpoch,...,lngDeg_next_45,lngDeg_next_45,pred_diff,pred_diff,pred_diff,pred_diff,pred_diff,pred_diff,pred_diff,pred_diff
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
area_target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,37792.0,20024.573799,12904.289041,0.0,9447.75,18895.5,28343.25,47562.0,37792.0,1277835000.0,...,-122.130952,0.0,37792.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,35832.0,66168.22812,11286.731968,47563.0,56520.75,65478.5,76760.25,87445.0,35832.0,1301803000.0,...,-122.076464,0.0,35832.0,-1e-05,7e-06,-6.4e-05,-1.4e-05,-1.1e-05,-7e-06,0.000205
2,6364.0,84266.444689,6929.943853,74067.0,75657.75,88303.5,89894.25,91485.0,6364.0,1303544000.0,...,-121.885452,0.0,6364.0,-1e-05,2.4e-05,-7.4e-05,-2.1e-05,-1.1e-05,-4e-06,0.000133
