In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
from pathlib import Path
import datetime
from datetime import datetime

from pandarallel import pandarallel
pandarallel.initialize()
from vincenty import vincenty
from scipy import interpolate

import lightgbm as lgbm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

import plotly.express as px

#BASE_DIR = '/Users/vadimzubkov/Desktop/smartphone-decimeter-2022/train/2021-03-16-US-MTV-1'
#BASE_DIR = Path('/Users/vadimzubkov/Desktop/smartphone-decimeter-2022/train/2021-03-16-US-MTV-1')

# Root folder of the competition data; the CSV files read in this notebook are
# addressed as f'{path}/<file>.csv'.
# NOTE(review): this is a machine-specific absolute path -- change it before
# running the notebook anywhere else.
path = '/Users/vadimzubkov/Desktop/smartphone-decimeter-2022'

In [None]:
def vincenty_meter(r, lat='LatitudeDegrees', lng='LongitudeDegrees', tlat='t_latDeg', tlng='t_lngDeg'):
    """Distance between two coordinate pairs stored in one row.

    Parameters
    ----------
    r : mapping-like row (e.g. a DataFrame row) indexable by column name.
    lat, lng : keys of the first point's latitude / longitude.
    tlat, tlng : keys of the second point's latitude / longitude.

    Returns
    -------
    The value returned by ``vincenty`` for the two points, multiplied by 1000
    (presumably kilometres -> metres, as the function name suggests -- confirm
    against the ``vincenty`` package's default unit).
    """
    first_point = (r[lat], r[lng])
    second_point = (r[tlat], r[tlng])
    distance = vincenty(first_point, second_point)
    return distance * 1000


def check_meter(input_df: pd.DataFrame, save=False):
    """Add a per-row distance column and print two summary scores.

    A copy of ``input_df`` receives a ``'meter'`` column computed by applying
    ``vincenty_meter`` to every row with pandarallel's ``parallel_apply``.
    Two values are printed:

    * ``meter`` -- the mean of the ``'meter'`` column;
    * ``CV``    -- the plain average of the 50th and 95th percentile of
      ``'meter'`` taken per ``tripId`` (two numbers per trip, all averaged).

    Parameters
    ----------
    input_df : pd.DataFrame
        Needs a ``'tripId'`` column plus the columns ``vincenty_meter`` reads
        with its default arguments.
    save : bool
        When equal to ``True`` the scored frame is also written to
        ``'train_output.csv'`` (without the index).

    Returns
    -------
    pd.DataFrame
        The copy of ``input_df`` with the added ``'meter'`` column.
    """
    scored = input_df.copy()
    scored['meter'] = input_df.parallel_apply(vincenty_meter, axis=1)

    if save == True:
        scored.to_csv('train_output.csv', index=False)

    mean_error = scored['meter'].mean()
    print(f'meter: {mean_error}')

    # Collect [p50, p95] for every trip, in trip order, into one flat list.
    trip_percentiles = []
    for trip_id in scored['tripId'].unique():
        trip_errors = scored.loc[scored['tripId'] == trip_id, 'meter']
        trip_percentiles.extend(np.percentile(trip_errors, [50, 95]))

    cv_score = sum(trip_percentiles) / len(trip_percentiles)
    print(f'CV: {cv_score}')

    return scored

In [None]:
# Read data
# Three CSV inputs, all located under `path`.
train_kf = pd.read_csv(f'{path}/baseline_train.csv')  # wls baseline
test_base = pd.read_csv(f'{path}/submission_kf.csv')  # test-side input -- TODO confirm this is the intended file
train_gt = pd.read_csv(f'{path}/train_gt.csv')

# Disabled alternatives, kept for reference:
#train_gt = pd.concat([pd.read_csv(path) for path in tqdm(BASE_DIR.glob('*/ground_truth.csv'),total=4)])
#train_kf=train_kf.dropna(subset=['LatitudeDegrees','LongitudeDegrees'])
#train_kf['LatitudeDegrees'] = train_kf['LatitudeDegrees'].interpolate('spline', order=3)
#train_kf['LongitudeDegrees'] = train_kf['LongitudeDegrees'].interpolate('spline', order=3)

# tripId has the form "<drive>/<phone>"; expose both halves as columns.
train_kf['drive'] = train_kf['tripId'].map(lambda trip_id: trip_id.split("/")[0])
train_kf['phone'] = train_kf['tripId'].map(lambda trip_id: trip_id.split("/")[1])
test_base['drive'] = test_base['tripId'].map(lambda trip_id: trip_id.split("/")[0])
test_base['phone'] = test_base['tripId'].map(lambda trip_id: trip_id.split("/")[1])

# Give the ground-truth columns a t_ prefix so they do not clash with the
# baseline columns of the same name after the merge.
train_gt = train_gt.rename(
    columns={
        'LatitudeDegrees': 't_latDeg',
        'LongitudeDegrees': 't_lngDeg',
        'tripId': 't_tripId',
        #'UnixTimeMillis': 'utcTimeMillis'
    }
)

# Default (inner) join: only rows present in both frames for the same
# drive / phone / timestamp are kept.
# NOTE(review): this relies on train_gt.csv already containing 'drive' and
# 'phone' columns -- they are not created for train_gt in this cell.
train_base = pd.merge(train_kf, train_gt, on=['drive', 'phone', 'UnixTimeMillis'])

In [None]:
# Score the merged frame: check_meter prints the mean distance and the CV
# score, and returns a copy with an added 'meter' column, which replaces
# train_base here.
train_base = check_meter(train_base)

In [None]:
# Generalized functions of LightGBM
def fit_lgbm(X, y, train_df, params: dict=None, verbose=100, seed: int=42, N_SPLITS: int=3):
    """Train one ``LGBMRegressor`` per GroupKFold fold and collect out-of-fold predictions.

    Folds are grouped by ``train_df['drive']`` so rows from the same drive
    never end up in both the training and the validation part of a fold.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix; rows are addressed positionally (``.iloc``).
    y : pd.Series
        Regression target aligned row-for-row with ``X``.
    train_df : pd.DataFrame
        Frame aligned row-for-row with ``X`` that provides the ``'drive'``
        grouping column.
    params : dict, optional
        Keyword arguments for ``lgbm.LGBMRegressor``. ``None`` means defaults.
    verbose : int
        Despite its name, this is the early-stopping patience (number of
        rounds without improvement on the validation fold). Training output
        itself is kept silent.
    seed : int
        Used as ``random_state`` unless ``params`` already sets a seed
        (``seed`` / ``random_seed`` / ``random_state``).
    N_SPLITS : int
        Number of GroupKFold folds.

    Returns
    -------
    (np.ndarray, list)
        Out-of-fold predictions (same length and order as ``y``) and the list
        of fitted models, one per fold.
    """
    model_params = dict(params or {})
    # Explicit params win; `seed` only fills in a missing random seed.
    if not {'seed', 'random_seed', 'random_state'} & model_params.keys():
        model_params['random_state'] = seed

    models = []
    oof_pred = np.zeros(len(y), dtype=np.float64)
    groups = train_df['drive'].to_numpy()

    kf = GroupKFold(n_splits=N_SPLITS)
    for idx_train, idx_valid in kf.split(X, y, groups):
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        model = lgbm.LGBMRegressor(**model_params)
        # Early stopping goes through the callback API: the
        # `early_stopping_rounds=` / `verbose=` arguments of fit() were
        # removed in LightGBM 4, while the callback works in 3.x and 4.x.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric='mae',
            callbacks=[lgbm.early_stopping(stopping_rounds=verbose, verbose=False)],
        )

        # A regressor exposes predict(), not predict_proba().
        # Write by fold position, not by X's index labels, so the result is
        # correct even when X does not carry a 0..n-1 index.
        oof_pred[idx_valid] = model.predict(x_valid)
        models.append(model)

    return oof_pred, models

def predict_lgbm(models, feat_df):
    """Average the predictions of several fitted models.

    Parameters
    ----------
    models : sequence
        Fitted models. A model that offers ``predict_proba`` contributes the
        second column of its output (index 1); any other model (e.g. the
        regressors returned by ``fit_lgbm``) contributes ``predict``.
    feat_df : pd.DataFrame
        Feature rows; passed to the models as ``feat_df.values``.

    Returns
    -------
    np.ndarray
        Element-wise mean over all models, one value per row of ``feat_df``.

    Raises
    ------
    ValueError
        If ``models`` is empty (there would be nothing to average).
    """
    if len(models) == 0:
        raise ValueError('predict_lgbm needs at least one model')

    features = feat_df.values
    per_model = []
    for model in models:
        if hasattr(model, 'predict_proba'):
            per_model.append(model.predict_proba(features)[:, 1])
        else:
            # Regressors have no predict_proba; use their plain prediction.
            per_model.append(model.predict(features))

    return np.mean(np.array(per_model), axis=0)