In [6]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt 
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor

from sklearn.svm import SVR  
from sklearn.linear_model import *
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold,GridSearchCV,RandomizedSearchCV,GroupKFold
from sklearn.preprocessing import OrdinalEncoder,Normalizer,MinMaxScaler,StandardScaler

In [7]:
def smape(y_true, y_pred):
    """Return the SMAPE (in percent) of ``y_pred`` against ``y_true`` after shifting both by +1.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        Observed and predicted values. Neither input is modified.

    Returns
    -------
    float
        ``100 * mean(|t - p| / ((|t| + |p|) / 2))`` computed on the shifted
        values ``t = y_true + 1`` and ``p = y_pred + 1``. Elements whose
        shifted denominator is zero contribute 0 to the mean.
    """
    # Build shifted copies instead of using `+=`, which would alter the
    # caller's arrays and compound the shift on every call.
    shifted_true = np.asarray(y_true, dtype=float) + 1
    shifted_pred = np.asarray(y_pred, dtype=float) + 1

    numerator = np.abs(shifted_true - shifted_pred)
    denominator = (np.abs(shifted_true) + np.abs(shifted_pred)) / 2

    # A zero denominator means both shifted values are zero; leave that term at 0.
    ratios = np.zeros(len(shifted_true))
    nonzero = denominator != 0
    ratios[nonzero] = numerator[nonzero] / denominator[nonzero]
    return 100 * np.mean(ratios)

In [8]:
# Read the main and supplemental clinical tables, stack them row-wise with a
# fresh index, and drop the medication-state column.
train_clinical = pd.read_csv("../data/raw/train_clinical_data.csv")
sup = pd.read_csv('../data/raw/supplemental_clinical_data.csv')
train = (
    pd.concat([train_clinical, sup], ignore_index=True)
    .drop(columns=['upd23b_clinical_state_on_medication'])
)
train

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,55,0,10.0,6.0,15.0,
1,55_3,55,3,10.0,7.0,25.0,
2,55_6,55,6,8.0,10.0,34.0,
3,55_9,55,9,8.0,9.0,30.0,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0
...,...,...,...,...,...,...,...
4833,65382_0,65382,0,,,0.0,
4834,65405_0,65405,0,5.0,16.0,31.0,0.0
4835,65405_5,65405,5,,,57.0,
4836,65530_0,65530,0,10.0,6.0,24.0,0.0


In [9]:
import warnings
# Kept from the original cell: silences every library warning for the rest of the session.
warnings.filterwarnings("ignore")

# Fitted search object of the winning model, keyed by target column.
models = {}

target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
fold = 17
# Seed for the only stochastic estimator in the pool, so reruns produce the
# same scores and pick the same winner.
SEED = 42


model_pool = [Lars(),RandomForestRegressor(random_state=SEED),SVR(),Lasso()]
# model_pool = [Lars(),RandomForestRegressor(),SVR(),Lasso(),LGBMRegressor(),XGBRegressor()]
for u in target:
    # Keep only the rows where this target was recorded.
    temp = train.dropna(subset=[u])

    # For updrs_3, dropping 0's improve results
    if u == 'updrs_3':
        temp = temp[temp[u] != 0]

    # Single feature: the visit month, as an (n_samples, 1) column.
    X = temp['visit_month'].values.reshape(-1,1)
    y = temp[u]

    # Group by patient so that one patient's visits never span train and validation folds.
    # GroupKFold accepts the raw ids directly; no encoding step is needed.
    groups = temp['patient_id'].to_numpy()
    cv = GroupKFold(n_splits=fold)

    model_candidates = []
    scores = []
    for ind, model in enumerate(model_pool):
        # The "search" space is the estimator's own default parameters, i.e. a
        # single candidate, so n_iter=1 matches the size of the space. The
        # search is used only to obtain a grouped-CV score and a refit model.
        search = RandomizedSearchCV(
            model,
            {name: [value] for name, value in model.get_params().items()},
            n_iter=1,
            cv=cv,
            # smape is an error (lower is better). It is reported with its
            # natural positive sign and minimised explicitly below.
            scoring=make_scorer(smape),
            verbose=1,
        ).fit(X, y, groups=groups)
        model_candidates.append(search)
        scores.append((ind, search.best_score_))
        print(u,type(model).__name__,search.best_score_)

    # Lowest cross-validated SMAPE wins; ties go to the earliest model in the pool.
    best_ind = min(scores, key=lambda pair: pair[1])[0]
    winning_model = model_candidates[best_ind]
    print(f"Pick best performing model for {u}:{type(winning_model.estimator).__name__}",'\n','-'*50)
    models[u] = winning_model

Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_1 Lars 55.713366705133936
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_1 RandomForestRegressor 55.68099875422541
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_1 SVR 55.118112899303775
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_1 Lasso 55.712334923713044
Pick best performing model for updrs_1:SVR 
 --------------------------------------------------
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_2 Lars 64.91536809196735
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_2 RandomForestRegressor 64.83356873294646
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_2 SVR 64.86975090650007
Fitting 17 folds for each of 1 candidates, totalling 17 fits
updrs_2 Lasso 64.940633949303
Pick best performing model for updrs_2:RandomForestRegressor 
 --------------------------------------------------
Fitting 17 folds fo

In [10]:
# Raw clinical tables, loaded separately because the targets below use different subsets.
df_train_clinical_data = pd.read_csv("../data/raw/train_clinical_data.csv")
df_supplemental_clinical_data = pd.read_csv('../data/raw/supplemental_clinical_data.csv')

# updrs_1 medians come from the clinical table only; the other targets use
# clinical + supplemental rows.
target_columns_clinical_data = ['updrs_1',]
target_columns_clinical_and_supplemental_data = ['updrs_2','updrs_3', 'updrs_4']

# Per-visit_month median of each target.
# (Mean, and the mean of the per-month modes, were tried as alternative aggregates.)
target_visit_month_medians_clinical_data = (
    df_train_clinical_data
    .groupby('visit_month')[target_columns_clinical_data]
    .median()
)

# Month 5 appears only through the supplemental data, so its row is dropped.
target_visit_month_medians_clinical_and_supplemental_data = (
    pd.concat((df_train_clinical_data, df_supplemental_clinical_data), axis=0)
    .groupby('visit_month')[target_columns_clinical_and_supplemental_data]
    .median()
    .drop(5)
)

# Join the two median tables side by side on visit_month, then replace every
# value with the running (expanding-window) maximum down the month index.
target_visit_month_medians = (
    pd.concat(
        (
            target_visit_month_medians_clinical_data,
            target_visit_month_medians_clinical_and_supplemental_data,
        ),
        axis=1,
        ignore_index=False,
    )
    .expanding(min_periods=1)
    .max()
)
target_visit_month_medians

Unnamed: 0_level_0,updrs_1,updrs_2,updrs_3,updrs_4
visit_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4.5,4.0,18.0,0.0
3,4.5,5.0,19.0,0.0
6,6.0,6.0,21.0,0.0
9,6.0,6.0,21.0,0.0
12,6.0,6.0,21.0,0.0
18,6.0,6.0,21.0,0.0
24,6.0,6.0,21.0,0.0
30,7.0,6.0,22.0,0.0
36,7.0,6.0,22.0,0.0
42,7.0,7.0,23.0,0.0


In [11]:
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        Observed and predicted values.

    Returns
    -------
    float
        ``100 / n * sum(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
        Terms where both values are zero (a 0/0 ratio) count as 0 instead of
        turning the whole score into NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0 they were initialised with.
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    return 100 / len(y_true) * np.sum(ratios)


def score(df, target_columns, prediction_columns):
    
    y_true = []
    y_pred = []
    
    for target_column, prediction_column in zip(target_columns, prediction_columns):
        target_idx = df[target_column].notna()
        y_true.append(df.loc[target_idx, target_column].values + 1)
        y_pred.append(df.loc[target_idx, prediction_column].values + 1)
        
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
        
    score = symmetric_mean_absolute_percentage_error(
        y_true=y_true,
        y_pred=y_pred
    )
    
    return score

# fold_columns is defined here but not used anywhere in this cell.
fold_columns = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5']
target_columns = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

# Score the visit_month lookup table on the clinical rows, one target at a time.
for target_column in target_columns:

    prediction_column = f'{target_column}_predictions'

    # Rows where this target was recorded.
    target_idx = df_train_clinical_data[target_column].notna()
    df_train = df_train_clinical_data.loc[target_idx]
    print(f'Target: {target_column} Dataset Shape: {df_train.shape}')

    # Prediction = table value for the row's visit_month, written back onto the clinical frame.
    month_to_prediction = target_visit_month_medians[target_column]
    df_train_clinical_data.loc[target_idx, prediction_column] = (
        df_train_clinical_data.loc[target_idx, 'visit_month'].map(month_to_prediction)
    )

    val_score = score(
        df=df_train_clinical_data.loc[target_idx],
        target_columns=[target_column],
        prediction_columns=[prediction_column],
    )
    print(f'Validation SMAPE: {val_score:.4f}\n')

# Pooled score across all four targets, using the prediction columns written above.
all_prediction_columns = [f'{name}_predictions' for name in target_columns]
global_oof_score = score(
    df=df_train_clinical_data,
    target_columns=target_columns,
    prediction_columns=all_prediction_columns,
)
print(f'Global OOF SMAPE: {global_oof_score:.4f}')

Target: updrs_1 Dataset Shape: (2614, 8)
Validation SMAPE: 55.7414

Target: updrs_2 Dataset Shape: (2613, 9)
Validation SMAPE: 70.7972

Target: updrs_3 Dataset Shape: (2590, 10)
Validation SMAPE: 69.1957

Target: updrs_4 Dataset Shape: (1577, 11)
Validation SMAPE: 48.0103

Global OOF SMAPE: 62.3409
