In [None]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt 
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.svm import SVR  
from sklearn.linear_model import *
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold,GridSearchCV,RandomizedSearchCV,GroupKFold
from sklearn.preprocessing import OrdinalEncoder,Normalizer,MinMaxScaler,StandardScaler

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent, on +1-shifted values.

    Both inputs are shifted by +1 before the error is computed. Pairs whose
    shifted values are both zero contribute an error of 0 instead of 0/0.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        Observed and predicted values. They are never modified.

    Returns
    -------
    float
        100 * mean(|t - p| / ((|t| + |p|) / 2)) over the shifted values.
    """
    # Build shifted float copies. The earlier in-place `+= 1` altered the
    # caller's arrays on every call and failed outright on plain lists.
    y_true = np.asarray(y_true, dtype=float) + 1
    y_pred = np.asarray(y_pred, dtype=float) + 1

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # The denominator is zero only when both shifted values are zero.
    positive_index = (y_true != 0) | (y_pred != 0)
    ratios = np.zeros(len(y_true))
    ratios[positive_index] = numerator[positive_index] / denominator[positive_index]
    return 100 * np.mean(ratios)

In [None]:
# train_proteins=pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
# train_peptides=pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
# train_clinical=pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')

# def features(df, train_proteins, train_peptides):
#     proteins_npx_ft = train_proteins\
#         .groupby('visit_id')\
#         .agg(NPX_min=('NPX','min'), NPX_max=('NPX','max'), NPX_mean=('NPX','mean'), NPX_std=('NPX','std'))\
#         .reset_index()
#     peptides_PeptideAbundance_ft = train_peptides\
#         .groupby('visit_id')\
#         .agg(Abe_min=('PeptideAbundance','min'), Abe_max=('PeptideAbundance','max'),Abe_mean=('PeptideAbundance','mean'), Abe_std=('PeptideAbundance','std'))\
#         .reset_index()
#     df = pd.merge(df, proteins_npx_ft, on = 'visit_id', how = 'left')
#     df = pd.merge(df, peptides_PeptideAbundance_ft, on = 'visit_id', how = 'left')
#     return df

# train_df = features(train_clinical, train_proteins, train_peptides)
# sds = StandardScaler()
# scale_col = ['visit_month','NPX_min','NPX_max','NPX_mean','NPX_std', 'Abe_min', 'Abe_max', 'Abe_mean', 'Abe_std']
# train_df[scale_col] = sds.fit_transform(train_df[scale_col])

# train_df

In [None]:
# import warnings
# warnings.filterwarnings("ignore")
# models = {}

# target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
# fold = 17
# # num_folds = {u:[] for u in target}
# # for fold in range(2,27,5):
# #     print('<'*50,f'n_fold:{fold}','>'*50)

# model_pool = [Lars(),RandomForestRegressor(),SVR(),Lasso(),LGBMRegressor(),XGBRegressor()]
# for u in target:
#     # Drop NAs
#     temp = train_df.dropna(subset=[u]+scale_col) 
    
#     # For updrs_3, dropping 0's improve results
#     if u == 'updrs_3':
#         temp = temp[temp[u] != 0]

#     # Train data
#     X = temp[scale_col].values
#     y = temp[u] 

#     enc = OrdinalEncoder()
#     groups = enc.fit_transform(pd.DataFrame(temp.patient_id)).reshape(1,-1)[0].tolist()
#     cv = GroupKFold(n_splits=fold)

#     model_candidates = []
#     scores = []
#     for ind, model in enumerate(model_pool):
#         model_candidates.append(
#             RandomizedSearchCV(model,
#                                {item[0]:[item[1]] for item in model.get_params().items()},
#                                cv = cv.split(X, y, groups),
#                                scoring=make_scorer(smape),
#                                verbose = -1
#                               ).fit(X,y)
#         )
#         scores.append((ind,model_candidates[-1].best_score_))
#         print(u,str(model_candidates[ind].estimator).split('(')[0],model_candidates[-1].best_score_)

#     winning_model = model_candidates[sorted(scores,key = lambda x:x[1])[0][0]]
#     num_folds[u].append(winning_model.best_score_)
#     print(f"Pick best performing model for {u}:{str(winning_model.estimator).split('(')[0]}",'\n','-'*50)
#     models[u] = winning_model
    
# # for lst in num_folds.values():
# #     plt.plot(lst)
# # plt.show()

In [None]:
# Load the clinical and supplemental clinical tables and stack them into one
# training frame, without the medication-state column.
train = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
sup = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
train = pd.concat([train, sup], ignore_index=True).drop(columns=['upd23b_clinical_state_on_medication'])
train

In [None]:
import warnings
warnings.filterwarnings("ignore")

# For every UPDRS target, cross-validate each candidate regressor on
# visit_month alone (folds grouped by patient) and keep the one with the
# lowest mean SMAPE. The fitted search objects are stored in `models`.
models = {}

target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
fold = 17
SEED = 42  # fixed seed so the stochastic models give the same result on every run

model_pool = [
    Lars(),
    RandomForestRegressor(random_state=SEED),
    SVR(),
    Lasso(),
    LGBMRegressor(random_state=SEED),
    XGBRegressor(random_state=SEED),
]
for u in target:
    # Drop rows where this target is missing
    temp = train.dropna(subset=[u])

    # For updrs_3, dropping zeros improves results
    if u == 'updrs_3':
        temp = temp[temp[u] != 0]

    # Train data: the only feature is the visit month
    X = temp['visit_month'].values.reshape(-1, 1)
    y = temp[u]

    # GroupKFold takes the raw patient ids as group labels, so no encoding is needed.
    groups = temp['patient_id'].values
    cv = GroupKFold(n_splits=fold)

    model_candidates = []
    scores = []
    for model in model_pool:
        model_name = type(model).__name__
        # Each "search" has exactly one candidate (the model's own parameters),
        # so it only serves to cross-validate and refit the model. Because there
        # is a single candidate, the sign convention of the scorer does not
        # affect the search; best_score_ is the plain mean SMAPE.
        search = RandomizedSearchCV(
            model,
            {name: [value] for name, value in model.get_params().items()},
            n_iter=1,
            cv=cv.split(X, y, groups),
            scoring=make_scorer(smape),
            verbose=0,  # scikit-learn documents verbose as non-negative; -1 is out of range
        ).fit(X, y)
        model_candidates.append(search)
        scores.append(search.best_score_)
        print(u, model_name, search.best_score_)

    # Lowest SMAPE wins. nanargmin ignores models whose CV score is NaN
    # (failed fits), which would make a plain sort order unreliable.
    winning_model = model_candidates[int(np.nanargmin(scores))]
    print(f"Pick best performing model for {u}:{type(winning_model.estimator).__name__}",'\n','-'*50)
    models[u] = winning_model
    
    
#     num_folds[u].append(winning_model.best_score_)
    
# for lst in num_folds.values():
#     plt.plot(lst)
# plt.show()

In [None]:
# def create_X_y_train_dataset(df, updrs_part, plus_month):
# #     df_ = df.dropna(subset=[f'updrs_{updrs_part}'])
#     df_ = df
#     X_visit_ids = []
#     y_visit_ids = []
#     patient_ids = df['patient_id'].unique()
#     for i, patient_id in enumerate(patient_ids):
#         patient_df = df_[df_['patient_id']==patient_id]
#         plus_months = patient_df['visit_month'] + plus_month
#         plus_months = patient_df.query('visit_month in @plus_months')['visit_month']
#         original_months = plus_months - plus_month
        
#         X_visit_id = [f'{patient_id}_{original_month}' for original_month in original_months]
#         y_visit_id = [f'{patient_id}_{plus_month}' for plus_month in plus_months]
        
#         X_visit_ids.extend(X_visit_id)
#         y_visit_ids.extend(y_visit_id)
    
#     X = df_.query('visit_id in @X_visit_ids')
#     X = X.drop(['patient_id', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'], axis=1)
#     X.reset_index(drop=True, inplace=True)
    
#     y = df_.query('visit_id in @y_visit_ids')
#     y = y[['visit_id', f'updrs_{updrs_part}']]
#     y.reset_index(drop=True, inplace=True)
#     return X, y

# def create_X_y_dict(df):
#     X_dict = {}
#     y_dict = {}
#     for updrs_part in [1, 2, 3, 4]:
#         for plus_month in [0, 6, 12, 24]:
#             X, y = create_X_y_train_dataset(df, updrs_part, plus_month)
#             key = f'updrs_{updrs_part}_plus_month{plus_month}'
            
#             df_ = pd.DataFrame({'patient_id':[visit_id.split('_')[0] for visit_id in X.visit_id]})
#             X = X.join(df_)
            
#             X = X[y[f'updrs_{updrs_part}'].notna()]
#             y = y[y[f'updrs_{updrs_part}'].notna()]
            
#             # For updrs_3, dropping 0's improve results
#             if updrs_part == 3:
#                 X = X[y['updrs_3'] != 0]
#                 y = y[y['updrs_3'] != 0]
            
#             X_dict[key] = X
#             y_dict[key] = y
#     return X_dict, y_dict

# X_dict, y_dict = create_X_y_dict(train)

# for updrs in [1, 2, 3, 4]:
#     for plus_month in [0, 6, 12, 24]:
#         key = f'updrs_{updrs}_plus_month{plus_month}'
#         print(f'{key} => {len(X_dict[key])}')

# Row counts per key after discarding only the rows whose target is NaN
updrs_1_plus_month0 => 4624:4624
updrs_1_plus_month6 => 2664:2664
updrs_1_plus_month12 => 2640:2640
updrs_1_plus_month24 => 1889:1889
updrs_2_plus_month0 => 4622:4622
updrs_2_plus_month6 => 2662:2662
updrs_2_plus_month12 => 2639:2639
updrs_2_plus_month24 => 1888:1888
updrs_3_plus_month0 => 4442:4442
updrs_3_plus_month6 => 2547:2547
updrs_3_plus_month12 => 2384:2384
updrs_3_plus_month24 => 1681:1681
updrs_4_plus_month0 => 2872:2872
updrs_4_plus_month6 => 1827:1827
updrs_4_plus_month12 => 1905:1905
updrs_4_plus_month24 => 1440:1440
------------------------------------
# Row counts per key after discarding rows that contain any NaN
updrs_1_plus_month0 => 4624:4624
updrs_1_plus_month6 => 2663:2663
updrs_1_plus_month12 => 2640:2640
updrs_1_plus_month24 => 1889:1889
updrs_2_plus_month0 => 4622:4622
updrs_2_plus_month6 => 2661:2661
updrs_2_plus_month12 => 2639:2639
updrs_2_plus_month24 => 1888:1888
updrs_3_plus_month0 => 4442:4442
updrs_3_plus_month6 => 2533:2533
updrs_3_plus_month12 => 2373:2373
updrs_3_plus_month24 => 1669:1669
updrs_4_plus_month0 => 2872:2872
updrs_4_plus_month6 => 1482:1482
updrs_4_plus_month12 => 1390:1390
updrs_4_plus_month24 => 880:880

In [None]:
# from sklearn.linear_model import *
# import warnings
# warnings.filterwarnings("ignore")

# models = {}

# linear_params = [{
#     'alpha':[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 1]
#     }]
# target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]

# # num_folds = {u:[] for u in target}
# # for fold in range(2,21):
# best_scores = {}
# for u in target:

#     # Drop NAs
#     temp = train.dropna(subset=[u]) 

#     # For updrs_3, dropping 0's improve results
#     if u == 'updrs_3':
#         temp = temp[temp[u] != 0]

#     # Train data
#     X = temp['visit_month'].values.reshape(-1,1)
#     y = temp[u] 

#     enc = OrdinalEncoder()
#     groups = enc.fit_transform(pd.DataFrame(temp.patient_id)).reshape(1,-1)[0].tolist()
#     cv = GroupKFold(n_splits=10)
    
#     m1 = RandomizedSearchCV(Lasso(),{item[0]:[item[1]] for item in Lasso().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m2 = RandomizedSearchCV(ElasticNet(),{item[0]:[item[1]] for item in ElasticNet().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m3 = RandomizedSearchCV(ElasticNetCV(),{item[0]:[item[1]] for item in ElasticNetCV().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m4 = RandomizedSearchCV(Lars(),{item[0]:[item[1]] for item in Lars().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m5 = RandomizedSearchCV(LarsCV(),{item[0]:[item[1]] for item in LarsCV().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m6 = RandomizedSearchCV(Lasso(),{item[0]:[item[1]] for item in Lasso().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m7 = RandomizedSearchCV(LassoCV(),{item[0]:[item[1]] for item in LassoCV().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m8 = RandomizedSearchCV(LassoLars(),{item[0]:[item[1]] for item in LassoLars().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m9 = RandomizedSearchCV(LassoLarsIC(),{item[0]:[item[1]] for item in LassoLarsIC().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m10 = RandomizedSearchCV(OrthogonalMatchingPursuit(),{item[0]:[item[1]] for item in OrthogonalMatchingPursuit().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m11 = RandomizedSearchCV(LinearRegression(),{item[0]:[item[1]] for item in LinearRegression().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m12 = RandomizedSearchCV(Ridge(),{item[0]:[item[1]] for item in Ridge().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
#     m13 = RandomizedSearchCV(RidgeCV(),{item[0]:[item[1]] for item in RidgeCV().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
# #         m = RandomizedSearchCV(OrthogonalMatchingPursuitCV(),{item[0]:[item[1]] for item in OrthogonalMatchingPursuitCV().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
# #         m = RandomizedSearchCV(SGDRegressor(),{item[0]:[item[1]] for item in SGDRegressor().get_params().items()},cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)
    
#     best_scores[u] = sorted([(f'm{i}',globals()[f'm{i}'].best_score_) for i in range(1,14)],key = lambda x:x[1])
# best_scores

In [None]:
# models = {}
# linear_params = [{
#     'alpha':[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 1]
#     }]

# # num_folds = {f'updrs_{updrs}_plus_month{plus_month}': [] for updrs in [1, 2, 3 ,4] for plus_month in [0, 6, 12, 24]}
# # for fold in range(2,21):
# for updrs in [1, 2, 3 ,4]:
#     for plus_month in [0, 6, 12, 24]:
#         key = f'updrs_{updrs}_plus_month{plus_month}'
#         print(key)
#         X = X_dict[key]['visit_month'].values.reshape(-1,1)
#         y = y_dict[key][f'updrs_{updrs}']

#         enc = OrdinalEncoder()
#         groups = enc.fit_transform(pd.DataFrame(X_dict[key].patient_id)).reshape(1,-1)[0].tolist()
#         cv = GroupKFold(n_splits=10)
#         ls = RandomizedSearchCV(Lasso(),linear_params,cv = cv.split(X, y, groups),scoring=make_scorer(smape),verbose = -1).fit(X,y)

#         num_folds[key].append(ls.best_score_)
#         print('linear best param',ls.best_params_)
#         print('linear best score',ls.best_score_)
#         models['ls_' + key] = ls

In [None]:
# Build a table of per-visit_month target medians, indexed by visit_month,
# with one column per UPDRS target.
df_train_clinical_data = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
df_supplemental_clinical_data = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')

# updrs_1 medians use the clinical data only; the other targets also use the supplemental data.
target_columns_clinical_data = ['updrs_1',]
target_columns_clinical_and_supplemental_data = ['updrs_2','updrs_3', 'updrs_4']

target_visit_month_medians_clinical_data = (
    df_train_clinical_data
    .groupby('visit_month')[target_columns_clinical_data]
    .median()
)

# Alternatives tried earlier in place of the median: the per-month mean and the per-month mode.
# The month-5 row is dropped; per the original note it comes from the supplemental clinical data.
target_visit_month_medians_clinical_and_supplemental_data = (
    pd.concat([df_train_clinical_data, df_supplemental_clinical_data], axis=0)
    .groupby('visit_month')[target_columns_clinical_and_supplemental_data]
    .median()
    .drop(5)
)

# Put the two median tables side by side, aligned on visit_month.
target_visit_month_medians = pd.concat(
    [target_visit_month_medians_clinical_data, target_visit_month_medians_clinical_and_supplemental_data],
    axis=1,
    ignore_index=False,
)

# Replace each value with the running maximum of its column down the rows,
# so no column ever decreases from one row to the next.
target_visit_month_medians = target_visit_month_medians.expanding(min_periods=1).max()
target_visit_month_medians

In [None]:
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes 100 / n * sum(2 * |pred - true| / (|true| + |pred|)).
    A pair where both values are zero contributes 0 rather than 0/0 (NaN),
    which matches the convention used by `smape` in this notebook.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers, same length
        Observed and predicted values; lists, arrays and Series are accepted.

    Returns
    -------
    float
        The SMAPE value in percent.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the other terms stay at 0.
    terms = np.zeros_like(denominator)
    np.divide(numerator, denominator, out=terms, where=denominator != 0)

    result = 100 / len(y_true) * np.sum(terms)

    return result


def score(df, target_columns, prediction_columns):
    """Pooled SMAPE over several target/prediction column pairs.

    For each pair, only rows where the target is not NaN are used. Targets
    and predictions are both shifted by +1, the pairs are concatenated in the
    given order, and one SMAPE is computed over the pooled values.

    Parameters
    ----------
    df : DataFrame holding the target and prediction columns.
    target_columns : list of target column names.
    prediction_columns : list of prediction column names, paired by position
        with `target_columns`.

    Returns
    -------
    The value returned by `symmetric_mean_absolute_percentage_error`.
    """
    true_parts = []
    pred_parts = []

    for target_column, prediction_column in zip(target_columns, prediction_columns):
        observed = df[target_column].notna()
        true_parts.append(df.loc[observed, target_column].values + 1)
        pred_parts.append(df.loc[observed, prediction_column].values + 1)

    pooled_true = np.concatenate(true_parts)
    pooled_pred = np.concatenate(pred_parts)

    return symmetric_mean_absolute_percentage_error(
        y_true=pooled_true,
        y_pred=pooled_pred
    )

# Score the visit-month median table against the clinical training data.
# The prediction for a row is the table value for that row's visit_month.
fold_columns = [f'fold{fold_number}' for fold_number in range(1, 6)]
target_columns = [f'updrs_{updrs_number}' for updrs_number in range(1, 5)]

for target_column in target_columns:

    prediction_column = f'{target_column}_predictions'

    # Only rows with an observed target receive a prediction and are scored.
    target_idx = df_train_clinical_data[target_column].notna()
    df_train = df_train_clinical_data.loc[target_idx]
    print(f'Target: {target_column} Dataset Shape: {df_train.shape}')

    month_lookup = target_visit_month_medians[target_column]
    df_train_clinical_data.loc[target_idx, prediction_column] = df_train['visit_month'].map(month_lookup)

    # Slice again so the slice includes the prediction column written just above.
    val_score = score(
        df=df_train_clinical_data.loc[target_idx],
        target_columns=[target_column],
        prediction_columns=[prediction_column]
    )
    print(f'Validation SMAPE: {val_score:.4f}\n')

# One pooled score across all four targets.
all_prediction_columns = [f'{target_column}_predictions' for target_column in target_columns]
global_oof_score = score(
    df=df_train_clinical_data,
    target_columns=target_columns,
    prediction_columns=all_prediction_columns
)
print(f'Global OOF SMAPE: {global_oof_score:.4f}')

In [None]:
# import amp_pd_peptide
# amp_pd_peptide.make_env.func_dict['__called__'] = False
# env = amp_pd_peptide.make_env()   
# iter_test = env.iter_test()

# counter = 0
# model_picks = [0,0,0,0]
# for (test, test_peptides, test_proteins, sample_submission) in iter_test:
#     df = test[['visit_id']].drop_duplicates('visit_id')
    
#     for visit_id in df.visit_id:
#         X = test.query(f'visit_id=="{visit_id}"')[:1].visit_month.values.reshape(-1,1)
#         for updrs_part in [1,2,3,4]:
#             for plus_month in [0, 6, 12, 24]:
#                 key = f'updrs_{updrs_part}_plus_month{plus_month}'
#                 rating = models[f'ls_{key}'].predict(X)
#                 prediction_id = f"{visit_id}_updrs_{updrs_part}_plus_{plus_month}_months"
#                 index = sample_submission.query(f'prediction_id=="{prediction_id}"').index
#                 sample_submission.loc[index, 'rating'] = rating
        
        
#     sample_submission['patient_id'] = sample_submission.apply('prediction_id').str.split('_', expand=True)[0].astype(int)
#     sample_submission['current_visit_month'] = sample_submission.apply('prediction_id').str.split('_', expand=True)[1].astype(int)
#     sample_submission['visit_month_offset'] = sample_submission.apply('prediction_id').str.split('_', expand=True)[5].astype(int)
#     sample_submission['prediction_visit_month'] = sample_submission['current_visit_month'] + sample_submission['visit_month_offset'].astype(int)
#     sample_submission['updrs'] = sample_submission.apply('prediction_id').str.split('_', expand=True)[3].astype(int)

#     for updrs in range(1, 5):
#         if model_picks[updrs_part-1]:
#             updrs_idx = sample_submission['updrs'] == updrs
#             sample_submission.loc[updrs_idx, 'rating'] = sample_submission.loc[updrs_idx, 'prediction_visit_month'].map(target_visit_month_medians[f'updrs_{updrs}'])
#             missing_idx = sample_submission['rating'].isnull()
#             for idx, row in sample_submission[updrs_idx & missing_idx].iterrows():
#                 target_visit_month_median_idx = np.argmin(np.abs(target_visit_month_medians.index - row['prediction_visit_month']))
#                 sample_submission.loc[idx, 'rating'] = target_visit_month_medians.iloc[target_visit_month_median_idx, updrs - 1]
    
#     sample_submission = sample_submission.loc[:, ['prediction_id', 'rating']]
    
#     if counter == 0:
#             display(test)
#             display(sample_submission)
#     counter += 1
#     env.predict(sample_submission)

In [None]:
import amp_pd_peptide
# Clear the '__called__' flag before creating the environment.
# NOTE(review): presumably this lets make_env() be called again in the same kernel; confirm.
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()
iter_test = env.iter_test()

target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]

counter = 0
# One flag per entry of `target`, in the same order:
# 1 -> rating comes from the visit-month median table, 0 -> from the fitted model.
model_picks = [0,1,0,1]
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    # First row of each visit_id supplies that visit's visit_month.
    first_visits = test.drop_duplicates('visit_id')

    for visit_id, visit_month in zip(first_visits['visit_id'], first_visits['visit_month']):
        for u, use_median in zip(target, model_picks):
            # Ratings of median-picked targets are completely overwritten below,
            # so model predictions for them would be thrown away.
            if use_median:
                continue
            for plus_month in [0, 6, 12, 24]:
                # The models were fitted on visit_month, so predict at the horizon month.
                rating = models[u].predict([[visit_month + plus_month]])[0]

                prediction_id = f"{visit_id}_{u}_plus_{plus_month}_months"
                # A boolean mask with a scalar value is a no-op when no row matches.
                row_mask = sample_submission['prediction_id'] == prediction_id
                sample_submission.loc[row_mask, 'rating'] = rating

    # Split prediction_id on '_' once; fields 1, 3 and 5 are read as the current
    # visit month, the UPDRS part number and the month offset.
    id_parts = sample_submission['prediction_id'].str.split('_', expand=True)
    sample_submission['current_visit_month'] = id_parts[1].astype(int)
    sample_submission['visit_month_offset'] = id_parts[5].astype(int)
    sample_submission['prediction_visit_month'] = sample_submission['current_visit_month'] + sample_submission['visit_month_offset']
    sample_submission['updrs'] = id_parts[3].astype(int)

    for updrs in range(1, 5):
        if not model_picks[updrs-1]:
            continue
        updrs_idx = sample_submission['updrs'] == updrs
        sample_submission.loc[updrs_idx, 'rating'] = sample_submission.loc[updrs_idx, 'prediction_visit_month'].map(target_visit_month_medians[f'updrs_{updrs}'])
        # Months missing from the median table fall back to the nearest month in the table.
        missing_idx = sample_submission['rating'].isnull()
        for idx, row in sample_submission[updrs_idx & missing_idx].iterrows():
            target_visit_month_median_idx = np.argmin(np.abs(target_visit_month_medians.index - row['prediction_visit_month']))
            # Column position updrs - 1 relies on the table's columns being ordered updrs_1..updrs_4.
            sample_submission.loc[idx, 'rating'] = target_visit_month_medians.iloc[target_visit_month_median_idx, updrs - 1]

    # Keep only the two columns that are submitted.
    sample_submission = sample_submission.loc[:, ['prediction_id', 'rating']]

    # Show the first batch as a sanity check.
    if counter == 0:
        display(test)
        display(sample_submission)
    counter += 1
    env.predict(sample_submission)