##### This is my 3rd place solution to Kaggle's AMP®-Parkinson's Disease Progression Prediction competition.

##### The discussion describing this solution is [here][1]

[1]: https://www.kaggle.com/competitions/amp-parkinsons-disease-progression-prediction/discussion/411546

# 1. Preprocessing

In [None]:
# Version label for this run; it is printed next to the CV scores further down.
VER = "v999"

In [None]:
import sys
# Make the competition input directory importable.
# NOTE(review): presumably this is where the `amp_pd_peptide_310` module imported
# in the Time Series API cell lives — confirm.
sys.path.append('/kaggle/input/amp-parkinsons-disease-progression-prediction')

In [None]:
import pandas as pd
# Display up to 150 rows and every column when a DataFrame is shown.
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
# Suppress all warnings from here on (this also hides library deprecation warnings).
warnings.filterwarnings('ignore')

In [None]:
# SMAPE: Metric for this competition
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Both inputs are shifted by +1 before the error is computed (the shift was
    added on 2023/3/19 according to the original note). A pair whose shifted
    values are both zero contributes 0 to the mean.

    Args:
        y_true: sequence of observed values.
        y_pred: sequence of predicted values, same length as ``y_true``.

    Returns:
        100 * mean(|a - p| / ((|a| + |p|) / 2)) over the shifted values.
    """
    actual = np.array(y_true) + 1
    forecast = np.array(y_pred) + 1

    abs_error = np.abs(actual - forecast)
    half_scale = (np.abs(actual) + np.abs(forecast)) / 2

    # Only divide where at least one of the two shifted values is non-zero;
    # the remaining entries keep their initial 0.
    usable = ~((actual == 0) & (forecast == 0))
    ratios = np.zeros(len(actual))
    ratios[usable] = abs_error[usable] / half_scale[usable]

    return 100 * np.mean(ratios)

In [None]:
# Missing value Completion
def kesson_hokan(df):
    """Encode the medication-state column and fill gaps by per-patient interpolation.

    ``df['upd23b_clinical_state_on_medication']`` is rewritten on the frame that
    is passed in: 'On' becomes 0, 'Off' becomes 1 and missing entries become 0.
    The frame is then indexed by ``visit_month``, grouped by ``patient_id`` and
    every group is interpolated with ``method='index'``.

    Returns:
        The interpolated frame after ``reset_index()``.
    """
    df['upd23b_clinical_state_on_medication'] = df['upd23b_clinical_state_on_medication'].replace('On', 0)
    df['upd23b_clinical_state_on_medication'] = df['upd23b_clinical_state_on_medication'].replace('Off', 1)
    df['upd23b_clinical_state_on_medication'] = df['upd23b_clinical_state_on_medication'].fillna(0)
    # visit_month is the index here, so method='index' interpolates against the
    # month values rather than against row position.
    # NOTE(review): the index/column layout produced by groupby.apply followed by
    # reset_index has changed between pandas releases — confirm when upgrading pandas.
    df = df.set_index('visit_month').groupby('patient_id').apply(lambda group: group.interpolate(method='index')).reset_index()
    return df

# 2. Read Data

In [None]:
# Read data
# Supplemental clinical records; add_flg = 1 marks rows that come from this file.
train_sup = pd.read_csv('../input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
print(train_sup.shape)
train_sup["add_flg"] = 1
# Latest visit month per patient, repeated on each of that patient's rows.
train_sup['visit_month_max'] = train_sup.groupby('patient_id')['visit_month'].transform('max')
# Keep only supplemental patients whose last visit is at month 6 or later;
# the author treats patients that stop earlier (e.g. at month 5) as irregular.
train_sup = train_sup[train_sup["visit_month_max"] >= 6].copy()
print(train_sup.shape)
# Main clinical, protein and peptide tables.
train_cli = pd.read_csv('../input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
print(train_cli.shape)
train_pro = pd.read_csv('../input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
print(train_pro.shape)
train_pep = pd.read_csv('../input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
print(train_pep.shape)
# Stack main + supplemental clinical rows, ordered by patient and visit month,
# then encode the medication state and interpolate gaps per patient.
train = pd.concat([train_cli, train_sup]).sort_values(by=["patient_id", "visit_month"])
train = kesson_hokan(train)
# Rows without a supplemental flag get add_flg = 0.
train["add_flg"] = train["add_flg"].fillna(0)
# Ordinal code for each listed visit month (0 -> 1, 6 -> 2, ..., 84 -> 10);
# get_diff_monthkbn maps visit_month through this table.
dic_monthkbn = {0:1, 6:2, 12:3, 18:4, 24:5, 36:6, 48:7, 60:8, 72:9, 84:10}
def get_diff(df):
    """Add visit-spacing features to ``df`` in place and return the same frame.

    Expects ``patient_id``, ``visit_month`` and ``add_flg`` columns. The
    per-patient ``diff`` / ``cumcount`` / ``cummin`` follow row order, so rows
    should already be ordered by visit within each patient.

    Columns written:
        visit_month_diff: months since the patient's previous row (NaN on the first row).
        visit_month_count: 1-based running number of the patient's rows.
        visit_month_cummin: smallest ``visit_month_diff`` seen so far for the patient.
        visit_month_cummin_group: 0 when ``add_flg == 1`` or no gap has been seen yet,
            3 when the smallest gap is >= 12, 2 when it is >= 6, otherwise 1.

    Returns:
        ``df`` itself, with the columns above added.
    """
    df["visit_month_diff"] = df.groupby(["patient_id"])["visit_month"].diff()
    df["visit_month_count"] = df.groupby(["patient_id"])["patient_id"].cumcount() + 1
    df["visit_month_cummin"] = df.groupby(["patient_id"])["visit_month_diff"].cummin()

    smallest_gap = df["visit_month_cummin"]
    # np.select returns the choice of the first condition that matches, so the
    # conditions are listed from highest to lowest priority. The column is
    # computed in one assignment instead of ``df[col].mask(..., inplace=True)``:
    # that form is chained assignment and silently does nothing when pandas
    # Copy-on-Write is active.
    df["visit_month_cummin_group"] = np.select(
        [
            (df["add_flg"] == 1).to_numpy(),
            (smallest_gap >= 12).to_numpy(),
            (smallest_gap >= 6).to_numpy(),
            smallest_gap.isna().to_numpy(),
        ],
        [0, 3, 2, 0],
        default=1,
    )
    return df
def get_diff_monthkbn(df, col):
    """Add month-code features to ``df`` in place and return the same frame.

    ``visit_month`` is translated through ``dic_monthkbn``; months that are not
    in the table become NaN and then take the patient's previous code via a
    per-patient forward fill. ``col`` is a prefix for the new column names.

    Columns written (each prefixed with ``col``):
        monthkbn: the month code.
        monthkbn_diff: change in code from the patient's previous row.
        monthkbn_cumcount: 1-based running number of the patient's rows.
        monthkbn_cummax: largest ``monthkbn_diff`` seen so far for the patient.
    """
    patient_key = ["patient_id"]
    code_col = f"{col}monthkbn"

    df[code_col] = df["visit_month"].map(dic_monthkbn)
    df[code_col] = df.groupby(patient_key)[code_col].ffill()

    code_step = df.groupby(patient_key)[code_col].diff()
    df[f"{col}monthkbn_diff"] = code_step
    df[f"{col}monthkbn_cumcount"] = 1 + df.groupby(patient_key)["visit_id"].cumcount()
    df[f"{col}monthkbn_cummax"] = df.groupby(patient_key)[f"{col}monthkbn_diff"].cummax()
    return df
def get_diff_monthkbn_merge(df, df_pro):
    """Left-join the protein month-code features onto ``df`` and fill the gaps.

    After the join on (patient_id, visit_month, visit_id), ``pro_monthkbn``,
    ``pro_monthkbn_cumcount`` and ``pro_monthkbn_cummax`` are carried forward
    within each patient. Whatever is still missing afterwards, and every
    missing ``pro_monthkbn_diff``, is set to 0.

    Returns:
        A new merged frame; ``df`` itself is not modified.
    """
    merged = df.merge(df_pro, on=["patient_id", "visit_month", "visit_id"], how="left")

    # The diff column is deliberately not carried forward: it only gets the 0 fill.
    for name in ("pro_monthkbn", "pro_monthkbn_cumcount", "pro_monthkbn_cummax"):
        merged[name] = merged.groupby(["patient_id"])[name].ffill()

    for name in ("pro_monthkbn", "pro_monthkbn_diff", "pro_monthkbn_cumcount", "pro_monthkbn_cummax"):
        merged[name] = merged[name].fillna(0)
    return merged
# Visit-spacing and month-code features on the clinical rows (copies are passed
# because both helpers write their columns onto the frame they receive).
train1 = get_diff(train.reset_index(drop=True).copy())
train1 = get_diff_monthkbn(train1.reset_index(drop=True).copy(), "")
# One row per visit that has protein data, with the same month-code features
# under the "pro_" prefix.
train_pro_diff_monthkbn = train_pro[["patient_id", "visit_month", "visit_id"]].drop_duplicates().sort_values(by=["patient_id", "visit_month"])
train_pro_diff_monthkbn = get_diff_monthkbn(train_pro_diff_monthkbn.reset_index(drop=True).copy(), "pro_")
train1 = get_diff_monthkbn_merge(train1, train_pro_diff_monthkbn)
# total_month and months are both plain copies of visit_month on the training rows.
train1["total_month"] = train1["visit_month"]
train1 = train1.sort_values(by=["patient_id", "visit_month"]).reset_index(drop=True)
# Rebuild visit_id as "<patient_id>_<visit_month>" (done after the merge above, which joins on visit_id).
train1["visit_id"] = train1["patient_id"].astype(str) + "_" + train1["visit_month"].astype(str)
train1["months"] = train1["visit_month"]
print(train1.shape)

In [None]:
# Names of the four rating columns; used to build the estimate table and to score predictions.
targets = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

In [None]:
# Shape of the combined clinical frame, for reference.
print(train.shape)

# 3. Feature Engineering (Protein and Peptide)

In [None]:
# rank normalization
def get_count_rank(df, train_pro_count, train_pep_count):
    """Attach percentile ranks of the per-visit protein / peptide counts to ``df``.

    For every distinct ``visit_month`` m in ``df``, the count tables are cut
    down to visits with ``visit_month <= m`` and ranked (descending, as a
    fraction, ties averaged); the rows of ``df`` at month m then receive the
    rank of their own visit. Running mean / min / max of the count rank are
    added per patient afterwards.

    Side effect: a running per-patient mean column (``NPX_count_cummean`` /
    ``Pep_count_cummean``) is written onto the two count tables passed in.

    Returns:
        A new frame sorted by patient_id, visit_month, total_month.
    """
    join_keys = ['patient_id', 'visit_month']
    # (count table, count column, running-mean column, rank column, running-mean rank column)
    sources = (
        (train_pro_count, "NPX_count", "NPX_count_cummean", "NPX_count_rank", "NPX_count_cummean_rank"),
        (train_pep_count, "Pep_count", "Pep_count_cummean", "Pep_count_rank", "Pep_count_cummean_rank"),
    )

    for counts, count_col, cummean_col, _, _ in sources:
        counts[cummean_col] = counts.groupby(["patient_id"])[count_col].transform(lambda s: s.expanding().mean())

    snapshots = []
    for month in sorted(df['visit_month'].unique()):
        snapshot = df[df['visit_month'] == month].copy()
        for counts, count_col, cummean_col, rank_col, cummean_rank_col in sources:
            # Rank only against visits that had happened by this month.
            seen = counts[counts['visit_month'] <= month].copy()
            seen[rank_col] = seen[count_col].rank(method="average", ascending=False, pct=True)
            seen[cummean_rank_col] = seen[cummean_col].rank(method="average", ascending=False, pct=True)
            snapshot = snapshot.merge(seen[join_keys + [rank_col, cummean_col, cummean_rank_col]], on=join_keys, how="left")
        snapshots.append(snapshot)

    df_y = pd.concat(snapshots).sort_values(by=["patient_id", "visit_month", "total_month"])

    running = (
        ("NPX_count_rank", "NPX_count_rank_cummean", "NPX_count_rank_cummin", "NPX_count_rank_cummax"),
        ("Pep_count_rank", "Pep_count_rank_cummean", "Pep_count_rank_cummin", "Pep_count_rank_cummax"),
    )
    for rank_col, mean_col, min_col, max_col in running:
        df_y[mean_col] = df_y.groupby(["patient_id"])[rank_col].transform(lambda s: s.expanding().mean())
        df_y[min_col] = df_y.groupby(["patient_id"])[rank_col].cummin()
        df_y[max_col] = df_y.groupby(["patient_id"])[rank_col].cummax()
    return df_y
    
def get_count_rank_qcut(df, train_pro, train_pep):
    """Join per-visit protein / peptide count and spread statistics onto ``df``.

    Per (patient_id, visit_month) the standard deviation and the number of
    ``NPX`` values (proteins) and ``PeptideAbundance`` values (peptides) are
    computed, each with a 10-quantile label column (labels 1..10). These are
    left-joined onto ``df``, the rank features from ``get_count_rank`` are
    added, and the rank-based columns are carried forward within each patient.

    Returns:
        A new frame sorted by patient_id, visit_month, total_month.
    """
    visit_keys = ['patient_id', 'visit_month']

    def _with_deciles(stat, stat_name, qcut_name):
        # Per-visit statistic -> table with the statistic and its quantile label.
        # NOTE(review): fixed labels together with duplicates='drop' look like they
        # would fail if any quantile edges coincide — confirm on new data.
        table = stat.rename(stat_name).reset_index()
        table[qcut_name] = pd.qcut(table[stat_name], 10, labels=[i for i in range(1, 11, 1)], duplicates='drop')
        return table

    pro_by_visit = train_pro.groupby(["patient_id", "visit_month"])["NPX"]
    pep_by_visit = train_pep.groupby(["patient_id", "visit_month"])["PeptideAbundance"]
    train_pro_var = _with_deciles(pro_by_visit.std(), 'NPX_std', "NPX_std_qcut")
    train_pep_var = _with_deciles(pep_by_visit.std(), 'Pep_std', "Pep_std_qcut")
    train_pro_count = _with_deciles(pro_by_visit.count(), 'NPX_count', "NPX_count_qcut")
    train_pep_count = _with_deciles(pep_by_visit.count(), 'Pep_count', "Pep_count_qcut")

    df_y = df
    for table in (train_pro_count, train_pep_count, train_pro_var, train_pep_var):
        df_y = df_y.merge(table, on=visit_keys, how="left")
    df_y = df_y.sort_values(by=["patient_id", "visit_month", "total_month"])
    df_y = get_count_rank(df_y, train_pro_count, train_pep_count)

    # Forward-filled copies of the raw count ranks (the raw columns keep their gaps).
    df_y['NPX_count_rank_ffill'] = df_y.groupby(['patient_id'])['NPX_count_rank'].ffill()
    df_y['Pep_count_rank_ffill'] = df_y.groupby(['patient_id'])['Pep_count_rank'].ffill()

    # These columns are forward-filled in place within each patient.
    carried_forward = [
        'NPX_count_rank_cummean', 'Pep_count_rank_cummean',
        'NPX_count_rank_cummin', 'Pep_count_rank_cummin',
        'NPX_count_rank_cummax', 'Pep_count_rank_cummax',
        'NPX_count_cummean_rank', 'Pep_count_cummean_rank',
    ]
    for name in carried_forward:
        df_y[name] = df_y.groupby(['patient_id'])[name].ffill()
    return df_y

# Add the per-visit count / spread / rank features to the training frame
# (shapes are printed before and after for reference).
print(train.shape)
train2 = get_count_rank_qcut(train1, train_pro, train_pep)
print(train2.shape)

In [None]:
# Use SMAPE to normalize some features
def get_smape(df, df_moto, abs_flg):
    """Rescale every feature column of ``df`` relative to a reference mean.

    Each column other than visit_id / patient_id / visit_month is replaced by
    ``(x - m) / (x + m)``, where ``m`` is the mean of the same column in
    ``df_moto``. With ``abs_flg`` set, the absolute value is stored instead.

    ``df`` is modified in place; the returned frame is a copy of it.
    """
    id_columns = ("visit_id", 'patient_id', "visit_month")
    feature_columns = [name for name in df.columns.tolist() if name not in id_columns]
    for name in feature_columns:
        reference_mean = df_moto[name].mean()
        scaled = (df[name] - reference_mean) / (df[name] + reference_mean)
        df[name] = scaled.astype(float).abs() if abs_flg else scaled
    return df.copy()

def get_propep_rank(df, col_rank, col_moto):
    """Add ``col_rank``: the percentile rank of ``col_moto`` among visits so far.

    For each distinct ``visit_month`` m, all rows with ``visit_month <= m`` are
    ranked on ``col_moto`` (descending, as a fraction, ties averaged) and the
    rows at month m receive the rank of their own (patient_id, visit_month).

    Returns:
        A new frame sorted by patient_id and visit_month.
    """
    join_keys = ['patient_id', 'visit_month']
    pieces = []
    for month in sorted(df['visit_month'].unique()):
        history = df[df['visit_month'] <= month].copy()
        history[col_rank] = history[col_moto].rank(method="average", ascending=False, pct=True)
        current = df[df['visit_month'] == month].copy()
        pieces.append(current.merge(history[join_keys + [col_rank]], on=join_keys, how="left"))
    return pd.concat(pieces).sort_values(by=["patient_id", "visit_month"])

# Protein and peptide were only partially effective
from sklearn import preprocessing
def get_propep(train, train_pro, train_pep):
    """Build protein / peptide summary features and join them onto ``train``.

    Steps:
      1. Pivot proteins (``NPX`` per ``UniProt``) and peptides
         (``PeptideAbundance`` per encoded peptide label) to one row per visit.
      2. Rescale every pivoted column with ``get_smape`` against its own mean.
      3. Compute row-wise statistics (var, std, mean, sum, ...) of the rescaled
         values and of their change from the patient's previous visit.
      4. Left-join those statistics onto ``train``, forward-fill them within
         each patient and derive the combined ``propep_*`` / harmonic-mean
         features and the per-month availability counters.

    ``train`` must already contain the count-rank columns produced by
    ``get_count_rank_qcut`` (``NPX_count``, ``NPX_count_rank_cummean``, ...).

    Side effect: the columns ``UniProt_Pep`` and ``le_Pep`` are added to the
    ``train_pep`` frame that is passed in.

    Returns:
        A new frame with one row per row of ``train``, sorted by patient_id and
        visit_month.
    """
    # Label every (UniProt, Peptide) pair as "<UniProt>p<zero-padded code>".
    train_pep["UniProt_Pep"] = train_pep["UniProt"] + train_pep["Peptide"]
    le = preprocessing.LabelEncoder()
    le.fit(train_pep["UniProt_Pep"])
    train_pep["le_Pep"] = le.transform(train_pep["UniProt_Pep"])
    train_pep["le_Pep"] = train_pep["UniProt"].astype(str) + "p" + train_pep["le_Pep"].astype(str).str.zfill(3)
    unique_UniProt = train_pro['UniProt'].unique().tolist()
    unique_ProtPep = train_pep['le_Pep'].unique().tolist()
    # One row per visit, one column per protein / peptide label.
    train_pro2 = train_pro.pivot(index=['visit_id', 'patient_id', "visit_month"], columns='UniProt', values='NPX').sort_values(by=['patient_id', 'visit_month']).reset_index()
    train_pep2 = train_pep.pivot(index=['visit_id', 'patient_id', "visit_month"], columns='le_Pep', values='PeptideAbundance').sort_values(by=['patient_id', 'visit_month']).reset_index()
    df_y = train.copy().sort_values(by=['patient_id', 'visit_month']).reset_index(drop=True)
    list_feat_propep_moto = []
    for text, df, unique_cols in [["pro", train_pro2, unique_UniProt], ["pep", train_pep2, unique_ProtPep]]:
        # Rescale each column against its own mean (signed, not absolute).
        df = get_smape(df.copy(), df.copy(), False)
        # Row-wise statistics over all protein / peptide columns of a visit.
        df[f'{text}_var'] = df[unique_cols].var(axis=1)
        df = get_propep_rank(df, f'{text}_var_rank', f'{text}_var')
        df[f'{text}_std'] = df[unique_cols].std(axis=1)
        df[f'{text}_abs_mean'] = np.abs(df[unique_cols]).mean(axis=1)
        df[f'{text}_mean'] = df[unique_cols].mean(axis=1)
        df[f'{text}_mean_clipupper0'] = df[unique_cols].clip(upper=0).mean(axis=1)
        df[f"{text}_mean_minus"] = np.where(df[f"{text}_mean"] < 0, 1, 0)
        df[f'{text}_sum'] = df[unique_cols].sum(axis=1)
        df[f'{text}_sum_cummean'] = df.groupby(["patient_id"])[f'{text}_sum'].transform(lambda x: x.expanding().mean())
        df = get_propep_rank(df, f'{text}_sum_cummean_rank',  f'{text}_sum_cummean')
        df[f'{text}_sum_clipupper0'] = df[unique_cols].clip(upper=0).sum(axis=1)
        # Same statistics on the change from the patient's previous visit.
        df_diff = pd.concat([df[['visit_id', 'patient_id', 'visit_month']], df.groupby('patient_id')[unique_cols].diff()], axis=1)
        df_diff[f'{text}_diff_var'] = df_diff[unique_cols].var(axis=1)
        df_diff[f'{text}_diff_var_cummean'] = df_diff.groupby(["patient_id"])[f'{text}_diff_var'].transform(lambda x: x.expanding().mean())
        df_diff = get_propep_rank(df_diff, f'{text}_diff_var_cummean_rank',  f'{text}_diff_var_cummean')
        df_diff[f'{text}_diff_std'] = df_diff[unique_cols].std(axis=1)
        df_diff[f'{text}_diff_abs_mean'] = np.abs(df_diff[unique_cols]).mean(axis=1)
        df_diff[f'{text}_diff_mean'] = df_diff[unique_cols].mean(axis=1)
        df_diff[f'{text}_diff_mean_clipupper0'] = df_diff[unique_cols].clip(upper=0).mean(axis=1)
        df_diff[f'{text}_diff_sum'] = df_diff[unique_cols].sum(axis=1)
        df_diff[f'{text}_diff_sum_clipupper0'] = df_diff[unique_cols].clip(upper=0).sum(axis=1)
        list_feat_propep = [f"{text}{i}" for i in ["_var", "_var_rank", "_std", "_abs_mean", "_mean", "_mean_clipupper0", "_mean_minus", "_sum", "_sum_cummean", "_sum_cummean_rank", "_sum_clipupper0"]]
        list_feat_propep_diff = [f"{text}_diff{i}" for i in ["_var", "_var_cummean", "_var_cummean_rank", "_std", "_abs_mean", "_mean", "_mean_clipupper0", "_sum", "_sum_clipupper0"]]
        df_y = df_y.merge(df[["visit_id", "patient_id", "visit_month"] + list_feat_propep], on=['visit_id', 'patient_id', "visit_month"], how="left")
        df_y = df_y.merge(df_diff[["visit_id"] + list_feat_propep_diff], on=["visit_id"], how="left")
        list_feat_propep_moto += list_feat_propep
        list_feat_propep_moto += list_feat_propep_diff
    # 1 when either the protein mean or the peptide mean of the visit is negative.
    df_y["propep_mean_minus"] = (df_y["pro_mean_minus"] + df_y["pep_mean_minus"]).clip(lower=0, upper=1)
    df_y["propep_mean_minus_cummean"] = df_y.groupby(["patient_id"])["propep_mean_minus"].transform(lambda x: x.expanding().mean())
    df_y["propep_mean_minus_cummax"] = df_y.groupby(["patient_id"])["propep_mean_minus"].cummax()
    df_y["propep_mean_minus_cummax_cumsum"] = df_y.groupby(["patient_id"])["propep_mean_minus_cummax"].cumsum()
    list_feat_propep_add = ["propep_mean_minus_cummean", "propep_mean_minus_cummax", "propep_mean_minus_cummax_cumsum"]
    # Visits without protein / peptide data inherit the patient's latest values.
    for i in list_feat_propep_moto + list_feat_propep_add:
        df_y[i] = df_y.groupby('patient_id')[i].ffill()
    # propep_status: 2 = level and diff features available, 1 = a diff feature is
    # missing, 0 = a level feature is missing. The masked result is assigned back
    # rather than using ``mask(..., inplace=True)`` on a column selection, which is
    # chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
    df_y["propep_status"] = 2
    df_y["propep_status"] = df_y["propep_status"].mask(df_y["pro_diff_mean"].isna() | df_y["pep_diff_mean"].isna(), 1)
    df_y["propep_status"] = df_y["propep_status"].mask(df_y["pro_mean"].isna() | df_y["pep_mean"].isna(), 0)
    df_y["propep_status_cummin"] = df_y.groupby(["patient_id"])["propep_status"].cummin()
    # Weighted harmonic means: the protein rank gets weight 2, the peptide rank weight 1.
    df_y["NPX_Pep_count_rank_cummean_harmean"] = 3 / ((2 / df_y["NPX_count_rank_cummean"]) + (1 / df_y["Pep_count_rank_cummean"]))
    df_y["NPX_Pep_count_rank_cummin_harmean"] = 3 / ((2 / df_y["NPX_count_rank_cummin"]) + (1 / df_y["Pep_count_rank_cummin"]))
    df_y["propep_sum_rank_harmean"] = 3 / ((2 / df_y["pro_sum_cummean_rank"]) + (1 / df_y["pep_sum_cummean_rank"]))
    df_y["propep_diff_var_rank_harmean"] = 3 / ((2 / df_y["pro_diff_var_cummean_rank"]) + (1 / df_y["pep_diff_var_cummean_rank"]))
    df_y["NPX_Pep_count_rank_harmean"] = df_y[["NPX_Pep_count_rank_cummean_harmean", "NPX_Pep_count_rank_cummin_harmean"]].mean(axis=1)
    df_y["NPX_Pep_count_and_sum"] = (2 / ((1 / df_y["NPX_Pep_count_rank_harmean"]) + (1 / df_y["propep_sum_rank_harmean"])))
    df_y["feat_propep_worse"] = df_y[["NPX_Pep_count_rank_harmean", "NPX_Pep_count_and_sum"]].min(axis=1)
    df_y["feat_propep_better"] = (df_y[["propep_sum_rank_harmean", "propep_diff_var_rank_harmean"]].mean(axis=1))
    # Per patient and per month in {0, 6, 12, 18, 24}: running count of visits at
    # that month without (_0) and with (_1) protein data.
    list_pro_monthkbn_cumcount = []
    for i in [0, 6, 12, 18, 24]:
        col0 = f"pro_monthkbn_cumcount_month{i}_0"
        col1 = f"pro_monthkbn_cumcount_month{i}_1"
        df_y[col0] = np.where(df_y["NPX_count"].isna() & (df_y["visit_month"] == i), 1, 0)
        df_y[col1] = np.where(~df_y["NPX_count"].isna() & (df_y["visit_month"] == i), 1, 0)
        list_pro_monthkbn_cumcount.extend([col0, col1])
    for i in list_pro_monthkbn_cumcount:
        df_y[i] = df_y.groupby(["patient_id"])[i].cumsum()
    for i in [0, 6, 12, 18, 24]:
        col0 = f"pro_monthkbn_cumcount_month{i}_0"
        col1 = f"pro_monthkbn_cumcount_month{i}_1"
        # A patient who is at or past month i but has no row at month i is counted
        # as "no protein data at month i". Assigned back for the same
        # Copy-on-Write reason as propep_status above.
        df_y[col0] = df_y[col0].mask((df_y[col0] + df_y[col1] == 0) & (df_y["visit_month"] >= i), 1)
    return df_y

# Add the protein / peptide summary features (this call also adds the
# UniProt_Pep and le_Pep columns to train_pep).
print(train2.shape)
train3 = get_propep(train2, train_pro, train_pep)
print(train3.shape)

# 4. Prediction (Const)

In [None]:
# Baseline rating per target: a starting value at month 0 plus the months at
# which the baseline steps up by one.
const_init1 = 4
list_const_increment1 = [3, 6, 18, 72, 84, 90, 96, 102, 108]
const_init2 = 3
list_const_increment2 = [1, 3, 6, 30, 42, 72, 84, 90, 96, 102, 108]
const_init3 = 17
list_const_increment3 = [1, 3, 6, 12, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108]
const_init4 = 0
list_const_increment4 = [72, 78, 84, 96, 108]
# estimates[(month, group, target)] -> baseline rating; the group is later
# looked up with the visit_month_cummin_group value.
estimates = {}

# UnHealthy Group
_unhealthy_plan = {
    'updrs_1': (const_init1, list_const_increment1),
    'updrs_2': (const_init2, list_const_increment2),
    'updrs_3': (const_init3, list_const_increment3),
    'updrs_4': (const_init4, list_const_increment4),
}
for i in range(145):
    for j in [0, 1, 2, 3]:
        for target in targets:
            _start, _step_months = _unhealthy_plan[target]
            # Month 0 takes the starting value; every later month carries the
            # previous month's value forward, plus one at a step month.
            _level = _start if i == 0 else estimates[(i - 1, j, target)]
            if i in _step_months:
                _level += 1
            estimates[(i, j, target)] = _level

# Healthy Group
# Overwrites the group-3 entries written above: updrs_1 is 2 before month 48
# and 3 from month 48 on; every other target is 0.
for i in range(145):
    for j in [3]:
        for target in targets:
            if target == 'updrs_1':
                estimates[(i, j, target)] = 3 if i >= 48 else 2
            else:
                estimates[(i, j, target)] = 0

In [None]:
def get_df_estimates(x):
    """Turn an estimate table into a long DataFrame.

    Args:
        x: dict mapping ``(month, month_sabun, updrs)`` tuples to a value.
           (Previously this argument was ignored and the module-level
           ``estimates`` dict was read instead.)

    Returns:
        DataFrame with one row per dict entry, in dict order, and the columns
        ``month_updrs`` (the key tuple), ``value``, ``month``, ``month_sabun``
        and ``updrs``. An empty dict yields an empty frame with these columns.
    """
    keys = list(x.keys())
    return pd.DataFrame({
        'month_updrs': keys,
        'value': list(x.values()),
        'month': [key[0] for key in keys],
        'month_sabun': [key[1] for key in keys],
        'updrs': [key[2] for key in keys],
    })

# Long table of the estimates, reshaped to one row per (month, month_sabun)
# with one column per target.
df_estimates = get_df_estimates(estimates)
df_estimates = df_estimates.pivot(index=['month', 'month_sabun'], columns='updrs', values='value').reset_index()
# Print the first and last row of every group as a quick visual check.
for i in [0, 1, 2, 3]:
    print(df_estimates[df_estimates['month_sabun'] == i].head(1))
    print(df_estimates[df_estimates['month_sabun'] == i].tail(1))

In [None]:
# Get const
# For each target, build the (total_month, visit_month_cummin_group, target) key
# of every row and look it up in `estimates`; keys that are not in the table give NaN.
for t in targets:
    train3[f'{t}_const'] = train3[['total_month', 'visit_month_cummin_group']].apply(lambda x: (x["total_month"], x["visit_month_cummin_group"], t), axis=1).map(estimates).values

# 5. Prediction (Slope)

In [None]:
# Feature columns that get_coef_rating reads; the Time Series API cell also
# merges these onto the submission rows.
list_feat_get_coef_rating = [
    "pro_monthkbn_cumcount_month0_0",
    "pro_monthkbn_cumcount_month0_1",
    "pro_monthkbn_cumcount_month6_0",
    "pro_monthkbn_cumcount_month6_1",
    "pro_monthkbn_cumcount_month12_0",
    "pro_monthkbn_cumcount_month12_1",
    "pro_monthkbn_cumcount_month18_0",
    "pro_monthkbn_cumcount_month18_1",
    "pro_monthkbn_cumcount_month24_0",
    "pro_monthkbn_cumcount_month24_1",
    "feat_propep_worse",
    "feat_propep_better",
]
def get_coef_rating(df):
    """Add rule flags, per-target adjustments and final predictions to a copy of ``df``.

    Ten 0/1 flags (``feat1`` .. ``feat9`` plus ``feat7_2``) are derived from
    ``visit_month_cummin_group``, the ``pro_monthkbn_cumcount_month*`` counters
    and ``feat_propep_worse`` / ``feat_propep_better``. Each flag carries a
    fixed integer weight for updrs_1, updrs_2 and updrs_3; the weighted sums are
    stored as ``updrs_N_slope`` and added to ``updrs_N_const`` to give
    ``updrs_N_hat``. ``updrs_4_hat`` is the constant alone.

    Returns:
        A new frame; ``df`` itself is not modified.
    """
    df_y = df.copy()

    gap_le_2 = df_y['visit_month_cummin_group'] <= 2
    gap_eq_3 = df_y["visit_month_cummin_group"] == 3
    m6_missing = df_y['pro_monthkbn_cumcount_month6_0'] == 1
    m6_present = df_y['pro_monthkbn_cumcount_month6_1'] == 1
    m12_missing = df_y['pro_monthkbn_cumcount_month12_0'] == 1
    m12_present = df_y['pro_monthkbn_cumcount_month12_1'] == 1
    m18_present = df_y['pro_monthkbn_cumcount_month18_1'] == 1
    worse = df_y["feat_propep_worse"]
    better = df_y["feat_propep_better"]

    # (flag name, condition, weights for updrs_1 / updrs_2 / updrs_3)
    rules = [
        ('feat1', gap_le_2 & m6_missing, [0, -1, -2]),
        ('feat2', gap_le_2 & m6_present & m12_missing, [0, -1, 0]),
        ('feat3', gap_le_2 & m6_present, [0, 1, 2]),
        ('feat4', gap_le_2 & m6_present & m12_present, [0, 0, 1]),
        ('feat5', gap_le_2 & m6_missing & m18_present, [-1, -1, -8]),
        ('feat6', gap_le_2 & (worse >= 0.7), [0, 1, 1]),
        ('feat7', gap_le_2 & (worse >= 0.8), [1, 1, 2]),
        ('feat7_2', gap_eq_3 & (worse >= 0.8), [1, 0, 0]),
        ('feat8', gap_le_2 & (better <= 0.3), [0, 0, -1]),
        ('feat9', gap_le_2 & (better <= 0.2), [0, -1, -1]),
    ]
    for flag_name, condition, _ in rules:
        df_y[flag_name] = np.where(condition, 1, 0)

    slope_columns = ["updrs_1_slope", "updrs_2_slope", "updrs_3_slope"]
    for slope_name in slope_columns:
        df_y[slope_name] = 0
    for flag_name, _, weights in rules:
        for slope_name, weight in zip(slope_columns, weights):
            df_y[slope_name] += df_y[flag_name] * weight

    df_y["updrs_1_hat"] = df_y["updrs_1_slope"] + df_y["updrs_1_const"]
    df_y["updrs_2_hat"] = df_y["updrs_2_slope"] + df_y["updrs_2_const"]
    df_y["updrs_3_hat"] = df_y["updrs_3_slope"] + df_y["updrs_3_const"]
    df_y["updrs_4_hat"] = df_y["updrs_4_const"]  # updrs_4 has only const
    return df_y.copy()

# Get final score
# Adds the rule flags, the updrs_N_slope adjustments and the updrs_N_hat
# predictions to the training frame.
print(train3.shape)
train4 = get_coef_rating(train3)
print(train4.shape)

# 6. CV Score

In [None]:
# CV Score
def calc_smape(df):
    """Score the ``<target>_hat`` columns of ``df`` against the observed targets.

    Every (row, target) pair whose observed value is >= 0 is collected — a
    missing value fails that comparison and is skipped — and the pairs are
    scored together with ``smape``.

    Returns:
        The score formatted as a string with four decimals.
    """
    predicted = []
    observed = []
    for _, record in df.iterrows():
        for name in targets:
            truth = record[f'{name}']
            if truth >= 0:
                predicted.append(record[f'{name}_hat'])
                observed.append(truth)
    return f"{smape(observed, predicted):.4f}"

print("=== CV Score (shape) ===")
# Subset 1: non-supplemental rows with propep_status >= 1.
train_cv1 = train4[(train4["propep_status"] >= 1) & (train4["add_flg"] == 0)].copy().reset_index(drop=True)
print(train_cv1.shape)
# Subset 2: all non-supplemental rows.
train_cv2 = train4[train4["add_flg"] == 0].copy().reset_index(drop=True)
print(train_cv2.shape)
# Subset 3: every row, supplemental rows included.
train_cv3 = train4.copy()
print(train_cv3.shape)
print("=== CV Score ===")
smape1 = calc_smape(train_cv1)
smape2 = calc_smape(train_cv2)
smape3 = calc_smape(train_cv3)
# One comma-separated line: version label followed by the three scores.
print(VER,",",smape1,",",smape2,",",smape3)

# 7. Time Series API

In [None]:
# import amp_pd_peptide (old)
import amp_pd_peptide_310

# NOTE(review): resetting this flag presumably allows make_env() to be called
# again in the same session — confirm against the competition API.
amp_pd_peptide_310.make_env.func_dict['__called__'] = False
env = amp_pd_peptide_310.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# The API will deliver four dataframes in this specific order:
# Everything seen so far is accumulated (starting from the training tables) so
# that each step recomputes the features over the full history.
list_test = [train]
list_pro = [train_pro]
list_pep = [train_pep]
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    list_test.append(test)
    list_pro.append(test_proteins)
    list_pep.append(test_peptides)
    # This maps the correct value estimate to each line in sample_submission
    # prediction_id is split on "_": element 0 is the patient, elements 0-1 form
    # the visit id, elements 2-3 the target name, and the numbers at positions 1
    # and 5 are added to give the month being predicted.
    sample_submission['patient_id'] = sample_submission['prediction_id'].str.split('_').apply(lambda x: int(x[0]))
    sample_submission['visit_id'] = sample_submission['prediction_id'].str.split('_').apply(lambda x: '_'.join(x[0:2]))
    sample_submission['targets_updrs'] = sample_submission['prediction_id'].str.split('_').apply(lambda x: '_'.join(x[2:4]))
    sample_submission['months'] = sample_submission['prediction_id'].str.split('_').apply(lambda x: int(x[1]) + int(x[5]))
    # Rebuild the same feature pipeline that was used on the training frame.
    test_all = pd.concat(list_test).drop_duplicates(["patient_id", "visit_month", "visit_id"]).sort_values(by=["patient_id", "visit_month"])
    test_all["add_flg"] = 0
    test_all["total_month"] = test_all["visit_month"]
    test_all = get_diff(test_all.copy())
    test_pro = pd.concat(list_pro).sort_values(by=['patient_id', 'visit_month']).reset_index()
    test_pep = pd.concat(list_pep).sort_values(by=['patient_id', 'visit_month']).reset_index()
    test_all = get_diff_monthkbn(test_all.reset_index(drop=True).copy(), "")
    test_pro_diff_monthkbn = test_pro[["patient_id", "visit_month", "visit_id"]].drop_duplicates().sort_values(by=["patient_id", "visit_month"])
    test_pro_diff_monthkbn = get_diff_monthkbn(test_pro_diff_monthkbn.reset_index(drop=True).copy(), "pro_")
    test_all1 = get_diff_monthkbn_merge(test_all, test_pro_diff_monthkbn)
    test_all2 = get_count_rank_qcut(test_all1, test_pro, test_pep)
    test_all3 = get_propep(test_all2, test_pro, test_pep)
    list_feat_status = ["visit_month_cummin_group", "propep_status"]
    sample_submission = sample_submission.merge(test_all3[["visit_id"] + list_feat_status + list_feat_get_coef_rating], on=["visit_id"], how="left")
    for i in list_feat_get_coef_rating:
        sample_submission[i] = sample_submission[i].fillna(0)
    # Each row is looked up with its OWN target, so the four *_const columns
    # below all hold the same value: the baseline for that row's target.
    # Keys that are not in `estimates` fall back to 0.
    targets_months_updrs = sample_submission[['months', "visit_month_cummin_group", 'targets_updrs']].apply(lambda x: (x["months"], x["visit_month_cummin_group"], x["targets_updrs"]), axis=1)
    sample_submission['updrs_1_const'] = targets_months_updrs.map(estimates).fillna(0)
    sample_submission['updrs_2_const'] = targets_months_updrs.map(estimates).fillna(0)
    sample_submission['updrs_3_const'] = targets_months_updrs.map(estimates).fillna(0)
    sample_submission['updrs_4_const'] = targets_months_updrs.map(estimates).fillna(0)
    sample_submission = get_coef_rating(sample_submission)
    # Pick the prediction that matches the row's target. The masked result is
    # assigned back rather than using ``mask(..., inplace=True)`` on a column
    # selection: that form is chained assignment and, under pandas Copy-on-Write,
    # would leave every row at its updrs_1 prediction.
    sample_submission['rating'] = sample_submission['updrs_1_hat']
    sample_submission['rating'] = sample_submission['rating'].mask((sample_submission["targets_updrs"] == "updrs_2"), sample_submission['updrs_2_hat'])
    sample_submission['rating'] = sample_submission['rating'].mask((sample_submission["targets_updrs"] == "updrs_3"), sample_submission['updrs_3_hat'])
    sample_submission['rating'] = sample_submission['rating'].mask((sample_submission["targets_updrs"] == "updrs_4"), sample_submission['updrs_4_hat'])
    
    # debug
    print(sample_submission[['prediction_id', 'rating']].head(1))
    print(sample_submission[['prediction_id', 'rating']].tail(1))
    
    # Saves predictions to csv file
    env.predict(sample_submission[['prediction_id', 'rating']])

In [None]:
# check normal end
# Read the written submission file back and display it as the cell output.
submission = pd.read_csv('/kaggle/working/submission.csv')
submission