In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [14]:
import warnings
warnings.filterwarnings("ignore")

# Read the dataframes

In [16]:
train_proteins = pd.read_csv("train_proteins.csv")
train_peptides = pd.read_csv("train_peptides.csv")
train_clinical = pd.read_csv("train_clinical_data.csv")

# Merge the dataframes

In [20]:
merged_df = train_peptides.merge(
    train_proteins, on=["visit_id", "visit_month", "patient_id", "UniProt"], how="inner"
).drop_duplicates()

merged_df.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0,732430.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0,732430.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9,732430.0
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7,732430.0


In [21]:
merged_df.shape

(981834, 7)

In [22]:
# from pivottablejs import pivot_ui
# pivot_ui(train_clinical)

In [23]:
# Create score columns initialized with zeros
merged_df["Peptide oxidation score"] = 0
merged_df["Protein oxidation score"] = 0
merged_df["Peptide carbamidomethylation score"] = 0
merged_df["Protein carbamidomethylation score"] = 0

def preprocessPeptides(args):
    (patientID, proteinID), currentPeptides = args
    peptideCount = len(currentPeptides)

    if peptideCount == 0:
        return currentPeptides

    def deserializeUniMod(peptide, scoreAndRemove=['UniMod_35', 'UniMod_4']):
        oxidationScore, carbamidomethylationScore = 0, 0
        for string in scoreAndRemove:
            while string in peptide:
                peptide = peptide.replace(f"({string})", "", 1)
                if string == 'UniMod_35':
                    oxidationScore += np.log(len(peptide))
                elif string == 'UniMod_4':
                    carbamidomethylationScore += np.log(len(peptide))
        return pd.Series([peptide, oxidationScore, carbamidomethylationScore])

    currentPeptides[['Peptide', 'Peptide oxidation score', 'Peptide carbamidomethylation score']] = currentPeptides['Peptide'].apply(deserializeUniMod)

    normalized_carbamidomethylation_score = currentPeptides['Peptide carbamidomethylation score'].sum() / peptideCount
    normalized_oxidation_score = currentPeptides['Peptide oxidation score'].sum() / peptideCount

    currentPeptides["Protein oxidation score"] = normalized_oxidation_score
    currentPeptides["Protein carbamidomethylation score"] = normalized_carbamidomethylation_score

    return currentPeptides

In [24]:
from multiprocessing import Pool
import os

print("Preprocessing peptides...")
grouped_peptides = merged_df.groupby(["patient_id", "UniProt"])
with Pool(os.cpu_count()) as pool:
    preprocessed_peptides = pool.map(preprocessPeptides, [(group, data) for group, data in grouped_peptides])

# Concatenate the preprocessed peptide data and update the peptideData DataFrame
print("Concatenating...")
peptideData = pd.concat(preprocessed_peptides).reset_index(drop=True)
print("Done!")


Preprocessing peptides...


Process SpawnPoolWorker-41:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'preprocessPeptides' on <module '__main__' (built-in)>
Process SpawnPoolWorker-42:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = g

Process SpawnPoolWorker-55:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'preprocessPeptides' on <module '__main__' (built-in)>
Process SpawnPoolWorker-56:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = g

Process SpawnPoolWorker-69:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'preprocessPeptides' on <module '__main__' (built-in)>
Process SpawnPoolWorker-70:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = g

KeyboardInterrupt: 

In [14]:
peptideData.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,Peptide oxidation score,Protein oxidation score,Peptide carbamidomethylation score,Protein carbamidomethylation score
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3,11254.3,0.0,0.0,0.0,0.0
1,55_6,6,55,O00391,NEQEQPLGQWHLS,13163.6,13163.6,0.0,0.0,0.0,0.0
2,55_12,12,55,O00391,NEQEQPLGQWHLS,15257.6,15257.6,0.0,0.0,0.0,0.0
3,55_36,36,55,O00391,NEQEQPLGQWHLS,13530.8,13530.8,0.0,0.0,0.0,0.0
4,55_0,0,55,O00533,GNPEPTFSWTK,102060.0,732430.0,0.0,0.0,0.0,0.416526


In [15]:
peptideData.describe()

Unnamed: 0,visit_month,patient_id,PeptideAbundance,NPX,Peptide oxidation score,Protein oxidation score,Peptide carbamidomethylation score,Protein carbamidomethylation score
count,981834.0,981834.0,981834.0,981834.0,981834.0,981834.0,981834.0,981834.0
mean,26.105061,32603.465361,642890.2,16997540.0,0.08227,0.08227,1.357439,1.357439
std,22.913897,18605.934422,3377989.0,63154320.0,0.479718,0.195629,2.611049,1.636037
min,0.0,55.0,10.9985,84.6082,0.0,0.0,0.0,0.0
25%,6.0,16566.0,28174.25,243258.0,0.0,0.0,0.0,0.0
50%,24.0,29313.0,74308.3,992274.0,0.0,0.0,0.0,0.789194
75%,48.0,49995.0,221338.8,3723542.0,0.0,0.08963,2.397895,2.214363
max,108.0,65043.0,178752000.0,613851000.0,4.060443,3.091042,23.348452,13.764596


In [16]:
train_clinical.shape

(2615, 8)

In [17]:
oxidation_grouped_df = peptideData.groupby(['visit_id'])['Protein oxidation score'].sum().reset_index(name='Total Protein oxidation score')
print("shape", oxidation_grouped_df.shape)
oxidation_grouped_df.head()

shape (1113, 2)


Unnamed: 0,visit_id,Total Protein oxidation score
0,10053_0,51.820117
1,10053_12,46.95991
2,10053_18,65.368599
3,10138_12,81.58707
4,10138_24,79.966085


In [18]:
allData = train_clinical.merge(oxidation_grouped_df, on='visit_id', how="left")
allData.shape

(2615, 9)

In [19]:
pep_oxidation_grouped_df = peptideData.groupby(['visit_id'])['Peptide oxidation score'].sum().reset_index(name='Total Peptide oxidation score')
print("shape", pep_oxidation_grouped_df.shape)
pep_oxidation_grouped_df.head()

shape (1113, 2)


Unnamed: 0,visit_id,Total Peptide oxidation score
0,10053_0,47.3625
1,10053_12,45.483533
2,10053_18,71.302593
3,10138_12,84.173413
4,10138_24,78.184452


In [20]:
allData = allData.merge(pep_oxidation_grouped_df, on='visit_id', how="left")
allData.shape

(2615, 10)

In [21]:
pep_carbamidomethylation_grouped_df = peptideData.groupby(['visit_id'])['Peptide carbamidomethylation score'].sum().reset_index(name='Total Peptide carbamidomethylation score')
print("shape", pep_carbamidomethylation_grouped_df.shape)
pep_carbamidomethylation_grouped_df.head()

shape (1113, 2)


Unnamed: 0,visit_id,Total Peptide carbamidomethylation score
0,10053_0,957.699273
1,10053_12,871.803343
2,10053_18,1148.221678
3,10138_12,1264.579146
4,10138_24,1229.933246


In [22]:
allData = allData.merge(pep_carbamidomethylation_grouped_df, on='visit_id', how="left")
allData.shape

(2615, 11)

In [23]:

carbamidomethylation_grouped_df = peptideData.groupby(['visit_id'])['Protein carbamidomethylation score'].sum().reset_index(name='Total Protein carbamidomethylation score')
print("shape", carbamidomethylation_grouped_df.shape)
carbamidomethylation_grouped_df.head()

shape (1113, 2)


Unnamed: 0,visit_id,Total Protein carbamidomethylation score
0,10053_0,925.055336
1,10053_12,872.157058
2,10053_18,1180.5119
3,10138_12,1255.804793
4,10138_24,1238.131384


In [24]:
allData = allData.merge(carbamidomethylation_grouped_df, on='visit_id', how="left")
allData.shape

(2615, 12)

In [25]:
allData.dtypes

visit_id                                     object
patient_id                                    int64
visit_month                                   int64
updrs_1                                     float64
updrs_2                                     float64
updrs_3                                     float64
updrs_4                                     float64
upd23b_clinical_state_on_medication          object
Total Protein oxidation score               float64
Total Peptide oxidation score               float64
Total Peptide carbamidomethylation score    float64
Total Protein carbamidomethylation score    float64
dtype: object

In [26]:
allData.shape

(2615, 12)

In [27]:
def preprocess_data(train_peptides, train_proteins, train_clinical):

    # Merge the dataframes
    merged_df = train_peptides.merge(
        train_proteins, on=["visit_id", "visit_month", "patient_id", "UniProt"], how="inner"
    ).drop_duplicates()

    merged_df["Peptide oxidation score"] = 0
    merged_df["Protein oxidation score"] = 0
    merged_df["Peptide carbamidomethylation score"] = 0
    merged_df["Protein carbamidomethylation score"] = 0

    print("Preprocessing peptides...")
    grouped_peptides = merged_df.groupby(["patient_id", "UniProt"])
    with Pool(os.cpu_count()) as pool:
        preprocessed_peptides = pool.map(preprocessPeptides, [(group, data) for group, data in grouped_peptides])

    # Concatenate the preprocessed peptide data and update the peptideData DataFrame
    print("Concatenating...")
    peptideData = pd.concat(preprocessed_peptides).reset_index(drop=True)
    print("Done!")

    oxidation_grouped_df = peptideData.groupby(['visit_id'])['Protein oxidation score'].sum().reset_index(name='Total Protein oxidation score')
    allData = train_clinical.merge(oxidation_grouped_df, on='visit_id', how="left")

    pep_oxidation_grouped_df = peptideData.groupby(['visit_id'])['Peptide oxidation score'].sum().reset_index(name='Total Peptide oxidation score')
    allData = allData.merge(pep_oxidation_grouped_df, on='visit_id', how="left")

    pep_carbamidomethylation_grouped_df = peptideData.groupby(['visit_id'])['Peptide carbamidomethylation score'].sum().reset_index(name='Total Peptide carbamidomethylation score')
    allData = allData.merge(pep_carbamidomethylation_grouped_df, on='visit_id', how="left")

    carbamidomethylation_grouped_df = peptideData.groupby(['visit_id'])['Protein carbamidomethylation score'].sum().reset_index(name='Total Protein carbamidomethylation score')
    allData = allData.merge(carbamidomethylation_grouped_df, on='visit_id', how="left")

    return allData

In [28]:
allData.dtypes

visit_id                                     object
patient_id                                    int64
visit_month                                   int64
updrs_1                                     float64
updrs_2                                     float64
updrs_3                                     float64
updrs_4                                     float64
upd23b_clinical_state_on_medication          object
Total Protein oxidation score               float64
Total Peptide oxidation score               float64
Total Peptide carbamidomethylation score    float64
Total Protein carbamidomethylation score    float64
dtype: object

In [29]:
allData['upd23b_clinical_state_on_medication'].unique()

array([nan, 'On', 'Off'], dtype=object)

In [30]:
allData['upd23b_clinical_state_on_medication'] = allData['upd23b_clinical_state_on_medication'].map({'On': 1, 'Off': 0, np.nan: np.nan})
allData['upd23b_clinical_state_on_medication'].unique()

array([nan,  1.,  0.])

In [31]:
allData.dtypes

visit_id                                     object
patient_id                                    int64
visit_month                                   int64
updrs_1                                     float64
updrs_2                                     float64
updrs_3                                     float64
updrs_4                                     float64
upd23b_clinical_state_on_medication         float64
Total Protein oxidation score               float64
Total Peptide oxidation score               float64
Total Peptide carbamidomethylation score    float64
Total Protein carbamidomethylation score    float64
dtype: object

In [32]:
numeric_columns = ['visit_month', 'Total Protein oxidation score', 'Total Peptide oxidation score', 'Total Peptide carbamidomethylation score', 'Total Protein carbamidomethylation score']

In [33]:
len(numeric_columns)

5

In [34]:
def scale_data(allData):
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler()

    allData[numeric_columns] = scaler.fit_transform(allData[numeric_columns])

    return allData

In [35]:
allData = scale_data(allData)
allData.describe()

Unnamed: 0,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,Total Protein oxidation score,Total Peptide oxidation score,Total Peptide carbamidomethylation score,Total Protein carbamidomethylation score
count,2615.0,2615.0,2614.0,2613.0,2590.0,1577.0,1288.0,1068.0,1068.0,1068.0,1068.0
mean,32651.743786,0.288804,7.110559,6.74359,19.421236,1.861763,0.601708,0.833883,0.851191,0.915849,0.906075
std,18535.7587,0.233325,5.525955,6.32323,15.000289,3.022112,0.489736,0.11777,0.135556,0.084426,0.085398
min,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16574.0,0.097222,3.0,1.0,6.0,0.0,0.0,0.784378,0.80376,0.90822,0.899042
50%,29417.0,0.222222,6.0,5.0,19.0,0.0,1.0,0.866653,0.890442,0.940738,0.930524
75%,50611.0,0.444444,10.0,10.0,29.0,3.0,1.0,0.91683,0.935835,0.960963,0.951953
max,65043.0,1.0,33.0,40.0,86.0,20.0,1.0,1.0,1.0,1.0,1.0


In [36]:
allData.isna().sum()

visit_id                                       0
patient_id                                     0
visit_month                                    0
updrs_1                                        1
updrs_2                                        2
updrs_3                                       25
updrs_4                                     1038
upd23b_clinical_state_on_medication         1327
Total Protein oxidation score               1547
Total Peptide oxidation score               1547
Total Peptide carbamidomethylation score    1547
Total Protein carbamidomethylation score    1547
dtype: int64

In [37]:
allData.shape

(2615, 12)

In [38]:
print('renaming the columns')
allData.rename(columns={'updrs_1': 'result_updrs_1', 'updrs_2': 'result_updrs_2', 'updrs_3': 'result_updrs_3', 'updrs_4': 'result_updrs_4'}, inplace=True)

renaming the columns


In [39]:
allData.head()

Unnamed: 0,visit_id,patient_id,visit_month,result_updrs_1,result_updrs_2,result_updrs_3,result_updrs_4,upd23b_clinical_state_on_medication,Total Protein oxidation score,Total Peptide oxidation score,Total Peptide carbamidomethylation score,Total Protein carbamidomethylation score
0,55_0,55,0.0,10.0,6.0,15.0,,,0.948583,0.962393,0.984426,0.972356
1,55_3,55,0.027778,10.0,7.0,25.0,,,,,,
2,55_6,55,0.055556,8.0,10.0,34.0,,,0.966861,1.0,0.981097,0.972472
3,55_9,55,0.083333,8.0,9.0,30.0,0.0,1.0,,,,
4,55_12,55,0.111111,10.0,10.0,41.0,0.0,1.0,0.958542,1.0,0.980619,0.97392


## Train Test Split

In [41]:
train_model_dictionary = {'result_updrs_1': None, 'result_updrs_2': None, 'result_updrs_3': None, 'result_updrs_4': None}

# implement catboost regressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

from sklearn.metrics import r2_score

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# define the model
train_columns = numeric_columns[:]
for target_col in train_model_dictionary.keys():
    catboost_model = CatBoostRegressor(iterations=2000, learning_rate=0.07, depth=10, verbose=0)

    cur_cols = train_columns[:]
    cur_cols.append(target_col)
    X_df = allData[cur_cols]
#     print(X_df.shape)
    X_df.dropna(axis=0, inplace=True)
#     print(X_df.shape)

    X = X_df[train_columns]
    y = X_df[target_col]

    # Split the data into training and temporary sets (70% training, 30% temporary)
    X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

    # Split the temporary set into validation and testing sets (50% validation, 50% testing)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # fit the model
    catboost_model.fit(X_train_temp, y_train_temp)

    y_pred_val = catboost_model.predict(X_val)
    y_pred_train = catboost_model.predict(X_train_temp)
    y_pred_test = catboost_model.predict(X_test)

    print("CatBoost Regressor for", target_col)

    print("=========================================================================")

    # Calculate SMAPE
    smape_score_train = smape(y_train_temp, y_pred_train)

    # Calculate MSE
    mse_score_train = mean_squared_error(y_train_temp, y_pred_train)

    # Calculate R-squared
    r_sqaured_value_train = r2_score(y_train_temp, y_pred_train)

    print("SMAPE train set:", smape_score_train)
    print("MSE train set:", mse_score_train)
    print("R-squared train set:",r_sqaured_value_train )

    # Calculate SMAPE
    smape_score_val = smape(y_val, y_pred_val)

    # Calculate MSE
    mse_score_val = mean_squared_error(y_val, y_pred_val)

    r_sqaured_value_val = r2_score(y_val, y_pred_val)

    print("SMAPE val set:", smape_score_val)
    print("MSE val set:", mse_score_val)
    print("R-squared val set:",r_sqaured_value_val)

    # Calculate SMAPE
    smape_score_test = smape(y_test, y_pred_test)

    # Calculate MSE
    mse_score_test = mean_squared_error(y_test, y_pred_test)

    r_sqaured_value_test = r2_score(y_test, y_pred_test)

    print("SMAPE test set:", smape_score_test)
    print("MSE test set:", mse_score_test)
    print("R-squared test set:",r_sqaured_value_test)

#     model_dictionary[target_col] = catboost_model

    print("=========================================================================")

    train_columns.append(target_col)

CatBoost Regressor for result_updrs_1
SMAPE train set: 13.389272437132526
MSE train set: 0.008181360093390995
R-squared train set: 0.9996920958151169
SMAPE val set: 73.60055350978041
MSE val set: 44.63593412012991
R-squared val set: -0.20476888260948845
SMAPE test set: 73.45605029081158
MSE test set: 27.847137787488972
R-squared test set: -0.01251168960494331
CatBoost Regressor for result_updrs_2
SMAPE train set: 48.863033374031424
MSE train set: 0.0008286534330956857
R-squared train set: 0.9999740408638925
SMAPE val set: 91.830565222876
MSE val set: 29.76722514836456
R-squared val set: 0.3593947330298053
SMAPE test set: 100.34743839483843
MSE test set: 20.757580945915354
R-squared test set: 0.4504752620698007
CatBoost Regressor for result_updrs_3
SMAPE train set: 35.94381433825467
MSE train set: 0.0009661751971167289
R-squared train set: 0.9999958495668722
SMAPE val set: 77.17826491036682
MSE val set: 92.71977042420629
R-squared val set: 0.5816713532376148
SMAPE test set: 79.954754811

In [42]:
train_model_dictionary = {'result_updrs_1': None, 'result_updrs_2': None, 'result_updrs_3': None, 'result_updrs_4': None}

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# define the model
train_columns = numeric_columns[:]
for target_col in train_model_dictionary.keys():
    lr = LinearRegression()

    cur_cols = train_columns[:]
    cur_cols.append(target_col)
    X_df = allData[cur_cols]
#     print(X_df.shape)
    X_df.dropna(axis=0, inplace=True)
#     print(X_df.shape)

    X = X_df[train_columns]
    y = X_df[target_col]

    # Split the data into training and temporary sets (70% training, 30% temporary)
    X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

    # Split the temporary set into validation and testing sets (50% validation, 50% testing)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # fit the model
    lr.fit(X_train_temp, y_train_temp)

    y_pred_val = lr.predict(X_val)
    y_pred_train = lr.predict(X_train_temp)
    y_pred_test = lr.predict(X_test)

    print("Linear Regression for", target_col)

    print("=========================================================================")

    # Calculate SMAPE
    smape_score_train = smape(y_train_temp, y_pred_train)

    # Calculate MSE
    mse_score_train = mean_squared_error(y_train_temp, y_pred_train)

    r_sqaured_value_train = r2_score(y_train_temp, y_pred_train)

    print("SMAPE train set:", smape_score_train)
    print("MSE train set:", mse_score_train)
    print("R-squared train set:",r_sqaured_value_train )

    # Calculate SMAPE
    smape_score_val = smape(y_val, y_pred_val)

    # Calculate MSE
    mse_score_val = mean_squared_error(y_val, y_pred_val)

    r_sqaured_value_val = r2_score(y_val, y_pred_val)

    print("SMAPE val set:", smape_score_val)
    print("MSE val set:", mse_score_val)
    print("R-squared val set:",r_sqaured_value_val)

    # Calculate SMAPE
    smape_score_test = smape(y_test, y_pred_test)

    # Calculate MSE
    mse_score_test = mean_squared_error(y_test, y_pred_test)

    r_sqaured_value_test = r2_score(y_test, y_pred_test)

    print("SMAPE test set:", smape_score_test)
    print("MSE test set:", mse_score_test)
    print("R-squared test set:",r_sqaured_value_test)

#     model_dictionary[target_col] = catboost_model

    print("=========================================================================")

    train_columns.append(target_col)

Linear Regression for result_updrs_1
SMAPE train set: 68.89956176822683
MSE train set: 25.63887021771647
R-squared train set: 0.03508520030937301
SMAPE val set: 68.73459408690684
MSE val set: 37.17643133507541
R-squared val set: -0.0034293786352781463
SMAPE test set: 73.0303441670591
MSE test set: 26.039740671054382
R-squared test set: 0.05320460490667678
Linear Regression for result_updrs_2
SMAPE train set: 87.8700287096562
MSE train set: 18.63095137962379
R-squared train set: 0.41635020943808476
SMAPE val set: 90.889841743096
MSE val set: 29.172489046610576
R-squared val set: 0.3721937452770673
SMAPE test set: 97.47101615394008
MSE test set: 17.427725630936546
R-squared test set: 0.5386280133020815
Linear Regression for result_updrs_3
SMAPE train set: 73.71895894348687
MSE train set: 91.20648909093958
R-squared train set: 0.6082010437319565
SMAPE val set: 74.232714140942
MSE val set: 87.70781806265016
R-squared val set: 0.6042840413348229
SMAPE test set: 76.9720703765268
MSE test set

## Training the models

In [43]:
model_dictionary = {'result_updrs_1': None, 'result_updrs_2': None, 'result_updrs_3': None, 'result_updrs_4': None}

from sklearn.linear_model import LinearRegression

train_columns = numeric_columns[:]

for target_col in model_dictionary.keys():
    reg = LinearRegression()

    cur_cols = train_columns[:]
    cur_cols.append(target_col)
    X_df = allData[cur_cols]
    print(X_df.shape)
    X_df.dropna(axis=0, inplace=True)
    print(X_df.shape)

    X = X_df[train_columns]
    y = X_df[target_col]

    print("Target_col", target_col)

    print("Colums considered for training", X.columns.tolist())

    reg.fit(X, y)
    model_dictionary[target_col] = reg

    train_columns.append(target_col)

(2615, 6)
(1068, 6)
Target_col result_updrs_1
Colums considered for training ['visit_month', 'Total Protein oxidation score', 'Total Peptide oxidation score', 'Total Peptide carbamidomethylation score', 'Total Protein carbamidomethylation score']
(2615, 7)
(1068, 7)
Target_col result_updrs_2
Colums considered for training ['visit_month', 'Total Protein oxidation score', 'Total Peptide oxidation score', 'Total Peptide carbamidomethylation score', 'Total Protein carbamidomethylation score', 'result_updrs_1']
(2615, 8)
(1058, 8)
Target_col result_updrs_3
Colums considered for training ['visit_month', 'Total Protein oxidation score', 'Total Peptide oxidation score', 'Total Peptide carbamidomethylation score', 'Total Protein carbamidomethylation score', 'result_updrs_1', 'result_updrs_2']
(2615, 9)
(564, 9)
Target_col result_updrs_4
Colums considered for training ['visit_month', 'Total Protein oxidation score', 'Total Peptide oxidation score', 'Total Peptide carbamidomethylation score', 'To

## Get the Predicions

In [44]:
# Reference : https://www.kaggle.com/code/renataghisloti/linearregression-simple-57-3-smape

def get_predictions(test_df, test_peptides, test_proteins, model):

    allData = preprocess_data(test_peptides, test_proteins, test_df)

    allData = scale_data(allData)

    # Forecast
    test_df = allData.fillna(0)

    print("Shape of test df",test_df.shape)

    testing_cols = numeric_columns[:]

    for target in model.keys():

        test_df[ str(target)] = 0

        # Predict
        X = test_df[testing_cols]

        if target == 'result_updrs_4':
            # The most of the values in the updrs_4 is 0.
            # Most of the values are missing
            test_df[str(target)] = 0
        else:
            # ceil it up because the values in the train dataframe are mostly integers
            test_df[str(target)] = np.ceil(model[target].predict(X))

        testing_cols.append(str(target))


    # Format for final submission
    result = pd.DataFrame()

    for m in [0, 6, 12, 24]:
        for u in [1, 2, 3, 4]:

            temp = test_df[["visit_id", "result_updrs_" + str(u)]]
            temp["prediction_id"] = temp["visit_id"] + "_updrs_" + str(u) + "_plus_" + str(m) + "_months"
            temp["rating"] = temp["result_updrs_" + str(u)]
            temp = temp [['prediction_id', 'rating']]

            result = result.append(temp)
    result = result.drop_duplicates(subset=['prediction_id', 'rating'])

    return result

In [None]:
pred_df = get_predictions(train_clinical, train_peptides, train_proteins, model_dictionary)


Preprocessing peptides...
Concatenating...
Done!
Shape of test df (2615, 12)


In [None]:
pred_df

Unnamed: 0,prediction_id,rating
0,55_0_updrs_1_plus_0_months,6.0
1,55_3_updrs_1_plus_0_months,14.0
2,55_6_updrs_1_plus_0_months,6.0
3,55_9_updrs_1_plus_0_months,14.0
4,55_12_updrs_1_plus_0_months,6.0
...,...,...
2610,65043_48_updrs_4_plus_24_months,0.0
2611,65043_54_updrs_4_plus_24_months,0.0
2612,65043_60_updrs_4_plus_24_months,0.0
2613,65043_72_updrs_4_plus_24_months,0.0
