In [2]:
import pandas as pd
import numpy as np
from typing import Optional

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
# loading data 
train_peptides = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
train_proteins = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
train_clinincal_data = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
supplemental_clinical_data = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv")

test_peptides = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv")
test_proteins = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv")
test = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv")
sample_submission = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv")

In [4]:
# number of peptides in train set and test set
train_peptides['Peptide'].nunique(), test_peptides['Peptide'].nunique()

(968, 1077)

In [5]:
# number of interesected peptides
intersected_peptides = list(set(test_peptides['Peptide'].unique()).intersection(train_peptides['Peptide'].unique()))
len(intersected_peptides)

943

In [6]:
# number of peptides that don't appear in test set
len(set(train_peptides['Peptide'].unique()).difference(test_peptides['Peptide'].unique()))

25

In [7]:
# number of peptides that don't appear in train set
len(set(test_peptides['Peptide'].unique()).difference(train_peptides['Peptide'].unique()))

134

In [8]:
def replace_missing_values(df: pd.DataFrame, method: Optional[str] = 'mean') -> pd.DataFrame:
    """
    Given a Pandas DataFrame, replaces missing values with either 0, removes them, 
    or replaces them with the mean of the respective column.
    
    Parameters:
    df (pd.DataFrame): Input dataframe to replace missing values.
    method (str, optional): The method to use for replacing missing values. 
        Valid options are 'mean', 'zero' and 'remove'. Defaults to 'mean'.
    
    Returns:
    pd.DataFrame: A DataFrame with the missing values replaced according to the selected method.
    """
    
    # Replace missing values with 0
    if method == "zero":
        df = df.fillna(0)+1e-7
        return df
    
    # Remove rows with missing values
    elif method == "remove":
        df = df.dropna()
        return df
    
    # Replace missing values with the mean of the respective column
    elif method == "mean":
        for col in df.select_dtypes(include=['number']).columns:
            df[col] = df[col].fillna(df[col].mean())
        return df
    
    # Invalid method
    else:
        print("Invalid method. Valid options are 'mean', 'zero' and 'remove'.")
        return None

In [9]:
def preprocess(protiens, peptides, clinical, missing_method="mean"):
    # Merge the proteins data and peptides data on the common columns.
    merged_proteins_peptides = pd.merge(protiens, peptides, on=['visit_id', 'visit_month',
                                                                            'patient_id', 'UniProt'])

    # Merge the merged protein-peptides data with the clinical data on the common columns.
    merged = pd.merge(merged_proteins_peptides, clinical, on=['visit_id', 'visit_month', 'patient_id'])

    pivoted = merged.pivot(index='visit_id', columns=['Peptide'], values='PeptideAbundance')
    
    # Add visit_month, the 4 scores, and medication status.
    df = pd.merge(clinical, pivoted, on='visit_id', how='right').set_index('visit_id')

    df = df.drop('patient_id', axis = 1)

    # just getting the reduced dataframe with interesected peptides
    reduced_df = df[['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4',
       'upd23b_clinical_state_on_medication', 'visit_month'] + intersected_peptides]
    
    # Replace NaN with 0 in the Peptides columns.
    reduced_df.iloc[:, 6:] = reduced_df.iloc[:, 6:].fillna(0)

    # dropping state on medication
    reduced_df.drop("upd23b_clinical_state_on_medication", axis=1, inplace=True)

    clean_df = replace_missing_values(reduced_df, missing_method)

    return clean_df

In [10]:
clean_train = preprocess(train_proteins, train_peptides, train_clinincal_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
clean_train

Unnamed: 0_level_0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,FAEQDAK,SASDLTWDNLK,VFSNGADLSGVTEEAPLK,DLATVYVDVLK,FTFEYSR,...,YVMLPVADQDQC(UniMod_4)IR,IPLNDLFR,KTLLSNLEEAK,QQTHMLDVMQDHFSR,AIGYLNTGYQR,TVAAPSVFIFPPSDEQLK,IGDQWDKQHDMGHMMR,VYC(UniMod_4)DMNTENGGWTVIQNR,YWGVASFLQK,TPSGLYLGTC(UniMod_4)ER
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,3.0,0.0,13.0,0.000000,0,1327970.0,952787.0,427825.0,59944.4,0.0,...,583075.0,0.0,255069.0,138785.0,151638.0,866401.0,6089.82,61108.9,104260.0,0.00
10053_12,4.0,2.0,8.0,0.000000,12,951426.0,920166.0,311696.0,95475.4,0.0,...,355643.0,59474.5,256878.0,45890.8,226364.0,714019.0,16335.70,55446.7,123254.0,0.00
10053_18,2.0,2.0,0.0,0.000000,18,5313140.0,956536.0,380892.0,104368.0,98468.7,...,496021.0,89326.3,237143.0,119681.0,170298.0,704751.0,17132.60,61443.9,128336.0,0.00
10138_12,3.0,6.0,31.0,0.000000,12,3898800.0,1871060.0,309381.0,38374.2,138120.0,...,328482.0,123802.0,279519.0,239084.0,161651.0,1372500.0,27068.30,32784.1,129964.0,11314.30
10138_24,4.0,7.0,19.0,10.000000,24,1936610.0,452374.0,502991.0,136126.0,155238.0,...,496737.0,141612.0,398463.0,257273.0,178278.0,1084070.0,15608.80,87236.2,111799.0,11183.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8699_24,11.0,10.0,13.0,2.000000,24,1768870.0,1010980.0,344542.0,307786.0,132203.0,...,926094.0,108037.0,234280.0,197458.0,150700.0,798587.0,30383.90,42686.4,133682.0,0.00
942_12,5.0,2.0,25.0,0.000000,12,0.0,1265970.0,367979.0,147807.0,94377.7,...,250397.0,96194.3,261125.0,146803.0,164415.0,228645.0,10014.10,18473.8,77976.8,0.00
942_24,2.0,3.0,23.0,1.908612,24,1570310.0,1223220.0,389283.0,173503.0,82209.6,...,479473.0,82761.9,260214.0,118274.0,151694.0,393558.0,10587.90,60487.0,74483.1,6521.82
942_48,2.0,6.0,35.0,0.000000,48,1722200.0,1139280.0,401002.0,205752.0,100145.0,...,231359.0,71072.1,256175.0,140375.0,169616.0,354233.0,12486.50,55017.9,64601.8,7826.98


In [12]:
clean_train.isna().sum().sum()

0

In [13]:
targets_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
X = clean_train.drop(targets_cols, axis=1)
# X.to_csv("X.csv", index=False)

y_updrs1 = clean_train['updrs_1']
y_updrs2 = clean_train['updrs_2']
y_updrs3 = clean_train['updrs_3']
y_updrs4 = clean_train['updrs_4']

In [14]:
def train_model(X, y):
    """training model and returning it"""
#     model = LGBMRegressor()
#     model = CatBoostRegressor(allow_writing_files=False, verbose=0)
    model = XGBRegressor()
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    model.fit(X_scaled, y)

    return model

In [15]:
updrs1_model = train_model(X, y_updrs1)
updrs2_model = train_model(X, y_updrs2)
updrs3_model = train_model(X, y_updrs3)
updrs4_model = train_model(X, y_updrs4)

In [23]:
def get_predictions(test, test_peptides, test_proteins, sample_submission):
    # Merge the proteins data and peptides data on the common columns.
    test_merged_proteins_peptides = pd.merge(test_proteins, test_peptides, on = ['visit_id', 'visit_month', 'patient_id', 'UniProt'])

    # Merge the merged protein-peptides data with the clinical data on the common columns.
    test_merged = pd.merge(test_merged_proteins_peptides, test, on = ['visit_id', 'visit_month', 'patient_id'])
    
    test_merged['prediction_id'] = test_merged.apply(lambda x: '{}_plus_{}months'.format(x['row_id'], x['visit_month']), axis = 1)
    
    test_pivoted = test_merged.pivot(index = 'prediction_id', columns = ['Peptide'], values = 'PeptideAbundance')
    
    test_pivoted = test_pivoted.fillna(0)
    
    # We need to remove duplicates by patient_id.
    index_list = list(test_pivoted.index)
    patient_id = []
    for i in range(len(test_pivoted)):
        element = index_list[i].split('_')[0]
        patient_id.append(element)
    test_pivoted['patient_id'] = patient_id
    
    test_pivoted = test_pivoted.drop_duplicates(subset = 'patient_id')
    patient_id = test_pivoted.pop('patient_id')
    test_pivoted.insert(0, 'patient_id', patient_id)
    
    # We need to crate visit_month in sample submission.
    index_list = list(sample_submission.prediction_id)
    patient_id = []
    for i in range(len(sample_submission)):
        element = index_list[i].split('_')[0]
        patient_id.append(element)
    
    sample_submission['patient_id'] = patient_id
    
    index_list = list(sample_submission.prediction_id)
    updrs = []
    for i in range(len(sample_submission)):
        element = index_list[i].split('_')[3]
        updrs.append(element)
    
    sample_submission['updrs'] = updrs
    
    index_list = list(sample_submission.prediction_id)
    visit_month = []
    for i in range(len(sample_submission)):
        element = index_list[i].split('_')[-2]
        visit_month.append(element)
    
    sample_submission['visit_month'] = visit_month
    
    # Merge the data and sample_submission, while keeping the order of rows of sample_submission.
    test_df = pd.merge(sample_submission, test_pivoted, on = ['patient_id'], how = 'left')
    
    # if peptide column doens't exist make new one and fill it with 0
    not_found_peptides = list(set(intersected_peptides).difference(set(test_peptides['Peptide'].unique())))
    
    for col_pep in not_found_peptides:
        test_df[col_pep] = 0
    
    
    test_df = test_df[['prediction_id', 'visit_month'] + intersected_peptides]
    
    test_df['visit_month'] = test_df['visit_month'].astype(int)
    
    answer =  sample_submission.copy()
    
    for i, row in test_df.iterrows():
        # extracting updrs number 
        updrs_number = row['prediction_id'].split('_')[3]

        x_predict = row.values[1:].reshape(1, -1)    
        if updrs_number == '1':
            predicted_value = int(abs(np.round(updrs1_model.predict(x_predict)[0])))
            answer.loc[i, 'rating'] = predicted_value

        elif updrs_number == '2':
            predicted_value = int(abs(np.round(updrs1_model.predict(x_predict)[0])))
            answer.loc[i, 'rating'] = predicted_value
            
        elif updrs_number == '3':
            predicted_value = int(abs(np.round(updrs1_model.predict(x_predict)[0])))
            answer.loc[i, 'rating'] = predicted_value
            
        else:
            predicted_value = int(abs(np.round(updrs1_model.predict(x_predict)[0])))
            answer.loc[i, 'rating'] = predicted_value
    
    answer = answer.iloc[:, :2]

    
    return answer

In [24]:
get_predictions(test, test_peptides, test_proteins, sample_submission)

int64


Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,7
1,3342_0_updrs_1_plus_6_months,7
2,3342_0_updrs_1_plus_12_months,7
3,3342_0_updrs_1_plus_24_months,7
4,3342_0_updrs_2_plus_0_months,7
...,...,...
59,50423_6_updrs_3_plus_24_months,9
60,50423_6_updrs_4_plus_0_months,9
61,50423_6_updrs_4_plus_6_months,9
62,50423_6_updrs_4_plus_12_months,9


In [18]:
import sys
sys.path.append('/kaggle/input/amp-pd')
import amp_pd_peptide

In [19]:
# Initialize the environment.
env = amp_pd_peptide.make_env()
iter_test = env.iter_test()

In [20]:
# Register the predictions.
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    print(test.head())
    print(test.shape)
    print(test_peptides.shape)
    print(test_peptides.head())
    
    print(test_proteins.shape)
    print(test_proteins.head())
    print(sample_submission.shape)
    
    print("pep", test_peptides['Peptide'].nunique())
    result = get_predictions(test, test_peptides, test_proteins, sample_submission)
    
    if len(sample_submission) == len(result):
        sample_submission['rating'] = result['rating']
        
    elif len(sample_submission) < len(result):
        sample_submission['rating'] = result['rating'][0:len(sample_submission)]
        
    else:
        sample_submission['rating'][0:len(result)] = result['rating']
        
    sample_submission = sample_submission.iloc[:, 0:2]
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
  visit_id  visit_month  patient_id updrs_test           row_id
0   3342_0            0        3342    updrs_1   3342_0_updrs_1
1   3342_0            0        3342    updrs_2   3342_0_updrs_2
2   3342_0            0        3342    updrs_3   3342_0_updrs_3
3   3342_0            0        3342    updrs_4   3342_0_updrs_4
4  50423_0            0       50423    updrs_1  50423_0_updrs_1
(8, 5)
(1021, 6)
  visit_id  visit_month  patient_id UniProt  \
0  50423_0            0       50423  O00391   
1  50423_0            0       50423  O00391   
2  50423_0            0       50423  O00533   
3  50423_0            0       50423  O00533   
4  50423_0            0       50423  O00533   

                                  Peptide  PeptideAbundance  
0                    AHFSPSNIILDFPAAGSAAR           22226.3  
1                           NEQEQPLGQWHLS           10901.6  
2    

KeyError: "['AVGDKLPEC(UniMod_4)EADDGC(UniMod_4)PKPPEIAHGYVEHSVR', 'DVQLVESGGGLVKPGGSLR', 'C(UniMod_4)AEENC(UniMod_4)FIQK', 'TPLPPTSAHGNVAEGETKPDPDVTER', 'EDC(UniMod_4)NELPPRR', 'LAVTTHGLPC(UniMod_4)LAWASAQAK', 'GGETSEMYLIQPDSSVKPYR', 'PALEDLR', 'ETLLQDFR', 'FNKPFVFLM(UniMod_35)IEQNTK', 'TFTLLDPK', 'ALPGTPVASSQPR', 'DSGEGDFLAEGGGVR', 'HSIFTPETNPR', 'IDQTVEELRR', 'AVLPTGDVIGDSAK', 'GYPGVQAPEDLEWER', 'LAARLEALKENGGAR', 'TSLEDFYLDEER', 'RTHLPEVFLSK', 'KIYPTVNC(UniMod_4)QPLGMISLMK', 'ALFLETEQLK', 'INHC(UniMod_4)RFDEFFSEGC(UniMod_4)APGSKK', 'LLEELVR', 'SKVPPPRDFHINLFR', 'C(UniMod_4)FSGQC(UniMod_4)ISK', 'LGQSLDC(UniMod_4)NAEVYVVPWEK', 'LVYPSC(UniMod_4)EEK', 'SEGLLAC(UniMod_4)GTNAR', 'LLRDPADASEAHESSSR', 'C(UniMod_4)C(UniMod_4)ESASEDC(UniMod_4)MAKELPEHTVK', 'HKVYAC(UniMod_4)EVTHQGLSSPVTK', 'KLGQSLDC(UniMod_4)NAEVYVVPWEK', 'RLGMFNIQHC(UniMod_4)K', 'HLSLLTTLSNR', 'STNLHDYGMLLPC(UniMod_4)GIDK', 'GLEFLSVPSTYYK'] not in index"