In [9]:
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split

LOADING DATA

In [10]:
pep_data = pd.read_csv("train_peptides.csv")
prot_data = pd.read_csv("train_proteins.csv")
clin_data = pd.read_csv("train_clinical_data.csv")

Grouping the data

In [11]:
prot_group = prot_data.groupby(["visit_id", "UniProt"])["NPX"].mean().reset_index()
pep_group = pep_data.groupby(["visit_id", "Peptide"])["PeptideAbundance"].mean().reset_index()

Pivot and Merge DF's

In [12]:
prot_trian = prot_group.pivot(index="visit_id", columns="UniProt", values="NPX").rename_axis(columns=None).reset_index()
pep_train = pep_group.pivot(index="visit_id", columns="Peptide", values="PeptideAbundance").rename_axis(columns=None).reset_index()
prot_pep_train = prot_trian.merge(pep_train, how= "left", on=["visit_id"])
df = prot_pep_train.merge(clin_data[["visit_id", "patient_id", "visit_month", "updrs_2"]], how= "left", on="visit_id")
merged_df = df.dropna(subset=["updrs_2"])
merged_df.drop(columns="visit_id")

Unnamed: 0,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,O60888,...,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK,patient_id,visit_month,updrs_2
0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,129048.0,...,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.30,10053.0,0.0,0.0
1,10464.20,435586.0,,,,,197117.0,15099.1,164268.0,108114.0,...,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.80,10053.0,12.0,2.0
2,13235.70,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,163776.0,...,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.70,10053.0,18.0,2.0
3,12600.20,494581.0,9165.06,27193.5,22506.10,6015.90,156313.0,54546.4,204013.0,56725.0,...,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98,10138.0,12.0,6.0
4,12003.20,522138.0,4498.51,17189.8,29112.40,2665.15,151169.0,52338.1,240892.0,85767.1,...,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09,10138.0,24.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1108,9983.00,400290.0,24240.10,,16943.50,6303.17,77493.6,46435.3,254247.0,138910.0,...,33599.1,926094.0,118897.0,133682.0,571879.0,80268.3,54889.70,8699.0,24.0,10.0
1109,6757.32,360858.0,18367.60,14760.7,18603.40,1722.77,86847.4,37741.3,212132.0,100519.0,...,35767.3,250397.0,65966.9,77976.8,486239.0,45032.7,,942.0,12.0,2.0
1110,,352722.0,22834.90,23393.1,16693.50,1487.91,114772.0,36095.7,185836.0,99183.5,...,64049.8,479473.0,68505.7,74483.1,561398.0,52916.4,21847.60,942.0,24.0,3.0
1111,11627.80,251820.0,22046.50,26360.5,22440.20,2117.43,82241.9,30146.6,167633.0,84875.1,...,28008.8,231359.0,63265.8,64601.8,632782.0,51123.7,20700.30,942.0,48.0,6.0


SPLIT DATA
split data into train en test set and check shapes to confirm 

In [13]:
X = merged_df
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

CHANGE FROM DF TO DATASET
tensorflow model needs tf dataset instead of pd dataframe

In [14]:
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_train, label="updrs_2", task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_test, label="updrs_2", task=tfdf.keras.Task.REGRESSION)
rf = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, verbose=0)
rf.compile(metrics=["mse"])
rf.fit(x=train_ds)

[INFO 24-01-22 15:47:05.9359 CET kernel.cc:1233] Loading model from path /var/folders/ws/dtdg5kqn1dx9rls1zt3mr4c80000gn/T/tmp9xkgt8_g/model/ with prefix cccaf6e1680a4694
[INFO 24-01-22 15:47:06.0903 CET decision_forest.cc:660] Model loaded with 300 root(s), 68836 node(s), and 1197 input feature(s).
[INFO 24-01-22 15:47:06.0904 CET abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-01-22 15:47:06.0908 CET kernel.cc:1061] Use fast generic engine


<keras.src.callbacks.History at 0x2921d0d50>

EVALUATE MODEL

In [15]:
inspector = rf.make_inspector()
inspector.evaluation()
evaluation = rf.evaluate(x=test_ds)

