# Feature Engineering

## Imports

In [2]:
import sys
# sys.path.append("../src")

# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
import missingno as mno
# Viz
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots

from sklearn.preprocessing import StandardScaler

from utils import print_missing_vals
# settings
import warnings
warnings.filterwarnings("ignore")
plt.style.use('ggplot')

In [3]:
# Read the three raw training tables from the relative ../data directory.
clinical = pd.read_csv("../data/train_clinical_data.csv")
proteins = pd.read_csv("../data/train_proteins.csv")
peptides = pd.read_csv("../data/train_peptides.csv")

In [10]:
# Baseline slice: rows with visit_month == 0, reduced to the visit id and updrs_1 score.
is_baseline = clinical["visit_month"] == 0
df_0 = clinical.loc[is_baseline, ["visit_id", "updrs_1"]]
print('Train shape:', df_0.shape)
df_0.head()

Train shape: (248, 2)


Unnamed: 0,visit_id,updrs_1
0,55_0,10.0
13,942_0,3.0
28,1517_0,11.0
38,1923_0,2.0
45,2660_0,2.0


## Peptide Data

### Basic FE

In [7]:
# Per-visit summary statistics (min / max / mean / std) of PeptideAbundance.
abundance_by_visit = peptides.groupby('visit_id')['PeptideAbundance']
peptide_pepAbundance_ft = abundance_by_visit.agg(
    Abe_min='min',
    Abe_max='max',
    Abe_mean='mean',
    Abe_std='std',
).reset_index()
peptide_pepAbundance_ft.head()

Unnamed: 0,visit_id,Abe_min,Abe_max,Abe_mean,Abe_std
0,10053_0,82.9679,66333900.0,726248.393431,3535602.0
1,10053_12,128.446,73059300.0,737183.385744,3799654.0
2,10053_18,108.5,64711200.0,601466.78432,3006568.0
3,10138_12,129.024,71652400.0,699099.199189,3379573.0
4,10138_24,142.648,123897000.0,732120.888877,4912602.0


In [13]:
# Attach the updrs_1 score from df_0 to every peptide measurement; the inner join keeps
# only measurements whose visit_id appears in df_0.
df_peptides = peptides.merge(df_0, on='visit_id', how='inner').reset_index()

# Average updrs_1 per peptide.
# NOTE(review): the column is named `updrs_1_sum` but holds a mean; the name is kept
# because the next cell aggregates on it.
updrs_by_peptide = df_peptides.groupby('Peptide')['updrs_1']
peptides_PeptideAbundance_updrs = updrs_by_peptide.agg(updrs_1_sum='mean').reset_index()
peptides_PeptideAbundance_updrs.head()

Unnamed: 0,Peptide,updrs_1_sum
0,AADDTWEPFASGK,5.357143
1,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,5.296703
2,AAFTEC(UniMod_4)C(UniMod_4)QAADK,5.305699
3,AANEVSSADVK,5.36478
4,AATGEC(UniMod_4)TATVGKR,5.146497


In [14]:
# Give every peptide measurement its per-peptide average updrs_1 (left join keeps all
# rows of `peptides`), then summarise those averages per visit.
df_peptides = peptides.merge(peptides_PeptideAbundance_updrs, on='Peptide', how='left')
peptides_ft = (
    df_peptides.groupby('visit_id')['updrs_1_sum']
    .agg(
        peptides_updrs_1_min='min',
        peptides_updrs_1_max='max',
        peptides_updrs_1_mean='mean',
        peptides_updrs_1_std='std',
    )
    .reset_index()
)
peptides_ft

Unnamed: 0,visit_id,peptides_updrs_1_min,peptides_updrs_1_max,peptides_updrs_1_mean,peptides_updrs_1_std
0,10053_0,4.878788,5.661972,5.279278,0.092880
1,10053_12,4.816794,5.661972,5.277513,0.097712
2,10053_18,4.297619,5.661972,5.265384,0.116303
3,10138_12,4.297619,5.661972,5.253513,0.126117
4,10138_24,4.297619,5.661972,5.257710,0.123452
...,...,...,...,...,...
1108,8699_24,4.572519,5.661972,5.256902,0.123395
1109,942_12,4.572519,5.661972,5.254323,0.118205
1110,942_24,4.572519,5.652174,5.255565,0.117226
1111,942_48,4.572519,5.652174,5.253489,0.119653


## Protein Data

### Basic FE

In [8]:
# Per-visit summary statistics (min / max / mean / std) of the NPX column.
npx_by_visit = proteins.groupby('visit_id')['NPX']
proteins_npx_ft = npx_by_visit.agg(
    NPX_min='min',
    NPX_max='max',
    NPX_mean='mean',
    NPX_std='std',
).reset_index()
proteins_npx_ft.head()

Unnamed: 0,visit_id,NPX_min,NPX_max,NPX_mean,NPX_std
0,10053_0,2497.84,269126000.0,2856580.0,21316300.0
1,10053_12,5800.87,270030000.0,2728871.0,20921620.0
2,10053_18,1334.11,278835000.0,2509967.0,19694530.0
3,10138_12,2520.24,365582000.0,3002583.0,25161700.0
4,10138_24,1436.94,396894000.0,3068891.0,27168060.0


In [11]:
# Attach the updrs_1 score from df_0 to every protein measurement; the inner join keeps
# only measurements whose visit_id appears in df_0.
df_proteins = proteins.merge(df_0, on='visit_id', how='inner').reset_index()

# Average updrs_1 per UniProt id.
# NOTE(review): the column is named `updrs_1_sum` but holds a mean; the name is kept
# because the next cell aggregates on it.
updrs_by_protein = df_proteins.groupby('UniProt')['updrs_1']
proteins_Uniprot_updrs = updrs_by_protein.agg(updrs_1_sum='mean').reset_index()
proteins_Uniprot_updrs.head()

Unnamed: 0,UniProt,updrs_1_sum
0,O00391,4.971014
1,O00533,5.319588
2,O00584,5.286458
3,O14498,5.217877
4,O14773,5.371585


In [12]:
# Give every protein measurement its per-protein average updrs_1 (left join keeps all
# rows of `proteins`), then summarise those averages per visit.
df_proteins = proteins.merge(proteins_Uniprot_updrs, on='UniProt', how='left')
proteins_UniProt_ft = (
    df_proteins.groupby('visit_id')['updrs_1_sum']
    .agg(
        proteins_updrs_1_min='min',
        proteins_updrs_1_max='max',
        proteins_updrs_1_mean='mean',
        proteins_updrs_1_std='std',
    )
    .reset_index()
)
proteins_UniProt_ft.head()

Unnamed: 0,visit_id,proteins_updrs_1_min,proteins_updrs_1_max,proteins_updrs_1_mean,proteins_updrs_1_std
0,10053_0,4.892857,5.601449,5.300548,0.077355
1,10053_12,4.816794,5.652174,5.296073,0.099055
2,10053_18,4.297619,5.652174,5.272617,0.134631
3,10138_12,4.297619,5.652174,5.263118,0.143238
4,10138_24,4.297619,5.652174,5.269522,0.137776


## Overall

### Grouping

In [32]:
# Split the combined table into one CSV file per visit month.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
combined = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_peptide_clinical_outlier.csv')

# Group on a scalar key rather than a one-element list: with a list key, newer pandas
# yields 1-tuples such as (0,) as group names, which would end up in the file name
# ("month_(0,).csv" instead of "month_0.csv").
groups = combined.groupby('visit_month')

# Write each month's rows to its own CSV file
for group_name, group_data in groups:
    filename = f'/home/alfred/Code/Kaggle/AMP-Parkinsons/data/grouped_by_month_prot/month_{group_name}.csv'
    # Optional per-month outlier filter on the UPDRS columns (currently disabled):
    # columns_to_remove_outliers = ['updrs_1','updrs_2','updrs_3','updrs_4']
    # for col in columns_to_remove_outliers:
    #     z_scores = np.abs((group_data[col] - np.mean(group_data[col])) / np.std(group_data[col]))
    #     good_indices = z_scores < 2.5
    #     group_data = group_data[good_indices]
    group_data.to_csv(filename, index=False)

In [66]:
# Split the scaled protein/clinical table into one CSV file per (visit_month, patient_id) pair.
combined = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_clinical_scaled.csv')

# Two grouping keys, so each group name is a (visit_month, patient_id) tuple
groups = combined.groupby(['visit_month', 'patient_id'])

# The output file name is built from both parts of the group name
for group_name, group_data in groups:
    group_data.to_csv(
        f'/home/alfred/Code/Kaggle/AMP-Parkinsons/data/grouped_by_id_and_month_prot/{group_name[0]}_{group_name[1]}.csv',
        index=False,
    )

### Pivoting with Proteins

In [4]:
# Build a wide table: the clinical columns plus one NPX column per UniProt id.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
file1 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/imputed/train_clinical_imputed.csv')
file2 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/train_proteins.csv')

# Merge the dataframes on the visit_id, visit_month and patient_id columns (inner join by default)
merged_df = pd.merge(file1, file2, on=['visit_id', 'visit_month', 'patient_id'])

# Spread UniProt ids into columns holding their NPX value; the clinical columns form the row index
pivoted_df = merged_df.pivot(index=['visit_id', 'visit_month', 'patient_id', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'upd23b_medication_Off','upd23b_medication_On'], columns='UniProt', values='NPX')

pivoted_df = pivoted_df.reset_index()
pivoted_df = pivoted_df.sort_values(by='patient_id')
# pivoted_df.head()

# Impute missing values with the column mean. numeric_only=True explicitly leaves the
# string visit_id column out of the mean; newer pandas raises on it instead of
# silently dropping it.
pivoted_filled_df = pivoted_df.fillna(pivoted_df.mean(numeric_only=True))
# pivoted_df.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/pivoted.csv")

In [17]:
# Print the index range, column count, dtypes and memory usage of the pivoted protein table.
pivoted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068 entries, 0 to 1067
Columns: 236 entries, visit_id to Q9Y6R7
dtypes: float64(231), int64(4), object(1)
memory usage: 1.9+ MB


In [18]:
# Summary statistics before imputation; a `count` below the row total marks columns with missing values.
pivoted_df.describe()

UniProt,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_medication_Off,upd23b_medication_On,O00391,O00533,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
count,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,736.0,1067.0,...,915.0,1031.0,916.0,937.0,1055.0,1068.0,870.0,733.0,1008.0,1000.0
mean,26.744382,32673.328652,6.566479,5.821161,17.344569,1.210674,0.208801,0.214419,11605.680568,509201.3,...,281160.099639,35319.249893,111477.225983,15211.324365,35506.761526,229812.621348,69986.346402,19658.800546,19875.057917,20849.81882
std,22.839426,18612.372744,5.330702,5.931414,14.987692,2.424801,0.406643,0.410611,2815.680292,233683.4,...,135120.685684,18622.313931,24017.506085,4590.724764,12572.302507,102094.316809,26081.467236,6510.064922,5956.448373,10758.041965
min,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,873.778,59718.2,...,9844.57,4704.1,47111.7,4469.08,1571.4,13554.3,3439.81,3950.62,2038.5,3343.2
25%,6.0,16574.0,2.0,1.0,2.0,0.0,0.0,0.0,9722.8825,346044.5,...,197452.0,21624.85,94155.15,12001.1,26776.15,157389.5,54096.9,15011.5,15758.2,13419.625
50%,24.0,29365.0,5.0,4.0,16.0,0.0,0.0,0.0,11482.25,481744.0,...,271653.0,33812.5,109658.5,14956.7,35030.0,213907.5,67491.05,18934.6,19440.9,18925.85
75%,48.0,50611.0,9.0,9.0,28.0,1.0,0.0,0.0,13371.075,644726.0,...,357081.0,47460.0,126551.75,18045.6,42895.15,284162.25,84028.275,23350.3,23115.4,26229.05
max,108.0,65043.0,33.0,29.0,78.0,20.0,1.0,1.0,21361.8,1806980.0,...,893220.0,128338.0,209158.0,39060.6,84868.9,718141.0,226139.0,68511.5,48743.4,122699.0


#### Missing Values of the Pivoted Protein Table

In [23]:
def missing_vals(data: pd.DataFrame, amountMissing: int):
    """Summarise missing values per column, most incomplete columns first.

    :param data: Dataset to find missing data in
    :type data: pd.DataFrame
    :param amountMissing: Maximum number of columns to include in the summary
    :type amountMissing: int
    :return: Frame indexed by column name with ``Total`` (number of nulls) and
        ``Percent`` (fraction of rows that are null, between 0 and 1), sorted by
        ``Total`` in descending order and truncated to ``amountMissing`` rows
    :rtype: pd.DataFrame
    """
    # Compute the null mask once; it feeds both the counts and the fractions.
    null_mask = data.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    # Dividing the already sorted counts keeps both columns in the same order.
    percent = total / len(data)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # Honour the documented limit (the argument was previously ignored).
    return missing_data.head(amountMissing)

In [29]:
# Per-column missing-value summary of the pivoted protein table, saved to CSV.
# 236 equals the column count reported by pivoted_df.info().
# NOTE(review): absolute, machine-specific output path.
missing = missing_vals(pivoted_df, 236)
missing.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/missing.csv")

In [30]:
# Fill missing protein values with the per-column mean.
# numeric_only=True explicitly leaves the string visit_id column out of the mean;
# newer pandas raises on it instead of silently dropping it.
pivoted_filled_df = pivoted_df.fillna(pivoted_df.mean(numeric_only=True))

In [26]:
# Summary statistics after mean imputation; every column should now report the full row count.
pivoted_filled_df.describe()

UniProt,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_medication_Off,upd23b_medication_On,O00391,O00533,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
count,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,...,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0
mean,26.744382,32673.328652,6.566479,5.821161,17.344569,1.210674,0.208801,0.214419,11605.680568,509201.3,...,281160.099639,35319.249893,111477.225983,15211.324365,35506.761526,229812.621348,69986.346402,19658.800546,19875.057917,20849.81882
std,22.839426,18612.372744,5.330702,5.931414,14.987692,2.424801,0.406643,0.410611,2336.925141,233573.8,...,125058.359856,18296.585365,22241.099178,4299.68827,12495.479258,102094.316809,23537.465577,5392.108847,5786.55263,10409.593453
min,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,873.778,59718.2,...,9844.57,4704.1,47111.7,4469.08,1571.4,13554.3,3439.81,3950.62,2038.5,3343.2
25%,6.0,16574.0,2.0,1.0,2.0,0.0,0.0,0.0,10566.65,346579.2,...,211038.25,22014.375,97283.05,12464.175,26967.625,157389.5,57822.2,16819.975,15925.825,13847.925
50%,24.0,29365.0,5.0,4.0,16.0,0.0,0.0,0.0,11605.680568,482036.0,...,281160.099639,34600.65,111477.225983,15211.324365,35184.85,213907.5,69986.346402,19658.800546,19838.55,19846.15
75%,48.0,50611.0,9.0,9.0,28.0,1.0,0.0,0.0,12435.6,644449.5,...,334399.5,46993.25,122562.75,17450.3,42853.45,284162.25,79080.325,21212.325,22860.85,25674.975
max,108.0,65043.0,33.0,29.0,78.0,20.0,1.0,1.0,21361.8,1806980.0,...,893220.0,128338.0,209158.0,39060.6,84868.9,718141.0,226139.0,68511.5,48743.4,122699.0


### Pivoting with Peptides

In [6]:
# Extend the protein/clinical table with one PeptideAbundance column per peptide.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
file1 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_clinical.csv')
file2 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/train_peptides.csv')

# Merge the dataframes on the visit_id, visit_month and patient_id columns (inner join by default)
merged_df = pd.merge(file1, file2, on=['visit_id', 'visit_month', 'patient_id'])

# Every column of the protein/clinical table is part of the pivot's row index. Taking
# the list from file1 replaces a hand-maintained string of column names that had to be
# kept in sync with the protein pivot.
# NOTE(review): assumes train_protein_clinical.csv contains exactly the clinical and
# protein columns (it is written with index=False elsewhere in this notebook) - confirm.
cols = list(file1.columns)

# Pivot the Peptide and PeptideAbundance columns
pivoted_peptide_df = merged_df.pivot(index=cols, columns='Peptide', values='PeptideAbundance')

pivoted_peptide_df = pivoted_peptide_df.reset_index()
pivoted_peptide_df = pivoted_peptide_df.sort_values(by=['patient_id', 'visit_month'])
# Mean-impute missing values. numeric_only=True explicitly leaves the string visit_id
# column out of the mean; newer pandas raises on it instead of silently dropping it.
pivoted_peptide_df = pivoted_peptide_df.fillna(pivoted_peptide_df.mean(numeric_only=True))
pivoted_peptide_df.head()

Peptide,visit_id,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_medication_Off,upd23b_medication_On,O00391,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
814,55_0,0,55,10.0,6.0,15.0,1.0,0,0,11254.3,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
817,55_6,6,55,8.0,10.0,34.0,1.0,0,0,13163.6,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
815,55_12,12,55,10.0,10.0,41.0,0.0,0,1,15257.6,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
816,55_36,36,55,17.0,18.0,51.0,0.0,0,1,13530.8,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
1067,942_6,6,942,8.0,2.0,21.0,0.0,0,0,11218.7,...,226314.0,6399.8,3938239.0,57571.4,480951.0,80001.2,79661.9,573300.0,48005.8,15674.1


In [41]:
# Summary statistics of the combined clinical + protein + peptide table.
pivoted_peptide_df.describe()

Peptide,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_medication_Off,upd23b_medication_On,O00391,O00533,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
count,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,...,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0,1068.0
mean,26.744382,32673.328652,6.566479,5.821161,17.344569,1.210674,0.208801,0.214419,11605.680568,509201.3,...,215475.0709,8996.806715,3938239.0,68394.594621,613073.2,92668.876265,125754.068949,472235.953491,46987.825294,21042.291924
std,22.839426,18612.372744,5.330702,5.931414,14.987692,2.424801,0.406643,0.410611,2336.925141,233573.8,...,50313.883464,2961.030625,1620814.0,57816.007323,494574.1,30091.801489,38243.867433,130017.630872,13069.455974,9105.018018
min,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,873.778,59718.2,...,12164.3,258.249,162464.0,884.26,7494.66,869.898,991.452,11371.2,6362.49,868.903
25%,6.0,16574.0,2.0,1.0,2.0,0.0,0.0,0.0,10566.65,346579.2,...,187394.0,7087.5875,2941620.0,34201.85,294923.0,70517.575,100952.0,385090.25,38471.475,16230.625
50%,24.0,29365.0,5.0,4.0,16.0,0.0,0.0,0.0,11605.680568,482036.0,...,216340.0,8996.806715,3872205.0,59685.7,484596.5,88893.05,124152.0,464453.0,46863.3,21042.291924
75%,48.0,50611.0,9.0,9.0,28.0,1.0,0.0,0.0,12435.6,644449.5,...,246187.5,10330.775,4631290.0,80915.175,759108.5,110132.5,149156.5,548241.25,53433.65,24460.275
max,108.0,65043.0,33.0,29.0,78.0,20.0,1.0,1.0,21361.8,1806980.0,...,409939.0,27670.5,13855500.0,712856.0,3984710.0,251526.0,264224.0,948416.0,107220.0,70020.8


In [42]:
# Print the index, column count, dtypes and memory usage of the combined table.
pivoted_peptide_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1068 entries, 817 to 1025
Columns: 1204 entries, visit_id to YYWGGQYTWDMAK
dtypes: float64(1199), int64(4), object(1)
memory usage: 9.8+ MB


### Standardize Values

In [7]:
# Standardise every column except the identifiers, UPDRS scores and medication flags.
cols_to_remove = ['visit_id','visit_month','patient_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_medication_Off','upd23b_medication_On']
cols_to_standardize = [col for col in pivoted_peptide_df.columns if col not in cols_to_remove]

# Fit the scaler and apply it to the same columns in a single step
scaler = StandardScaler()
pivoted_peptide_df[cols_to_standardize] = scaler.fit_transform(pivoted_peptide_df[cols_to_standardize])
pivoted_peptide_df.head()

Peptide,visit_id,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_medication_Off,upd23b_medication_On,O00391,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
814,55_0,0,55,10.0,6.0,15.0,1.0,0,0,-0.150431,...,-0.284688,2.532566,-0.07899077,0.666207,-0.065554,1.279556,1.048945,-0.268789,-0.05348,-0.675097
817,55_6,6,55,8.0,10.0,34.0,1.0,0,0,0.666966,...,-0.882796,1.419763,0.1118978,0.77853,-0.198672,0.360504,0.493197,-0.110383,-0.531267,-0.037183
815,55_12,12,55,10.0,10.0,41.0,0.0,0,1,1.563435,...,0.324057,2.999345,0.9480547,0.82873,0.199744,1.471991,1.465207,-0.153766,0.592289,0.087774
816,55_36,36,55,17.0,18.0,51.0,0.0,0,1,0.824169,...,-0.600216,3.238124,-0.7892198,0.39008,0.133692,1.194377,2.038561,0.203029,0.444364,-0.776704
1067,942_6,6,942,8.0,2.0,21.0,0.0,0,0,-0.165671,...,0.215527,-0.877473,-2.874355e-16,-0.187288,-0.267268,-0.421165,-1.205782,0.777674,0.077926,-0.589862


In [67]:
# Standardise every column except the identifiers, UPDRS scores and medication flags.
cols_to_remove = ['visit_id','visit_month','patient_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_medication_Off','upd23b_medication_On']
cols_to_standardize = [col for col in pivoted_filled_df.columns if col not in cols_to_remove]

# Fit the scaler and apply it to the same columns in a single step
scaler = StandardScaler()
pivoted_filled_df[cols_to_standardize] = scaler.fit_transform(pivoted_filled_df[cols_to_standardize])
pivoted_filled_df.head()

UniProt,visit_id,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_medication_Off,upd23b_medication_On,O00391,...,Q9HDC9,Q9NQ79,Q9NYU2,Q9UBR2,Q9UBX5,Q9UHG2,Q9UKV8,Q9UNU6,Q9Y646,Q9Y6R7
817,55_6,6,55,8.0,10.0,34.0,1.0,0,0,0.666966,...,0.996129,-0.272673,-0.103606,1.928595,1.292996,1.372486,0.0,0.607913,-0.372167,-0.404347
816,55_36,36,55,17.0,18.0,51.0,0.0,0,1,0.824169,...,0.179495,0.703693,-0.075716,2.028836,2.779646,1.447746,0.212091,2.24021,0.399638,0.083355
815,55_12,12,55,10.0,10.0,41.0,0.0,0,1,1.563435,...,0.182343,0.421421,0.154911,1.547063,2.089048,0.869607,-0.179532,1.769079,1.497455,-0.149828
814,55_0,0,55,10.0,6.0,15.0,1.0,0,0,-0.150431,...,0.67452,0.011415,-0.650975,1.840804,2.034155,1.752979,0.0,1.873985,0.684431,-0.182256
1067,942_6,6,942,8.0,2.0,21.0,0.0,0,0,-0.165671,...,-0.222297,-0.431291,-0.795321,0.521113,-1.137557,0.75268,0.524905,0.808949,-0.161754,-0.538085


### Remove Outliers

In [31]:
# Drop rows whose UPDRS score lies 3 or more standard deviations from the column mean.
# The columns are filtered one after another, so each mean and standard deviation is
# computed on the rows that survived the previous column's filter.
columns_to_remove_outliers = ['updrs_1','updrs_2','updrs_3','updrs_4']

for col in columns_to_remove_outliers:
    scores = pivoted_peptide_df[col]
    # ddof=0 gives the population standard deviation
    z_scores = ((scores - scores.mean()) / scores.std(ddof=0)).abs()
    good_indices = z_scores < 3
    pivoted_peptide_df = pivoted_peptide_df.loc[good_indices]

### Ending

#### All feature engineering

In [30]:
# End-to-end rebuild of the engineered table:
# protein pivot -> mean imputation -> peptide pivot -> mean imputation -> standardisation.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.

# --- Proteins ---------------------------------------------------------------
file1 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/imputed/train_clinical_imputed.csv')
file2 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/train_proteins.csv')

# Merge the dataframes on the visit_id, visit_month and patient_id columns (inner join by default)
merged_df = pd.merge(file1, file2, on=['visit_id', 'visit_month', 'patient_id'])

# Spread UniProt ids into columns holding their NPX value; the clinical columns form the row index
pivoted_df = merged_df.pivot(index=['visit_id', 'visit_month', 'patient_id', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'upd23b_medication_Off','upd23b_medication_On'], columns='UniProt', values='NPX')

pivoted_df = pivoted_df.reset_index()
pivoted_df = pivoted_df.sort_values(by='patient_id')
# pivoted_df.head()

# Mean-impute missing values. numeric_only=True explicitly leaves the string visit_id
# column out of the mean; newer pandas raises on it instead of silently dropping it.
pivoted_filled_df = pivoted_df.fillna(pivoted_df.mean(numeric_only=True))


# --- Peptides ---------------------------------------------------------------
file1 = pivoted_filled_df
file2 = pd.read_csv('/home/alfred/Code/Kaggle/AMP-Parkinsons/data/train_peptides.csv')

# Merge the dataframes on the visit_id, visit_month and patient_id columns (inner join by default)
merged_df = pd.merge(file1, file2, on=['visit_id', 'visit_month', 'patient_id'])

# Every column of the protein/clinical table is part of the pivot's row index. Taking
# the list from that table replaces a hand-maintained string of column names that had
# to be kept in sync with the protein pivot above.
cols = list(pivoted_filled_df.columns)

# Pivot the Peptide and PeptideAbundance columns
pivoted_peptide_df = merged_df.pivot(index=cols, columns='Peptide', values='PeptideAbundance')

pivoted_peptide_df = pivoted_peptide_df.reset_index()
pivoted_peptide_df = pivoted_peptide_df.sort_values(by=['patient_id', 'visit_month'])
pivoted_peptide_df = pivoted_peptide_df.fillna(pivoted_peptide_df.mean(numeric_only=True))


# --- Standardisation --------------------------------------------------------
# Scale every column except the identifiers, UPDRS scores and medication flags.
cols_to_standardize = list(pivoted_peptide_df.columns)

cols_to_remove = ['visit_id','visit_month','patient_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_medication_Off','upd23b_medication_On']

cols_to_standardize = [x for x in cols_to_standardize if x not in cols_to_remove]

# Fit the scaler and apply it to the same columns in a single step
scaler = StandardScaler()
pivoted_peptide_df[cols_to_standardize] = scaler.fit_transform(pivoted_peptide_df[cols_to_standardize])

In [24]:
# Print the index, column count, dtypes and memory usage of the final table.
# NOTE(review): the stored output shows 967 rows against 1068 earlier in the notebook, so
# this cell appears to have been run after the outlier filter - confirm the execution order.
pivoted_peptide_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967 entries, 814 to 1026
Columns: 1204 entries, visit_id to YYWGGQYTWDMAK
dtypes: float64(1199), int64(4), object(1)
memory usage: 8.9+ MB


#### To CSV

In [5]:
# Save the mean-imputed protein + clinical table without the index column.
# NOTE(review): `pivoted_filled_df` is also standardised in place by another cell, so the
# content written here depends on which cells ran beforehand - confirm the execution order.
pivoted_filled_df.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_clinical.csv", index=False)

In [68]:
# Save `pivoted_filled_df` under the "_scaled" file name without the index column.
# NOTE(review): presumably meant to run after the standardisation cell for
# `pivoted_filled_df`; nothing in this cell enforces that - confirm.
pivoted_filled_df.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_clinical_scaled.csv", index=False)

In [10]:
# Save the combined clinical + protein + peptide table without the index column.
# NOTE(review): the same variable is written to three differently named files in this
# section; which processing state ends up in this one depends on the execution order - confirm.
pivoted_peptide_df.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_peptide_clinical.csv", index=False)

In [9]:
# Save `pivoted_peptide_df` under the "_scaled" file name without the index column.
# NOTE(review): presumably meant to run after the standardisation step; nothing in this
# cell enforces that - confirm.
pivoted_peptide_df.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_peptide_clinical_scaled.csv", index=False)

In [25]:
# Save `pivoted_peptide_df` under the "_outlier" file name without the index column.
# NOTE(review): presumably meant to run after the outlier-removal cell; nothing in this
# cell enforces that - confirm.
pivoted_peptide_df.to_csv("/home/alfred/Code/Kaggle/AMP-Parkinsons/data/engineered/train_protein_peptide_clinical_outlier.csv", index=False)