# FCUL ALS Data Cleaning
---

Exploring the ALS dataset from Faculdade de Ciências da Universidade de Lisboa (FCUL) with the data from over 1000 patients collected in Portugal.

The main goal of this notebook is to prepare a single CSV document that contains all the relevant data to be used when training a machine learning model that predicts disease progression, filtering useless columns and performing imputation.

## Importing the necessary packages

In [1]:
import pandas as pd              # Pandas to handle the data in dataframes
import re                        # re to do regex searches in string data
import plotly                    # Plotly for interactive and pretty plots
import plotly.graph_objs as go
from datetime import datetime    # datetime to use proper date and time formats
import os                        # os handles directory/workspace changes
import numpy as np               # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook   # tqdm allows to track code execution progress
import numbers                   # numbers allows to check if data is numeric
import torch                     # PyTorch to create and apply deep learning models
from torch.utils.data.sampler import SubsetRandomSampler
import data_utils as du          # Data science and machine learning relevant methods

In [2]:
import pixiedust                 # Debugging in Jupyter Notebook cells

Pixiedust database opened successfully


In [3]:
# Change to parent directory (presumably "Documents")
os.chdir("../../..")
# Path to the CSV dataset files
data_path = 'Datasets/Thesis/FCUL_ALS/'

In [4]:
du.set_pandas_library(lib='pandas')

Allow pandas to show more columns:

In [5]:
pd.set_option('display.max_columns', 3000)
pd.set_option('display.max_rows', 3000)

Set the random seed for reproducibility:

In [6]:
du.set_random_seed(42)

## Reading the data

In [7]:
ALS_proc_df = pd.read_csv(f'{data_path}dataWithoutDunnoNIV.csv')
ALS_proc_df.head()

Unnamed: 0,REF,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,ALS-FRS,ALS-FRS-R,ALS-FRSb,ALS-FRSsUL,ALS-FRSsLL,ALS-FRSr,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,SNIP,PhrenMeanLat,PhrenMeanAmpl,CervicalFlex,CervicalExt,NIV,NIV_DATE,firstDate,lastDate,medianDate
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,34.0,42.0,12.0,11.0,7.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,,5.0,5.0,0,04/06/2007,07/11/2006,15/11/2006,07/11/2006
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,34.0,42.0,12.0,11.0,7.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,,,,,,,,,5.0,5.0,0,04/06/2007,04/12/2006,04/12/2006,04/12/2006
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,33.0,41.0,12.0,11.0,6.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,,5.0,5.0,0,04/06/2007,09/01/2007,24/01/2007,09/01/2007
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,28.0,36.0,12.0,7.0,5.0,4.0,12.0,4.0,4.0,4.0,3.0,2.0,2.0,2.0,2.0,1.0,4.0,4,4.0,4.0,64.2,56.2,26.7,26.8,68.8,,,,4.0,5.0,0,04/06/2007,11/05/2007,17/05/2007,11/05/2007
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,13.0,21.0,10.0,0.0,0.0,3.0,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4,3.0,4.0,,,,,,,,,2.0,4.0,1,04/06/2007,03/09/2007,03/09/2007,03/09/2007


## Renaming columns

In [8]:
ALS_proc_df.rename(columns={'REF': 'subject_id'}, inplace=True)
ALS_proc_df.head()

Unnamed: 0,subject_id,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,ALS-FRS,ALS-FRS-R,ALS-FRSb,ALS-FRSsUL,ALS-FRSsLL,ALS-FRSr,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,SNIP,PhrenMeanLat,PhrenMeanAmpl,CervicalFlex,CervicalExt,NIV,NIV_DATE,firstDate,lastDate,medianDate
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,34.0,42.0,12.0,11.0,7.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,,5.0,5.0,0,04/06/2007,07/11/2006,15/11/2006,07/11/2006
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,34.0,42.0,12.0,11.0,7.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,,,,,,,,,5.0,5.0,0,04/06/2007,04/12/2006,04/12/2006,04/12/2006
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,33.0,41.0,12.0,11.0,6.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,,5.0,5.0,0,04/06/2007,09/01/2007,24/01/2007,09/01/2007
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,28.0,36.0,12.0,7.0,5.0,4.0,12.0,4.0,4.0,4.0,3.0,2.0,2.0,2.0,2.0,1.0,4.0,4,4.0,4.0,64.2,56.2,26.7,26.8,68.8,,,,4.0,5.0,0,04/06/2007,11/05/2007,17/05/2007,11/05/2007
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,13.0,21.0,10.0,0.0,0.0,3.0,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4,3.0,4.0,,,,,,,,,2.0,4.0,1,04/06/2007,03/09/2007,03/09/2007,03/09/2007


## Creating a timestamp column

Using `medianDate`, we can define a column that serves as the timestamp, which indicates how many days have gone by since the patient's first data sample.

Convert column `medianDate` to a datetime format:

In [9]:
ALS_proc_df.medianDate = pd.to_datetime(ALS_proc_df.medianDate, format='%d/%m/%Y')
ALS_proc_df.medianDate

0      2006-11-07
1      2006-12-04
2      2007-01-09
3      2007-05-11
4      2007-09-03
          ...    
6183   2019-01-15
6184   2019-01-15
6185   2019-02-11
6186   2019-03-01
6187   2019-03-06
Name: medianDate, Length: 6188, dtype: datetime64[ns]

Get the difference in days between the samples:

In [10]:
ALS_proc_df.medianDate = ALS_proc_df.groupby('subject_id').medianDate.diff()
ALS_proc_df.medianDate

0           NaT
1       27 days
2       36 days
3      122 days
4      115 days
         ...   
6183        NaT
6184        NaT
6185        NaT
6186        NaT
6187        NaT
Name: medianDate, Length: 6188, dtype: timedelta64[ns]

Convert to a numeric format and replace the missing values (which are the first sample in each time series) with 0:

In [11]:
ALS_proc_df.medianDate = ALS_proc_df.medianDate / np.timedelta64(1, 'D')
ALS_proc_df.medianDate = ALS_proc_df.medianDate.fillna(0)
ALS_proc_df.medianDate

0         0.0
1        27.0
2        36.0
3       122.0
4       115.0
        ...  
6183      0.0
6184      0.0
6185      0.0
6186      0.0
6187      0.0
Name: medianDate, Length: 6188, dtype: float64

Rename to `ts`:

In [12]:
ALS_proc_df.rename(columns={'medianDate': 'ts'}, inplace=True)
ALS_proc_df.head()

Unnamed: 0,subject_id,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,ALS-FRS,ALS-FRS-R,ALS-FRSb,ALS-FRSsUL,ALS-FRSsLL,ALS-FRSr,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,SNIP,PhrenMeanLat,PhrenMeanAmpl,CervicalFlex,CervicalExt,NIV,NIV_DATE,firstDate,lastDate,ts
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,34.0,42.0,12.0,11.0,7.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,,5.0,5.0,0,04/06/2007,07/11/2006,15/11/2006,0.0
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,34.0,42.0,12.0,11.0,7.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,,,,,,,,,5.0,5.0,0,04/06/2007,04/12/2006,04/12/2006,27.0
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,33.0,41.0,12.0,11.0,6.0,4.0,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,,5.0,5.0,0,04/06/2007,09/01/2007,24/01/2007,36.0
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,28.0,36.0,12.0,7.0,5.0,4.0,12.0,4.0,4.0,4.0,3.0,2.0,2.0,2.0,2.0,1.0,4.0,4,4.0,4.0,64.2,56.2,26.7,26.8,68.8,,,,4.0,5.0,0,04/06/2007,11/05/2007,17/05/2007,122.0
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,13.0,21.0,10.0,0.0,0.0,3.0,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4,3.0,4.0,,,,,,,,,2.0,4.0,1,04/06/2007,03/09/2007,03/09/2007,115.0


In [13]:
ALS_proc_df.ts.describe()

count    6188.000000
mean      107.685197
std       129.084727
min         0.000000
25%        60.000000
50%        98.000000
75%       127.000000
max      4053.000000
Name: ts, dtype: float64

## Deleting unused columns

Removing kind of useless columns ('NIV_DATE', 'firstDate', 'lastDate'), ones with too many missing values ('SNIP', 'CervicalFlex', 'CervicalExt') and ones that would give away the labels ('ALS-FRS', 'ALS-FRS-R', 'ALS-FRSb', 'ALS-FRSsUL', 'ALS-FRSsLL', 'ALS-FRSr').

In [14]:
ALS_proc_df.columns

Index(['subject_id', 'Gender', 'BMI', 'MND familiar history', 'Age at onset',
       'Disease duration', 'El Escorial reviewed criteria', 'UMN vs LMN',
       'Onset form', 'C9orf72', 'ALS-FRS', 'ALS-FRS-R', 'ALS-FRSb',
       'ALS-FRSsUL', 'ALS-FRSsLL', 'ALS-FRSr', 'R', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', '1R', '2R', '3R', 'VC', 'FVC',
       'MIP', 'MEP', 'P0.1', 'SNIP', 'PhrenMeanLat', 'PhrenMeanAmpl',
       'CervicalFlex', 'CervicalExt', 'NIV', 'NIV_DATE', 'firstDate',
       'lastDate', 'ts'],
      dtype='object')

In [15]:
ALS_proc_df.drop(columns=['NIV_DATE', 'firstDate', 'lastDate', 'SNIP', 
                          'CervicalFlex', 'CervicalExt', 'ALS-FRS',
                          'ALS-FRS-R', 'ALS-FRSb', 'ALS-FRSsUL', 
                          'ALS-FRSsLL', 'ALS-FRSr'], inplace=True)
ALS_proc_df.head()

Unnamed: 0,subject_id,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,ts
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0.0
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4,4.0,4.0,,,,,,,,0,27.0
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,36.0
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,12.0,4.0,4.0,4.0,3.0,2.0,2.0,2.0,2.0,1.0,4.0,4,4.0,4.0,64.2,56.2,26.7,26.8,68.8,,,0,122.0
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4,3.0,4.0,,,,,,,,1,115.0


## Removing patients with only one clinical visit

Since we want to predict the use of NIV in the next clinical visit, it doesn't make any sense to include patients with only one data point.

In [16]:
ALS_proc_df.subject_id.nunique()

1110

In [17]:
ALS_proc_df.groupby('subject_id').ts.count().min()

1

In [18]:
for patient in ALS_proc_df.subject_id.unique():
    # Check if the current patient only has one clinical visit
    if len(ALS_proc_df[ALS_proc_df.subject_id == patient]) == 1:
        # Remove patient's data from the dataframe
        ALS_proc_df = ALS_proc_df[ALS_proc_df.subject_id != patient]

In [19]:
ALS_proc_df.subject_id.nunique()

918

In [20]:
ALS_proc_df.groupby('subject_id').ts.count().min()

2

In [21]:
ALS_proc_df.groupby('subject_id').ts.count().describe()

count    918.000000
mean       6.531590
std        4.521195
min        2.000000
25%        3.000000
50%        5.000000
75%        9.000000
max       27.000000
Name: ts, dtype: float64

## Cleaning categorical columns

Combining redundant values and one hot encoding categorical features.

Making "Gender" a proper one hot encoded column:

In [22]:
ALS_proc_df['Gender'] = ALS_proc_df['Gender'] - 1

Fixing a bug in the `1R` column:

In [23]:
ALS_proc_df['1R'] = ALS_proc_df['1R'].replace(to_replace='\\1', value=1).astype('float64')

In [24]:
du.search_explore.dataframe_missing_values(ALS_proc_df)

Unnamed: 0,column_name,percent_missing
subject_id,subject_id,0.0
NIV,NIV,0.0
C9orf72,C9orf72,0.0
Age at onset,Age at onset,0.0
ts,ts,0.0
Gender,Gender,0.0
Disease duration,Disease duration,0.100067
Onset form,Onset form,0.200133
P1,P1,6.571047
P3,P3,6.604403


One hot encode the remaining categorical columns:

In [25]:
ALS_proc_df = du.data_processing.one_hot_encoding_dataframe(ALS_proc_df,
                                                            columns=['El Escorial reviewed criteria',
                                                                     'Onset form',
                                                                     'UMN vs LMN',
                                                                     'C9orf72'],
                                                            join_rows=True,
                                                            join_by=['subject_id', 'ts'],
                                                            lower_case=True, 
                                                            has_nan=True,
                                                            inplace=True)
ALS_proc_df.head()

Cleaning the categorical columns...


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Getting dummies...
Done!


Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,El Escorial reviewed criteria_def,El Escorial reviewed criteria_missing_value,El Escorial reviewed criteria_pbp,El Escorial reviewed criteria_pma,El Escorial reviewed criteria_poss,El Escorial reviewed criteria_pro,El Escorial reviewed criteria_pro_lab_sup,El Escorial reviewed criteria_sus,Onset form_1,Onset form_2,Onset form_3,Onset form_4,Onset form_5,Onset form_ftd,Onset form_missing_value,UMN vs LMN_both,UMN vs LMN_lmn,UMN vs LMN_missing_value,UMN vs LMN_umn,C9orf72_missing_value,C9orf72_no,C9orf72_yes
0,2,0.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
1,2,27.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
2,2,36.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4.0,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
3,2,91.0,0,17.901235,2.0,55.0,5.3,10.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,4.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
4,2,115.0,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0


Reduxing the UMN vs LMN columns into just 2 clear columns:

In [26]:
ALS_proc_df.rename(columns={'UMN vs LMN_lmn': 'LMN',
                            'UMN vs LMN_umn': 'UMN'}, inplace=True)
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,El Escorial reviewed criteria_def,El Escorial reviewed criteria_missing_value,El Escorial reviewed criteria_pbp,El Escorial reviewed criteria_pma,El Escorial reviewed criteria_poss,El Escorial reviewed criteria_pro,El Escorial reviewed criteria_pro_lab_sup,El Escorial reviewed criteria_sus,Onset form_1,Onset form_2,Onset form_3,Onset form_4,Onset form_5,Onset form_ftd,Onset form_missing_value,UMN vs LMN_both,LMN,UMN vs LMN_missing_value,UMN,C9orf72_missing_value,C9orf72_no,C9orf72_yes
0,2,0.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
1,2,27.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
2,2,36.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4.0,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
3,2,91.0,0,17.901235,2.0,55.0,5.3,10.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,4.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0
4,2,115.0,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0


In [27]:
# Activate both UMN and LMN features if the "both" value is 1
ALS_proc_df.LMN = ALS_proc_df.apply(lambda df: 1 if df['UMN vs LMN_both'] == 1 or df['LMN'] == 1 else 0, axis=1)
ALS_proc_df.UMN = ALS_proc_df.apply(lambda df: 1 if df['UMN vs LMN_both'] == 1 or df['UMN'] == 1 else 0, axis=1)

In [28]:
# Drop the "both" column as it's redundant
ALS_proc_df.drop(columns='UMN vs LMN_both', inplace=True)

In [29]:
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,El Escorial reviewed criteria_def,El Escorial reviewed criteria_missing_value,El Escorial reviewed criteria_pbp,El Escorial reviewed criteria_pma,El Escorial reviewed criteria_poss,El Escorial reviewed criteria_pro,El Escorial reviewed criteria_pro_lab_sup,El Escorial reviewed criteria_sus,Onset form_1,Onset form_2,Onset form_3,Onset form_4,Onset form_5,Onset form_ftd,Onset form_missing_value,LMN,UMN vs LMN_missing_value,UMN,C9orf72_missing_value,C9orf72_no,C9orf72_yes
0,2,0.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
1,2,27.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
2,2,36.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4.0,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
3,2,91.0,0,17.901235,2.0,55.0,5.3,10.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,4.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
4,2,115.0,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0


In [30]:
len(ALS_proc_df[(ALS_proc_df.UMN == 1) & (ALS_proc_df.LMN == 1)])

15

**Comment:** The previous length matches the number found on the value counts of the original dataframe, corresponding to the value "both".

Remove the redundant `C9orf72_no` column:

In [31]:
ALS_proc_df.columns

Index(['subject_id', 'ts', 'Gender', 'BMI', 'MND familiar history',
       'Age at onset', 'Disease duration', 'R', 'P1', 'P2', 'P3', 'P4', 'P5',
       'P6', 'P7', 'P8', 'P9', 'P10', '1R', '2R', '3R', 'VC', 'FVC', 'MIP',
       'MEP', 'P0.1', 'PhrenMeanLat', 'PhrenMeanAmpl', 'NIV',
       'El Escorial reviewed criteria_def',
       'El Escorial reviewed criteria_missing_value',
       'El Escorial reviewed criteria_pbp',
       'El Escorial reviewed criteria_pma',
       'El Escorial reviewed criteria_poss',
       'El Escorial reviewed criteria_pro',
       'El Escorial reviewed criteria_pro_lab_sup',
       'El Escorial reviewed criteria_sus', 'Onset form_1', 'Onset form_2',
       'Onset form_3', 'Onset form_4', 'Onset form_5', 'Onset form_ftd',
       'Onset form_missing_value', 'LMN', 'UMN vs LMN_missing_value', 'UMN',
       'C9orf72_missing_value', 'C9orf72_no', 'C9orf72_yes'],
      dtype='object')

In [32]:
ALS_proc_df.drop(columns='C9orf72_no', inplace=True)
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,El Escorial reviewed criteria_def,El Escorial reviewed criteria_missing_value,El Escorial reviewed criteria_pbp,El Escorial reviewed criteria_pma,El Escorial reviewed criteria_poss,El Escorial reviewed criteria_pro,El Escorial reviewed criteria_pro_lab_sup,El Escorial reviewed criteria_sus,Onset form_1,Onset form_2,Onset form_3,Onset form_4,Onset form_5,Onset form_ftd,Onset form_missing_value,LMN,UMN vs LMN_missing_value,UMN,C9orf72_missing_value,C9orf72_yes
0,2,0.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,2,27.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
2,2,36.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4.0,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
3,2,91.0,0,17.901235,2.0,55.0,5.3,10.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,4.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
4,2,115.0,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0


In [33]:
ALS_proc_df.rename(columns={'C9orf72_yes': 'C9orf72'}, inplace=True)
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,1R,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,El Escorial reviewed criteria_def,El Escorial reviewed criteria_missing_value,El Escorial reviewed criteria_pbp,El Escorial reviewed criteria_pma,El Escorial reviewed criteria_poss,El Escorial reviewed criteria_pro,El Escorial reviewed criteria_pro_lab_sup,El Escorial reviewed criteria_sus,Onset form_1,Onset form_2,Onset form_3,Onset form_4,Onset form_5,Onset form_ftd,Onset form_missing_value,LMN,UMN vs LMN_missing_value,UMN,C9orf72_missing_value,C9orf72
0,2,0.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,2,27.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
2,2,36.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4.0,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
3,2,91.0,0,17.901235,2.0,55.0,5.3,10.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,4.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
4,2,115.0,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0


## Standardize all column names to be lower case and without spaces

In [34]:
ALS_proc_df.columns = [col.lower().replace(' ', '_').replace('-', '_') for col in ALS_proc_df.columns]

In [35]:
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,1r,2r,3r,vc,fvc,mip,mep,p0.1,phrenmeanlat,phrenmeanampl,niv,el_escorial_reviewed_criteria_def,el_escorial_reviewed_criteria_missing_value,el_escorial_reviewed_criteria_pbp,el_escorial_reviewed_criteria_pma,el_escorial_reviewed_criteria_poss,el_escorial_reviewed_criteria_pro,el_escorial_reviewed_criteria_pro_lab_sup,el_escorial_reviewed_criteria_sus,onset_form_1,onset_form_2,onset_form_3,onset_form_4,onset_form_5,onset_form_ftd,onset_form_missing_value,lmn,umn_vs_lmn_missing_value,umn,c9orf72_missing_value,c9orf72
0,2,0.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,2,27.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,4.0,4.0,4.0,4.0,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
2,2,36.0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0,1.0,4.0,4.0,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
3,2,91.0,0,17.901235,2.0,55.0,5.3,10.0,2.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2.0,4.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
4,2,115.0,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,3.0,4.0,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0


## NIV label

In order to predict the use of NIV in the next 3 months, we need to create a shifted version of the "niv" column.

In [36]:
ALS_proc_df['niv_label'] = ALS_proc_df['niv']

In [40]:
def set_niv_label_in_row(df, time_window_days=90):
    global ALS_proc_df
    # Get a list of all the timestamps in the current patient's time series
    subject_ts_list = ALS_proc_df[ALS_proc_df.subject_id == df.subject_id].ts
    try:
        # Try to find the timestamp of a sample that is equal or bigger than 
        # the current one + the desired time window
        closest_ts = subject_ts_list[subject_ts_list >= df.ts+time_window_days].iloc[0]
    except IndexError:
        # Just use the data from the subject's last sample if there are no 
        # samples in the desired time window for this subject
        closest_ts = subject_ts_list.iloc[-1]
    # Check if the patient is on NIV in this observed future
    return ALS_proc_df[(ALS_proc_df.subject_id == df.subject_id) & (ALS_proc_df.ts == closest_ts)].niv.item() == 1

In [41]:
ALS_proc_df[['subject_id', 'ts', 'niv', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,niv,niv_label
0,2,0.0,0,0
1,2,27.0,0,0
2,2,36.0,0,0
3,2,91.0,1,1
4,2,115.0,1,1
5,2,122.0,0,0
6,3,0.0,0,0
7,3,34.0,1,1
8,3,92.0,1,1
9,4,0.0,0,0


In [42]:
ALS_proc_df['niv_label'] = ALS_proc_df.apply(set_niv_label_in_row, axis=1)

In [43]:
ALS_proc_df[['subject_id', 'ts', 'niv', 'niv_label']].head(200)

Unnamed: 0,subject_id,ts,niv,niv_label
0,2,0.0,0,True
1,2,27.0,0,False
2,2,36.0,0,False
3,2,91.0,1,False
4,2,115.0,1,False
5,2,122.0,0,False
6,3,0.0,0,True
7,3,34.0,1,True
8,3,92.0,1,True
9,4,0.0,0,False


In [44]:
# Save a version of the dataframe without normalization
ALS_proc_df.to_csv(f'{data_path}cleaned/FCUL_ALS_cleaned_denorm.csv')

In [45]:
ALS_proc_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,5745.0,562.078851,370.227028,2.0,255.0,516.0,888.0,1341.0
ts,5745.0,111.084769,132.078524,0.0,64.0,98.0,130.0,4053.0
gender,5745.0,0.417929,0.493261,0.0,0.0,0.0,1.0,1.0
bmi,5132.0,26.150368,7.014326,14.204545,22.773186,25.181826,27.688778,133.879671
mnd_familiar_history,5203.0,1.991159,0.556254,0.0,2.0,2.0,2.0,10.0
age_at_onset,5745.0,62.74604,18.856769,9.0,54.0,62.0,70.0,360.0
disease_duration,5739.0,20.891295,28.005369,0.13,8.27,12.73,23.53,589.59
r,5221.0,10.556981,3.133196,0.0,9.0,11.0,12.0,43.0
p1,5359.0,2.924053,1.637564,0.0,2.0,3.0,4.0,20.0
p2,5357.0,3.299795,1.440151,0.0,3.0,4.0,4.0,18.0


## Normalizing continuous values

Continuous data is normalized into z-scores, where 0 represents the mean and an absolute value of 1 corresponds to the standard deviation.

In [46]:
ALS_proc_df = du.data_processing.normalize_data(ALS_proc_df, id_columns=['subject_id', 'ts'])
ALS_proc_df.head()

z-score normalizing columns ['bmi', 'mnd_familiar_history', 'age_at_onset', 'disease_duration', 'r', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9', 'p10', '1r', '2r', '3r', 'vc', 'fvc', 'mip', 'mep', 'p0.1', 'phrenmeanlat', 'phrenmeanampl']...


Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,1r,2r,3r,vc,fvc,mip,mep,p0.1,phrenmeanlat,phrenmeanampl,niv,el_escorial_reviewed_criteria_def,el_escorial_reviewed_criteria_missing_value,el_escorial_reviewed_criteria_pbp,el_escorial_reviewed_criteria_pma,el_escorial_reviewed_criteria_poss,el_escorial_reviewed_criteria_pro,el_escorial_reviewed_criteria_pro_lab_sup,el_escorial_reviewed_criteria_sus,onset_form_1,onset_form_2,onset_form_3,onset_form_4,onset_form_5,onset_form_ftd,onset_form_missing_value,lmn,umn_vs_lmn_missing_value,umn,c9orf72_missing_value,c9orf72,niv_label
0,2,0.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,0.15569,0.692551,0.524571,0.25163,0.439201,0.020523,0.126811,0.087589,-0.395301,-1.244218,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,True
1,2,27.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,0.15569,0.692551,0.524571,0.25163,0.439201,,,,,,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
2,2,36.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,-0.502063,0.692551,0.524571,0.25163,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,,,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
3,2,91.0,0,-1.176041,0.015894,-0.410783,-0.556725,-0.177768,-0.564285,0.486203,-0.156554,-1.685238,-1.425549,-1.360546,-1.628482,-0.975574,-1.159815,0.011943,-1.060433,0.25163,0.439201,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
4,2,115.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.141395,0.046378,0.486203,-0.156554,-1.685238,-1.425549,-1.360546,-1.628482,-1.708759,-1.159815,0.011943,0.524571,-0.692709,0.439201,,,,,,,,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False


In [47]:
ALS_proc_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,5745.0,562.0789,370.227028,2.0,255.0,516.0,888.0,1341.0
ts,5745.0,111.0848,132.078524,0.0,64.0,98.0,130.0,4053.0
gender,5745.0,0.4179286,0.493261,0.0,0.0,0.0,1.0,1.0
bmi,5132.0,1.747697e-14,1.0,-1.703061,-0.481469,-0.138081,0.219324,15.358469
mnd_familiar_history,5203.0,-2.157712e-16,1.0,-3.579588,0.015894,0.015894,0.015894,14.397821
age_at_onset,5745.0,1.18733e-16,1.0,-2.850225,-0.463814,-0.039564,0.384687,15.76378
disease_duration,5739.0,-3.664761e-16,1.0,-0.741333,-0.450674,-0.291419,0.094221,20.306775
r,5221.0,-2.653818e-16,1.0,-3.369397,-0.496931,0.141395,0.460558,10.35461
p1,5359.0,7.424966000000001e-17,1.0,-1.785612,-0.564285,0.046378,0.657041,10.427655
p2,5357.0,9.549949e-17,1.0,-2.291284,-0.208169,0.486203,0.486203,10.207406


## Imputation and removal of incomplete data

Starting from a last information carried forward technique, the data is initially forward filled. Next, a backward fill is done, as current data of the patient should still be a good indicator of the recent past. Finally, the remaining missing values are filled with zeroes, as it represents the average value of each given feature.

In [48]:
ALS_proc_df[['subject_id', 'ts', 'r', 'p1', 'p2', 'bmi', 'fvc', 'vc', 'mip', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,r,p1,p2,bmi,fvc,vc,mip,niv_label
0,2,0.0,0.460558,0.657041,0.486203,-1.176041,0.126811,0.020523,0.087589,True
1,2,27.0,0.460558,0.657041,0.486203,-1.176041,,,,False
2,2,36.0,0.460558,0.657041,0.486203,-1.176041,-0.055812,-0.158402,-0.294044,False
3,2,91.0,-0.177768,-0.564285,0.486203,-1.176041,,,,False
4,2,115.0,0.141395,0.046378,0.486203,-1.176041,,,,False
5,2,122.0,0.460558,0.657041,0.486203,-1.176041,-1.072746,-0.76294,-0.967437,False
6,3,0.0,,,,-0.167276,-0.561105,-0.523104,-1.413544,True
7,3,34.0,,0.046378,-2.291284,-0.167276,,,,True
8,3,92.0,,-1.785612,-2.291284,-0.167276,,,,True
9,4,0.0,,,,-0.489198,0.936474,,2.233459,False


In [49]:
ALS_proc_df = du.data_processing.missing_values_imputation(ALS_proc_df, method='zigzag', id_column='subject_id')
ALS_proc_df.head()

Resetting the index...
Searching for boolean features...
Replacing boolean features' missing values with zero...
Forward filling and backward filling missing values...
Replacing remaining missing values with zero...
Done!


Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,1r,2r,3r,vc,fvc,mip,mep,p0.1,phrenmeanlat,phrenmeanampl,niv,el_escorial_reviewed_criteria_def,el_escorial_reviewed_criteria_missing_value,el_escorial_reviewed_criteria_pbp,el_escorial_reviewed_criteria_pma,el_escorial_reviewed_criteria_poss,el_escorial_reviewed_criteria_pro,el_escorial_reviewed_criteria_pro_lab_sup,el_escorial_reviewed_criteria_sus,onset_form_1,onset_form_2,onset_form_3,onset_form_4,onset_form_5,onset_form_ftd,onset_form_missing_value,lmn,umn_vs_lmn_missing_value,umn,c9orf72_missing_value,c9orf72,niv_label
0,2,0.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,0.15569,0.692551,0.524571,0.25163,0.439201,0.020523,0.126811,0.087589,-0.395301,-1.244218,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,True
1,2,27.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,0.15569,0.692551,0.524571,0.25163,0.439201,0.020523,0.126811,0.087589,-0.395301,-1.244218,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
2,2,36.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,-0.502063,0.692551,0.524571,0.25163,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
3,2,91.0,0,-1.176041,0.015894,-0.410783,-0.556725,-0.177768,-0.564285,0.486203,-0.156554,-1.685238,-1.425549,-1.360546,-1.628482,-0.975574,-1.159815,0.011943,-1.060433,0.25163,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,0.0,0.0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
4,2,115.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.141395,0.046378,0.486203,-0.156554,-1.685238,-1.425549,-1.360546,-1.628482,-1.708759,-1.159815,0.011943,0.524571,-0.692709,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,0.0,0.0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False


In [50]:
ALS_proc_df[['subject_id', 'ts', 'r', 'p1', 'p2', 'bmi', 'fvc', 'vc', 'mip', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,r,p1,p2,bmi,fvc,vc,mip,niv_label
0,2,0.0,0.460558,0.657041,0.486203,-1.176041,0.126811,0.020523,0.087589,True
1,2,27.0,0.460558,0.657041,0.486203,-1.176041,0.126811,0.020523,0.087589,False
2,2,36.0,0.460558,0.657041,0.486203,-1.176041,-0.055812,-0.158402,-0.294044,False
3,2,91.0,-0.177768,-0.564285,0.486203,-1.176041,-0.055812,-0.158402,-0.294044,False
4,2,115.0,0.141395,0.046378,0.486203,-1.176041,-0.055812,-0.158402,-0.294044,False
5,2,122.0,0.460558,0.657041,0.486203,-1.176041,-1.072746,-0.76294,-0.967437,False
6,3,0.0,0.0,0.046378,-2.291284,-0.167276,-0.561105,-0.523104,-1.413544,True
7,3,34.0,0.0,0.046378,-2.291284,-0.167276,-0.561105,-0.523104,-1.413544,True
8,3,92.0,0.0,-1.785612,-2.291284,-0.167276,-0.561105,-0.523104,-1.413544,True
9,4,0.0,0.460558,0.046378,0.486203,-0.489198,0.936474,0.0,2.233459,False


## Saving the data

In [51]:
ALS_proc_df.to_csv(f'{data_path}cleaned/FCUL_ALS_cleaned.csv')

In [52]:
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,1r,2r,3r,vc,fvc,mip,mep,p0.1,phrenmeanlat,phrenmeanampl,niv,el_escorial_reviewed_criteria_def,el_escorial_reviewed_criteria_missing_value,el_escorial_reviewed_criteria_pbp,el_escorial_reviewed_criteria_pma,el_escorial_reviewed_criteria_poss,el_escorial_reviewed_criteria_pro,el_escorial_reviewed_criteria_pro_lab_sup,el_escorial_reviewed_criteria_sus,onset_form_1,onset_form_2,onset_form_3,onset_form_4,onset_form_5,onset_form_ftd,onset_form_missing_value,lmn,umn_vs_lmn_missing_value,umn,c9orf72_missing_value,c9orf72,niv_label
0,2,0.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,0.15569,0.692551,0.524571,0.25163,0.439201,0.020523,0.126811,0.087589,-0.395301,-1.244218,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,True
1,2,27.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,0.15569,0.692551,0.524571,0.25163,0.439201,0.020523,0.126811,0.087589,-0.395301,-1.244218,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
2,2,36.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.460558,0.657041,0.486203,0.581703,0.905501,1.157852,0.631186,0.367061,-0.242388,-0.502063,0.692551,0.524571,0.25163,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,0.0,0.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
3,2,91.0,0,-1.176041,0.015894,-0.410783,-0.556725,-0.177768,-0.564285,0.486203,-0.156554,-1.685238,-1.425549,-1.360546,-1.628482,-0.975574,-1.159815,0.011943,-1.060433,0.25163,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,0.0,0.0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False
4,2,115.0,0,-1.176041,0.015894,-0.410783,-0.556725,0.141395,0.046378,0.486203,-0.156554,-1.685238,-1.425549,-1.360546,-1.628482,-1.708759,-1.159815,0.011943,0.524571,-0.692709,0.439201,-0.158402,-0.055812,-0.294044,-0.441416,-0.701165,0.0,0.0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,False


In [53]:
ALS_proc_df.columns

Index(['subject_id', 'ts', 'gender', 'bmi', 'mnd_familiar_history',
       'age_at_onset', 'disease_duration', 'r', 'p1', 'p2', 'p3', 'p4', 'p5',
       'p6', 'p7', 'p8', 'p9', 'p10', '1r', '2r', '3r', 'vc', 'fvc', 'mip',
       'mep', 'p0.1', 'phrenmeanlat', 'phrenmeanampl', 'niv',
       'el_escorial_reviewed_criteria_def',
       'el_escorial_reviewed_criteria_missing_value',
       'el_escorial_reviewed_criteria_pbp',
       'el_escorial_reviewed_criteria_pma',
       'el_escorial_reviewed_criteria_poss',
       'el_escorial_reviewed_criteria_pro',
       'el_escorial_reviewed_criteria_pro_lab_sup',
       'el_escorial_reviewed_criteria_sus', 'onset_form_1', 'onset_form_2',
       'onset_form_3', 'onset_form_4', 'onset_form_5', 'onset_form_ftd',
       'onset_form_missing_value', 'lmn', 'umn_vs_lmn_missing_value', 'umn',
       'c9orf72_missing_value', 'c9orf72', 'niv_label'],
      dtype='object')