# FCUL ALS Data Cleaning
---

Exploring the ALS dataset from Faculdade de Ciências da Universidade de Lisboa (FCUL) with the data from over 1000 patients collected in Portugal.

The main goal of this notebook is to prepare a single CSV document that contains all the relevant data to be used when training a machine learning model that predicts disease progression, filtering useless columns and performing imputation.

## Importing the necessary packages

In [1]:
import pandas as pd              # Pandas to handle the data in dataframes
import re                        # re to do regex searches in string data
import plotly                    # Plotly for interactive and pretty plots
import plotly.graph_objs as go
from datetime import datetime    # datetime to use proper date and time formats
import os                        # os handles directory/workspace changes
import numpy as np               # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook   # tqdm allows to track code execution progress
import numbers                   # numbers allows to check if data is numeric
import torch                     # PyTorch to create and apply deep learning models
from torch.utils.data.sampler import SubsetRandomSampler
import utils                     # Contains auxiliary functions

In [2]:
# Change to parent directory (presumably "Documents")
# NOTE: the path is relative, so every re-run of this cell moves two more
# levels up; run it exactly once per kernel session.
os.chdir("../..")

# Path to the CSV dataset files (relative to the working directory set above)
data_path = 'Datasets/Thesis/FCUL_ALS/'

**Important:** Use the following two lines to be able to do plotly plots offline:

In [3]:
# Switch plotly to offline (notebook embedded) rendering
import plotly.offline as py
py.init_notebook_mode(connected=True)

**Important:** The following function is needed in every Google Colab cell that contains a Plotly chart:

In [4]:
def configure_plotly_browser_state():
    """Emit the RequireJS configuration that Plotly charts need in Google Colab.

    Displays an HTML snippet that loads RequireJS and registers the Plotly
    bundle from the Plotly CDN. Call it at the top of every Colab cell that
    renders a Plotly chart.

    Returns
    -------
    None
        The function only has the side effect of displaying HTML output.
    """
    # Import from the public IPython.display API instead of relying on the
    # `display` name that the notebook kernel injects into the global namespace.
    from IPython.display import HTML, display

    display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

## Reading the data

In [5]:
# Load the raw FCUL ALS dataset from the configured data folder
raw_csv_path = os.path.join(data_path, 'dataWithoutDunnoNIV.csv')
ALS_proc_df = pd.read_csv(raw_csv_path)
ALS_proc_df.head()

Unnamed: 0,REF,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,...,SNIP,PhrenMeanLat,PhrenMeanAmpl,CervicalFlex,CervicalExt,NIV,NIV_DATE,firstDate,lastDate,medianDate
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,5.0,5.0,0,04/06/2007,07/11/2006,15/11/2006,07/11/2006
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,5.0,5.0,0,04/06/2007,04/12/2006,04/12/2006,04/12/2006
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,5.0,5.0,0,04/06/2007,09/01/2007,24/01/2007,09/01/2007
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,4.0,5.0,0,04/06/2007,11/05/2007,17/05/2007,11/05/2007
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,2.0,4.0,1,04/06/2007,03/09/2007,03/09/2007,03/09/2007


## Renaming columns

In [6]:
# Use "subject_id" as the name of the patient identifier column
ALS_proc_df = ALS_proc_df.rename(columns={'REF': 'subject_id'})
ALS_proc_df.head()

Unnamed: 0,subject_id,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,...,SNIP,PhrenMeanLat,PhrenMeanAmpl,CervicalFlex,CervicalExt,NIV,NIV_DATE,firstDate,lastDate,medianDate
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,5.0,5.0,0,04/06/2007,07/11/2006,15/11/2006,07/11/2006
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,5.0,5.0,0,04/06/2007,04/12/2006,04/12/2006,04/12/2006
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,5.0,5.0,0,04/06/2007,09/01/2007,24/01/2007,09/01/2007
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,4.0,5.0,0,04/06/2007,11/05/2007,17/05/2007,11/05/2007
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,,,,2.0,4.0,1,04/06/2007,03/09/2007,03/09/2007,03/09/2007


## Deleting unused columns

Removing columns that are not useful as model inputs ('NIV_DATE', 'firstDate', 'lastDate', 'medianDate'), ones with too many missing values ('SNIP', 'CervicalFlex', 'CervicalExt') and ones that would give away the labels ('ALS-FRS', 'ALS-FRS-R', 'ALS-FRSb', 'ALS-FRSsUL', 'ALS-FRSsLL', 'ALS-FRSr').

In [7]:
ALS_proc_df.columns

Index(['subject_id', 'Gender', 'BMI', 'MND familiar history', 'Age at onset',
       'Disease duration', 'El Escorial reviewed criteria', 'UMN vs LMN',
       'Onset form', 'C9orf72', 'ALS-FRS', 'ALS-FRS-R', 'ALS-FRSb',
       'ALS-FRSsUL', 'ALS-FRSsLL', 'ALS-FRSr', 'R', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', '1R', '2R', '3R', 'VC', 'FVC',
       'MIP', 'MEP', 'P0.1', 'SNIP', 'PhrenMeanLat', 'PhrenMeanAmpl',
       'CervicalFlex', 'CervicalExt', 'NIV', 'NIV_DATE', 'firstDate',
       'lastDate', 'medianDate'],
      dtype='object')

In [8]:
# Columns to discard, grouped by the reason for removing them
date_columns = ['NIV_DATE', 'firstDate', 'lastDate', 'medianDate']
sparse_columns = ['SNIP', 'CervicalFlex', 'CervicalExt']
label_leaking_columns = ['ALS-FRS', 'ALS-FRS-R', 'ALS-FRSb',
                         'ALS-FRSsUL', 'ALS-FRSsLL', 'ALS-FRSr']

ALS_proc_df = ALS_proc_df.drop(columns=date_columns + sparse_columns + label_leaking_columns)
ALS_proc_df.head()

Unnamed: 0,subject_id,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,...,2R,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,4.0,84.78,88.32,59.1,51.28,36.21,,,0
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,4.0,,,,,,,,0
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,4.0,80.08,83.43,47.38,49.8,63.36,,,0
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,4.0,64.2,56.2,26.7,26.8,68.8,,,0
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,3.0,4.0,,,,,,,,1


## Getting discrete timestamps

Creating an index for each patient that serves as a discrete timestamp, starting at 0 in their first clinical visit and ending at the length of their time series minus one.

In [9]:
# Number each patient's rows 0, 1, 2, ... in their current order.
# NOTE(review): this presumes the rows of every patient are already sorted
# chronologically; the date columns were dropped earlier, so confirm upstream.
visit_index = ALS_proc_df.groupby('subject_id').cumcount()
ALS_proc_df = ALS_proc_df.assign(ts=visit_index)
ALS_proc_df.head(10)

Unnamed: 0,subject_id,Gender,BMI,MND familiar history,Age at onset,Disease duration,El Escorial reviewed criteria,UMN vs LMN,Onset form,C9orf72,...,3R,VC,FVC,MIP,MEP,P0.1,PhrenMeanLat,PhrenMeanAmpl,NIV,ts
0,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,84.78,88.32,59.1,51.28,36.21,,,0,0
1,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,,,,,,,,0,1
2,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,80.08,83.43,47.38,49.8,63.36,,,0,2
3,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,64.2,56.2,26.7,26.8,68.8,,,0,3
4,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,,,,,,,,1,4
5,2,1,17.901235,2.0,55.0,5.3,,,1,Unknown,...,4.0,,,,,,,,1,5
6,3,1,24.977043,2.0,73.0,5.13,poss,,2,Unknown,...,,70.5,69.9,13.0,7.4,32.4,9.0,0.25,0,0
7,3,1,24.977043,2.0,73.0,5.13,poss,,2,Unknown,...,,,,,,,,,1,1
8,3,1,24.977043,2.0,73.0,5.13,poss,,2,Unknown,...,,,,,,,9.9,0.15,1,2
9,4,1,22.718974,2.0,65.0,9.0,def,,1,Unknown,...,,,110.0,125.0,175.0,,,,0,0


## Removing patients with only one clinical visit

Since we want to predict the use of NIV in the next clinical visit, it doesn't make any sense to include patients with only one data point.

In [10]:
ALS_proc_df.subject_id.nunique()

1110

In [11]:
ALS_proc_df.groupby('subject_id').ts.count().min()

1

In [12]:
# Number of rows (clinical visits) recorded for the patient of each row
visits_per_patient = ALS_proc_df.groupby('subject_id')['subject_id'].transform('size')

# Keep only the rows of patients with more than one clinical visit. A single
# vectorized mask replaces the per-patient loop, which re-filtered the whole
# dataframe once for every patient.
ALS_proc_df = ALS_proc_df[visits_per_patient > 1]

In [13]:
ALS_proc_df.subject_id.nunique()

918

In [14]:
ALS_proc_df.groupby('subject_id').ts.count().min()

2

## Cleaning categorical columns

Combining redundant values and one hot encoding categorical features.

In [15]:
# Shift "Gender" down by one so that it becomes a binary 0/1 column
# (this is a single binary indicator, not a one hot encoding).
# NOTE: not idempotent - re-running the cell subtracts one again.
ALS_proc_df['Gender'] = ALS_proc_df['Gender'].sub(1)

In [16]:
# Fixing a bug in the "1R" column: entries equal to the literal string '\1'
# (a backslash followed by the digit 1) are replaced by the number 1, and the
# whole column is then cast to float64.
ALS_proc_df['1R'] = ALS_proc_df['1R'].replace(to_replace='\\1', value=1).astype('float64')

In [17]:
# One hot encode the categorical features with the project helper.
# NOTE(review): the helper lives in `utils` and is not visible here. Judging by
# the displayed result it appears to create one "<column>_<value>" indicator
# per category (including a "_nan" indicator for missing values) - confirm
# against the helper's implementation.
ALS_proc_df = utils.one_hot_encoding_dataframe(ALS_proc_df, columns=['El Escorial reviewed criteria',
                                                                     'Onset form',
                                                                     'UMN vs LMN',
                                                                     'C9orf72'])
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,...,Onset form_5,Onset form_ftd,Onset form_nan,UMN vs LMN_both,UMN vs LMN_lmn,UMN vs LMN_nan,UMN vs LMN_umn,C9orf72_no,C9orf72_unknown,C9orf72_yes
0,2,0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
1,2,1,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
2,2,2,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
3,2,3,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
4,2,4,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,...,0,0,0,0,0,1,0,0,1,0


Reducing the UMN vs LMN columns into two clear indicator columns (LMN and UMN), plus one indicator for unknown values:

In [18]:
# Shorter, clearer names for the one hot encoded "UMN vs LMN" columns
umn_lmn_names = {
    'UMN vs LMN_lmn': 'LMN',
    'UMN vs LMN_umn': 'UMN',
    'UMN vs LMN_nan': 'UMN_vs_LMN_unknown',
}
ALS_proc_df = ALS_proc_df.rename(columns=umn_lmn_names)
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,...,Onset form_5,Onset form_ftd,Onset form_nan,UMN vs LMN_both,LMN,UMN_vs_LMN_unknown,UMN,C9orf72_no,C9orf72_unknown,C9orf72_yes
0,2,0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
1,2,1,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
2,2,2,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
3,2,3,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
4,2,4,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,...,0,0,0,0,0,1,0,0,1,0


In [19]:
# Activate both UMN and LMN features if the "both" value is 1.
# Vectorized boolean OR instead of a row-wise `apply`, producing the same
# 0/1 integer columns without a Python-level loop over the rows.
has_both = ALS_proc_df['UMN vs LMN_both'] == 1
ALS_proc_df['LMN'] = (has_both | (ALS_proc_df['LMN'] == 1)).astype('int64')
ALS_proc_df['UMN'] = (has_both | (ALS_proc_df['UMN'] == 1)).astype('int64')

In [20]:
# The "both" indicator is now encoded by LMN == 1 and UMN == 1, so remove it
ALS_proc_df = ALS_proc_df.drop(columns=['UMN vs LMN_both'])

In [21]:
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,Gender,BMI,MND familiar history,Age at onset,Disease duration,R,P1,P2,...,Onset form_4,Onset form_5,Onset form_ftd,Onset form_nan,LMN,UMN_vs_LMN_unknown,UMN,C9orf72_no,C9orf72_unknown,C9orf72_yes
0,2,0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
1,2,1,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
2,2,2,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
3,2,3,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
4,2,4,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,...,0,0,0,0,0,1,0,0,1,0


In [22]:
len(ALS_proc_df[(ALS_proc_df.UMN == 1) & (ALS_proc_df.LMN == 1)])

17

**Comment:** The previous length matches the number found on the value counts of the original dataframe, corresponding to the value "both".

## Standardize all column names to be lower case and without spaces

In [23]:
# Lower-case every column name and turn spaces and hyphens into underscores
ALS_proc_df.columns = [re.sub(r'[ -]', '_', name.lower()) for name in ALS_proc_df.columns]

In [24]:
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,...,onset_form_4,onset_form_5,onset_form_ftd,onset_form_nan,lmn,umn_vs_lmn_unknown,umn,c9orf72_no,c9orf72_unknown,c9orf72_yes
0,2,0,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
1,2,1,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
2,2,2,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
3,2,3,0,17.901235,2.0,55.0,5.3,12.0,4.0,4.0,...,0,0,0,0,0,1,0,0,1,0
4,2,4,0,17.901235,2.0,55.0,5.3,11.0,3.0,4.0,...,0,0,0,0,0,1,0,0,1,0


## NIV label

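In order to predict the use of NIV at the next clinical visit (the prediction horizon referred to here as the next 3 months), we need to create a version of the "niv" column that is shifted by one time step within each patient.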

In [25]:
# Initialise the label column with the NIV status of the same visit; it is
# shifted within each patient in a later cell.
ALS_proc_df['niv_label'] = ALS_proc_df['niv']

In [26]:
ALS_proc_df[['subject_id', 'ts', 'niv', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,niv,niv_label
0,2,0,0,0
1,2,1,0,0
2,2,2,0,0
3,2,3,0,0
4,2,4,1,1
5,2,5,1,1
6,3,0,0,0
7,3,1,1,1
8,3,2,1,1
9,4,0,0,0


In [27]:
# Label of each visit = NIV status at the same patient's next visit; the last
# visit of every patient has no next visit and therefore gets NaN.
# The shift is computed from the untouched "niv" column rather than from
# "niv_label" itself, so re-running this cell does not shift the labels again.
ALS_proc_df['niv_label'] = ALS_proc_df.groupby('subject_id')['niv'].shift(-1)

In [28]:
ALS_proc_df[['subject_id', 'ts', 'niv', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,niv,niv_label
0,2,0,0,0.0
1,2,1,0,0.0
2,2,2,0,0.0
3,2,3,0,1.0
4,2,4,1,1.0
5,2,5,1,
6,3,0,0,1.0
7,3,1,1,1.0
8,3,2,1,
9,4,0,0,0.0


In [29]:
# Save a version of the dataframe without normalization.
# Create the output folder first so the export does not fail on a fresh
# checkout where "cleaned/" does not exist yet.
os.makedirs(f'{data_path}cleaned/', exist_ok=True)
# The dataframe index is written as the first (unnamed) column of the CSV.
ALS_proc_df.to_csv(f'{data_path}cleaned/FCUL_ALS_cleaned_denorm.csv')

In [30]:
ALS_proc_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,5996.0,559.863909,369.094962,2.0,253.0,514.0,885.0,1341.0
ts,5996.0,4.328886,4.266936,0.0,1.0,3.0,6.0,26.0
gender,5996.0,0.418112,0.49329,0.0,0.0,0.0,1.0,1.0
bmi,5354.0,25.066061,3.677668,14.204545,22.679952,24.964946,27.34375,39.68254
mnd_familiar_history,5431.0,1.907568,0.296573,0.0,2.0,2.0,2.0,2.0
age_at_onset,5996.0,60.119413,12.652924,9.0,53.0,61.0,69.0,90.0
disease_duration,5990.0,20.015883,24.757918,0.13,8.17,12.7,22.67,244.0
r,5454.0,10.105977,2.307815,0.0,9.0,11.0,12.0,12.0
p1,5602.0,2.797215,1.439963,0.0,2.0,3.0,4.0,4.0
p2,5600.0,3.156607,1.191418,0.0,3.0,4.0,4.0,4.0


## Normalizing continuous values

In [31]:
# Normalize the data with the project helper.
# NOTE(review): `utils.normalize_data` is not visible here. The summary
# statistics displayed afterwards show columns such as "bmi" with mean ~0 and
# std 1 while "subject_id", "ts" and "gender" are unchanged, which suggests a
# z-score normalization of the continuous columns only - confirm in `utils`.
ALS_proc_df = utils.normalize_data(ALS_proc_df)
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,...,onset_form_5,onset_form_ftd,onset_form_nan,lmn,umn_vs_lmn_unknown,umn,c9orf72_no,c9orf72_unknown,c9orf72_yes,niv_label
0,2,0,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,0.0
1,2,1,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,0.0
2,2,2,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,0.0
3,2,3,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,1.0
4,2,4,0,-1.948198,0.311668,-0.404603,-0.594391,0.387389,0.140826,0.70789,...,0,0,0,0,1,0,0,1,0,1.0


In [32]:
ALS_proc_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
subject_id,5996.0,559.8639,369.094962,2.0,253.0,514.0,885.0,1341.0
ts,5996.0,4.328886,4.266936,0.0,1.0,3.0,6.0,26.0
gender,5996.0,0.4181121,0.49329,0.0,0.0,0.0,1.0,1.0
bmi,5354.0,3.136129e-14,1.0,-2.95337,-0.64881,-0.027494,0.61933,3.974387
mnd_familiar_history,5431.0,-2.668951e-16,1.0,-6.432032,0.311668,0.311668,0.311668,0.311668
age_at_onset,5996.0,-4.740112e-18,1.0,-4.040126,-0.562669,0.069596,0.70186,2.361556
disease_duration,5990.0,4.365271e-16,1.0,-0.803213,-0.478468,-0.295497,0.107203,9.046969
r,5454.0,4.9506090000000004e-17,1.0,-4.379023,-0.479231,0.387389,0.820699,0.820699
p1,5602.0,1.014699e-16,1.0,-1.942561,-0.553636,0.140826,0.835289,0.835289
p2,5600.0,1.522592e-16,1.0,-2.649453,-0.131446,0.70789,0.70789,0.70789


## Imputation and removal of incomplete data

Starting from a last-observation-carried-forward technique, each patient's data is initially forward filled. Next, a backward fill is done within each patient, as the patient's current data should still be a good indicator of their recent past. Finally, the remaining missing values are filled with zeroes, since zero is the average value of each normalized feature.

In [33]:
ALS_proc_df[['subject_id', 'ts', 'r', 'p1', 'p2', 'bmi', 'fvc', 'vc', 'mip', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,r,p1,p2,bmi,fvc,vc,mip,niv_label
0,2,0,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
1,2,1,0.820699,0.835289,0.70789,-1.948198,,,,0.0
2,2,2,0.820699,0.835289,0.70789,-1.948198,-0.016206,-0.129978,-0.285228,0.0
3,2,3,0.820699,0.835289,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
4,2,4,0.387389,0.140826,0.70789,-1.948198,,,,1.0
5,2,5,-0.045921,-0.553636,0.70789,-1.948198,,,,
6,3,0,,,,-0.024205,-0.581412,-0.540913,-1.468901,1.0
7,3,1,,0.140826,-2.649453,-0.024205,,,,1.0
8,3,2,,-1.942561,-2.649453,-0.024205,,,,
9,4,0,,,,-0.6382,1.093737,,2.38716,0.0


In [34]:
# Forward fill each patient's data (last observation carried forward).
# `groupby(...).ffill()` replaces the deprecated `fillna(method='ffill')` call
# and never propagates values from one patient into another.
# NOTE(review): every column except "subject_id" is filled, so "niv_label" of
# each patient's last visit (which has no next visit) receives the label of the
# previous visit - confirm that this is intended.
value_columns = [col for col in ALS_proc_df.columns if col != 'subject_id']
ALS_proc_df[value_columns] = ALS_proc_df.groupby('subject_id')[value_columns].ffill()

In [35]:
ALS_proc_df[['subject_id', 'ts', 'r', 'p1', 'p2', 'bmi', 'fvc', 'vc', 'mip', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,r,p1,p2,bmi,fvc,vc,mip,niv_label
0,2,0,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
1,2,1,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
2,2,2,0.820699,0.835289,0.70789,-1.948198,-0.016206,-0.129978,-0.285228,0.0
3,2,3,0.820699,0.835289,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
4,2,4,0.387389,0.140826,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
5,2,5,-0.045921,-0.553636,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
6,3,0,,,,-0.024205,-0.581412,-0.540913,-1.468901,1.0
7,3,1,,0.140826,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
8,3,2,,-1.942561,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
9,4,0,,,,-0.6382,1.093737,,2.38716,0.0


In [36]:
# Backward fill each patient's data, so that values missing at the start of a
# patient's time series take that patient's first later observation.
# `groupby(...).bfill()` replaces the deprecated `fillna(method='bfill')` call
# and never propagates values from one patient into another.
value_columns = [col for col in ALS_proc_df.columns if col != 'subject_id']
ALS_proc_df[value_columns] = ALS_proc_df.groupby('subject_id')[value_columns].bfill()

In [37]:
ALS_proc_df[['subject_id', 'ts', 'r', 'p1', 'p2', 'bmi', 'fvc', 'vc', 'mip', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,r,p1,p2,bmi,fvc,vc,mip,niv_label
0,2,0,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
1,2,1,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
2,2,2,0.820699,0.835289,0.70789,-1.948198,-0.016206,-0.129978,-0.285228,0.0
3,2,3,0.820699,0.835289,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
4,2,4,0.387389,0.140826,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
5,2,5,-0.045921,-0.553636,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
6,3,0,,0.140826,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
7,3,1,,0.140826,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
8,3,2,,-1.942561,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
9,4,0,0.820699,0.835289,0.70789,-0.6382,1.093737,,2.38716,0.0


In [38]:
# Whatever is still missing has no observation at all for that patient; use 0,
# which is the average value of a normalized feature
ALS_proc_df = ALS_proc_df.fillna(0)

In [39]:
ALS_proc_df[['subject_id', 'ts', 'r', 'p1', 'p2', 'bmi', 'fvc', 'vc', 'mip', 'niv_label']].head(20)

Unnamed: 0,subject_id,ts,r,p1,p2,bmi,fvc,vc,mip,niv_label
0,2,0,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
1,2,1,0.820699,0.835289,0.70789,-1.948198,0.18807,0.071629,0.118281,0.0
2,2,2,0.820699,0.835289,0.70789,-1.948198,-0.016206,-0.129978,-0.285228,0.0
3,2,3,0.820699,0.835289,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
4,2,4,0.387389,0.140826,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
5,2,5,-0.045921,-0.553636,0.70789,-1.948198,-1.153721,-0.811151,-0.997222,1.0
6,3,0,0.0,0.140826,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
7,3,1,0.0,0.140826,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
8,3,2,0.0,-1.942561,-2.649453,-0.024205,-0.581412,-0.540913,-1.468901,1.0
9,4,0,0.820699,0.835289,0.70789,-0.6382,1.093737,0.0,2.38716,0.0


In [40]:
# Save the normalized and imputed dataframe.
# Create the output folder first so the export does not fail on a fresh
# checkout where "cleaned/" does not exist yet.
os.makedirs(f'{data_path}cleaned/', exist_ok=True)
# The dataframe index is written as the first (unnamed) column of the CSV.
ALS_proc_df.to_csv(f'{data_path}cleaned/FCUL_ALS_cleaned.csv')

In [41]:
ALS_proc_df.head()

Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,p2,...,onset_form_5,onset_form_ftd,onset_form_nan,lmn,umn_vs_lmn_unknown,umn,c9orf72_no,c9orf72_unknown,c9orf72_yes,niv_label
0,2,0,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,0.0
1,2,1,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,0.0
2,2,2,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,0.0
3,2,3,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,0.70789,...,0,0,0,0,1,0,0,1,0,1.0
4,2,4,0,-1.948198,0.311668,-0.404603,-0.594391,0.387389,0.140826,0.70789,...,0,0,0,0,1,0,0,1,0,1.0


In [42]:
ALS_proc_df.columns

Index(['subject_id', 'ts', 'gender', 'bmi', 'mnd_familiar_history',
       'age_at_onset', 'disease_duration', 'r', 'p1', 'p2', 'p3', 'p4', 'p5',
       'p6', 'p7', 'p8', 'p9', 'p10', '1r', '2r', '3r', 'vc', 'fvc', 'mip',
       'mep', 'p0.1', 'phrenmeanlat', 'phrenmeanampl', 'niv',
       'el_escorial_reviewed_criteria_def',
       'el_escorial_reviewed_criteria_nan',
       'el_escorial_reviewed_criteria_pbp',
       'el_escorial_reviewed_criteria_pma',
       'el_escorial_reviewed_criteria_poss',
       'el_escorial_reviewed_criteria_pro',
       'el_escorial_reviewed_criteria_pro_lab_sup',
       'el_escorial_reviewed_criteria_sus', 'onset_form_1', 'onset_form_2',
       'onset_form_3', 'onset_form_4', 'onset_form_5', 'onset_form_ftd',
       'onset_form_nan', 'lmn', 'umn_vs_lmn_unknown', 'umn', 'c9orf72_no',
       'c9orf72_unknown', 'c9orf72_yes', 'niv_label'],
      dtype='object')