# FCUL ALS Pre Training Exploration
---

Exploring the ALS dataset from Faculdade de Ciências da Universidade de Lisboa (FCUL), which contains data from over 1000 patients collected in Portugal.

Just playing around with the cleaned dataframe before feeding it into the machine learning pipeline.

## Importing the necessary packages

In [1]:
import pandas as pd              # Pandas to handle the data in dataframes
import re                        # re to do regex searches in string data
import plotly                    # Plotly for interactive and pretty plots
import plotly.graph_objs as go
from datetime import datetime    # datetime to use proper date and time formats
import os                        # os handles directory/workspace changes
import numpy as np               # NumPy to handle numeric and NaN operations
from tqdm import tqdm_notebook   # tqdm allows to track code execution progress
import numbers                   # numbers allows to check if data is numeric
import torch                     # PyTorch to create and apply deep learning models
from torch.utils.data.sampler import SubsetRandomSampler
import data_utils as du          # Data science and machine learning relevant methods

In [2]:
# Location of the CSV dataset files, relative to the directory switched to below
data_path = 'Datasets/Thesis/FCUL_ALS/'
# Move two directory levels up so that the relative dataset path can be resolved
os.chdir('../..')

## Exploring the cleaned dataset

In [3]:
# Read the cleaned FCUL ALS table from disk and preview its first five rows
ALS_proc_df = pd.read_csv(data_path + 'cleaned/FCUL_ALS_cleaned.csv')
ALS_proc_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,subject_id,ts,gender,bmi,mnd_familiar_history,age_at_onset,disease_duration,r,p1,...,onset_form_5,onset_form_ftd,onset_form_nan,lmn,umn_vs_lmn_unknown,umn,c9orf72_no,c9orf72_unknown,c9orf72_yes,niv_label
0,0,2,0,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,...,0,0,0,0,1,0,0,1,0,0.0
1,1,2,1,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,...,0,0,0,0,1,0,0,1,0,0.0
2,2,2,2,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,...,0,0,0,0,1,0,0,1,0,0.0
3,3,2,3,0,-1.948198,0.311668,-0.404603,-0.594391,0.820699,0.835289,...,0,0,0,0,1,0,0,1,0,1.0
4,4,2,4,0,-1.948198,0.311668,-0.404603,-0.594391,0.387389,0.140826,...,0,0,0,0,1,0,0,1,0,1.0


In [4]:
# Show the dtype that pandas assigned to each column of the cleaned dataframe
ALS_proc_df.dtypes

Unnamed: 0                                     int64
subject_id                                     int64
ts                                             int64
gender                                         int64
bmi                                          float64
mnd_familiar_history                         float64
age_at_onset                                 float64
disease_duration                             float64
r                                            float64
p1                                           float64
p2                                           float64
p3                                           float64
p4                                           float64
p5                                           float64
p6                                           float64
p7                                           float64
p8                                           float64
p9                                           float64
p10                                          f

In [5]:
# Count the number of distinct values found in each column
ALS_proc_df.nunique()

Unnamed: 0                                   5996
subject_id                                    918
ts                                             27
gender                                          2
bmi                                           566
mnd_familiar_history                            4
age_at_onset                                   66
disease_duration                              531
r                                              14
p1                                              6
p2                                              6
p3                                              6
p4                                              6
p5                                              6
p6                                              6
p7                                              6
p8                                              6
p9                                              6
p10                                             6
1r                                              6


In [6]:
# Missing-value report from the project's data_utils helper; judging by the
# rendered output it lists a percent_missing figure per column (helper not shown here)
du.search_explore.dataframe_missing_values(ALS_proc_df)

Unnamed: 0,column_name,percent_missing
Unnamed: 0,Unnamed: 0,0.0
phrenmeanampl,phrenmeanampl,0.0
niv,niv,0.0
el_escorial_reviewed_criteria_def,el_escorial_reviewed_criteria_def,0.0
el_escorial_reviewed_criteria_nan,el_escorial_reviewed_criteria_nan,0.0
el_escorial_reviewed_criteria_pbp,el_escorial_reviewed_criteria_pbp,0.0
el_escorial_reviewed_criteria_pma,el_escorial_reviewed_criteria_pma,0.0
el_escorial_reviewed_criteria_poss,el_escorial_reviewed_criteria_poss,0.0
el_escorial_reviewed_criteria_pro,el_escorial_reviewed_criteria_pro,0.0
el_escorial_reviewed_criteria_pro_lab_sup,el_escorial_reviewed_criteria_pro_lab_sup,0.0


In [7]:
# Summary statistics, flipped so that each dataframe column becomes one row
ALS_proc_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,5996.0,2997.5,1731.040439,0.0,1498.75,2997.5,4496.25,5995.0
subject_id,5996.0,559.8639,369.094962,2.0,253.0,514.0,885.0,1341.0
ts,5996.0,4.328886,4.266936,0.0,1.0,3.0,6.0,26.0
gender,5996.0,0.4181121,0.49329,0.0,0.0,0.0,1.0,1.0
bmi,5996.0,2.800517e-14,0.94494,-2.95337,-0.599209,0.0,0.50613,3.974387
mnd_familiar_history,5996.0,-2.085649e-16,0.951712,-6.432032,0.311668,0.311668,0.311668,0.311668
age_at_onset,5996.0,-1.777542e-17,1.0,-4.040126,-0.562669,0.069596,0.70186,2.361556
disease_duration,5996.0,4.360903e-16,0.999499,-0.803213,-0.478468,-0.295497,0.107203,9.046969
r,5996.0,-0.0007528759,1.000666,-4.379023,-0.479231,0.387389,0.820699,0.820699
p1,5996.0,0.0136591,0.987891,-1.942561,-0.553636,0.140826,0.835289,0.835289


## Random exploratory stuff

In [8]:
# Toy predictions and ground-truth labels used to try out metric computations
pred = torch.Tensor([1, 0, 0, 0, 1, 1])
labels = torch.Tensor([0, 0, 0, 1, 1, 1])
# Element-wise flag telling where the prediction matches the label
correct_pred = torch.eq(pred, labels)
correct_pred

tensor([False,  True,  True, False,  True,  True])

In [9]:
# Keep only the predictions at the positions whose label is positive (non-zero).
# masked_select expects a boolean mask: uint8 masks (from .byte()) are deprecated
# and rejected by recent PyTorch releases, so the labels are converted with .bool().
torch.masked_select(pred, labels.bool())



tensor([0., 1., 1.])

In [10]:
# True positives: add up the (0/1) predictions at the positions labelled positive.
# A boolean mask is used because uint8 masks (.byte()) are deprecated in PyTorch,
# and the tensor's own .sum() replaces the slower Python-level sum() over elements.
true_pos = int(torch.masked_select(pred, labels.bool()).sum())
true_pos



2

In [11]:
# False negatives: count the positions labelled positive where the prediction is 0.
# A boolean mask is used because uint8 masks (.byte()) are deprecated in PyTorch,
# and the tensor's own .sum() replaces the slower Python-level sum() over elements.
false_neg = int(torch.masked_select(pred == 0, labels.bool()).sum())
false_neg



1

In [12]:
# True negatives: count the positions labelled 0 where the prediction is also 0.
# The comparison `labels == 0` already yields a boolean mask, so it is passed as is;
# casting it to uint8 with .byte() is deprecated for masking in PyTorch.
true_neg = int(torch.masked_select(pred == 0, labels == 0).sum())
true_neg



2

In [13]:
# False positives: add up the (0/1) predictions at the positions labelled 0.
# The comparison `labels == 0` already yields a boolean mask, so it is passed as is;
# casting it to uint8 with .byte() is deprecated for masking in PyTorch.
false_pos = int(torch.masked_select(pred, labels == 0).sum())
false_pos



1

In [14]:
# True when at least one of the metric names also appears in the candidate list
not {'a', 'b', 'c'}.isdisjoint(['precision', 'recall', 'F1'])

False

In [15]:
# Throwaway variable, defined so the namespace lookup in the next cell can find it
x = 1

In [16]:
# Check whether a variable named 'x' exists in the current local namespace
'x' in locals()

True