# Characterizing data distribution

In [38]:
input_dir = 'autoML'

### We want to represent the dataset in a human-friendly format to get a good impression of it. This is a kind of dataset "identity card".

- Data format : autoML

### Characterization and visualization

- Hierarchical clustering with heatmap matrix
- Hierarchical clustering with correlation matrix
- Principal components analysis (PCA)
- Linear discriminant analysis (LDA)
- t-distributed stochastic neighbor embedding (t-SNE algorithme)

In [39]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Read data

In [40]:
# Currently ad-hoc

# Will use ingestion program in the future ?

# Example : 
# mimic.data
# mimic_feat.name
# mimic.solution
# mimic_label.name

feat_name = pd.read_csv(input_dir+'/mimic_feat.name', sep='\n', header=None)
data = pd.read_csv(input_dir+'/mimic.data', sep=' ', header=None)
X = pd.DataFrame(data.values, columns=feat_name)

label_name = pd.read_csv(input_dir+'/mimic_label.name', sep='\n', header=None)
solution = pd.read_csv(input_dir+'/mimic.solution', sep=' ', header=None)
y = pd.DataFrame(solution.values, columns=label_name)

In [41]:
X.head()

Unnamed: 0,"(INSURANCE,)","(LANGUAGE,)","(RELIGION,)","(MARITAL_STATUS,)","(ETHNICITY,)","(GENDER,)","(AGE,)","(CCU,)","(CSRU,)","(MICU,)",...,"(V74,)","(V81,)","(V83,)","(V84,)","(V85,)","(V86,)","(V87,)","(V88,)","(V90,)","(V91,)"
0,Medicare,,CATHOLIC,MARRIED,WHITE,M,71,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Private,,UNOBTAINABLE,MARRIED,WHITE,M,58,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Medicare,,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,M,72,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Medicare,,CATHOLIC,MARRIED,WHITE,M,74,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Medicare,,CATHOLIC,MARRIED,UNKNOWN/NOT SPECIFIED,M,-285,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
y.head()

Unnamed: 0,"(DIED,)","(LOS,)"
0,0.0,1.2641
1,0.0,3.5466
2,0.0,2.1407
3,0.0,1.1224
4,0.0,1.8472


### Simplification

- Replace missing values, NaN, Inf.
- Replace missing categorical variables.
- Normalization

In [45]:
def is_numeric(variable):
    """ Test if a variable (DataFrame column) is numeric or categorical """
    
    numeric = False
    for value in variable:
        # Check if there is at least one value that is a number and not NaN
        # (isinstance consider Nan as a Number)
        if isinstance(value, (int, float)) and not np.isnan(value):
            numeric = True
            break
            
    return numeric
    
def preprocessing(data):
    """ Return preprocessed DataFrame """
    
    columns = data.columns.values

    for column in columns:
    
        # For numerical variables
        if is_numeric(data[column]):
    
            # Replace NaN with the median of the variable value
            data[column] = data[column].fillna(data[column].median())
    
            # Replace +Inf by the maximum and -Inf by the minimum
            data[column] = data[column].replace(np.inf, max(data[column]))
            data[column] = data[column].replace(-np.inf, min(data[column]))
    
        # For categorigal variables
        else:
            # Replace NaN with 'missing'
            # TODO : For one-hot encoding : [0, 0, ..., 0]
            data[column] = data[column].fillna('missing')
            
            # One-hot encoding
            one_hot = pd.get_dummies(data[column])
            data = data.drop(column, axis=1)
            data = data.join(one_hot, lsuffix='l', rsuffix='r')
        
    return data

In [46]:
X = preprocessing(X)
y = preprocessing(y)

### Hierarchical clustering