In [3]:
# Import required libraries
import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

In [4]:
# Load the data
HRV = pd.read_csv('/Users/aguetat/Desktop/train.csv')
HRV.head(10)

Unnamed: 0,MEAN_RR,MEDIAN_RR,SDRR,RMSSD,SDSD,SDRR_RMSSD,HR,pNN25,pNN50,SD1,...,HF,HF_PCT,HF_NU,TP,LF_HF,HF_LF,sampen,higuci,datasetId,condition
0,885.157845,853.76373,140.972741,15.554505,15.553371,9.063146,69.499952,11.133333,0.533333,11.001565,...,15.522603,0.421047,1.514737,3686.666157,65.018055,0.01538,2.139754,1.163485,2,no stress
1,939.425371,948.357865,81.317742,12.964439,12.964195,6.272369,64.36315,5.6,0.0,9.170129,...,2.108525,0.070133,0.304603,3006.487251,327.296635,0.003055,2.174499,1.084711,2,interruption
2,898.186047,907.00686,84.497236,16.305279,16.305274,5.182201,67.450066,13.066667,0.2,11.533417,...,13.769729,0.512671,1.049528,2685.879461,94.28091,0.010607,2.13535,1.176315,2,interruption
3,881.757865,893.46003,90.370537,15.720468,15.720068,5.748591,68.809562,11.8,0.133333,11.119476,...,18.181913,0.529387,1.775294,3434.52098,55.328701,0.018074,2.178341,1.179688,2,no stress
4,809.625331,811.184865,62.766242,19.213819,19.213657,3.266724,74.565728,20.2,0.2,13.590641,...,48.215822,1.839473,3.279993,2621.175204,29.487873,0.033912,2.221121,1.249612,2,no stress
5,923.283866,617.79416,517.536544,9.965976,9.933933,51.930344,81.342254,1.2,0.6,7.026695,...,11.02746,0.31849,6.799829,3462.418453,13.706252,0.072959,0.582616,1.128483,2,no stress
6,973.252908,964.65002,82.405179,10.644196,10.643638,7.741794,62.095066,2.0,0.0,7.5287,...,1.489796,0.119828,0.307425,1243.278879,324.282351,0.003084,2.161461,1.158004,2,no stress
7,715.914682,679.499395,131.477151,9.477727,9.477717,13.872224,85.857703,2.533333,0.2,6.703994,...,28.913453,1.501528,6.194082,1925.601664,15.144441,0.066031,1.110739,1.146555,2,no stress
8,814.257021,827.52283,87.014459,14.632232,14.631275,5.946766,74.588857,7.733333,0.8,10.349326,...,20.757787,0.659188,2.026982,3148.992003,48.33443,0.020689,2.174233,1.122471,2,interruption
9,959.694591,957.8956,54.904529,12.0154,12.015343,4.569513,62.726998,3.266667,0.2,8.498966,...,2.572459,0.1508,0.490102,1705.869787,203.039304,0.004925,2.1716,1.176054,2,no stress


In [5]:
# Data Exploration
HRV.groupby('datasetId').condition.value_counts()

datasetId  condition    
2          no stress        200082
           interruption     105150
           time pressure     64057
Name: condition, dtype: int64

In [6]:
# Data Exploration
HRV.groupby('condition').HR.value_counts()

condition      HR        
interruption   51.363126     1
               51.363656     1
               51.364273     1
               51.365412     1
               51.367964     1
               51.370943     1
               51.372594     1
               51.374063     1
               51.377141     1
               51.379473     1
               51.380090     1
               51.382883     1
               51.384965     1
               51.385398     1
               51.385480     1
               51.385933     1
               51.387378     1
               51.387545     1
               51.387816     1
               51.388088     1
               51.389789     1
               51.390973     1
               51.391166     1
               51.391279     1
               51.392363     1
               51.393253     1
               51.394313     1
               51.394878     1
               51.396019     1
               51.396124     1
                            ..
time pressure

In [7]:
# Data Exploration
HRV.groupby(['datasetId','condition']).HR.value_counts()

datasetId  condition      HR        
2          interruption   51.363126     1
                          51.363656     1
                          51.364273     1
                          51.365412     1
                          51.367964     1
                          51.370943     1
                          51.372594     1
                          51.374063     1
                          51.377141     1
                          51.379473     1
                          51.380090     1
                          51.382883     1
                          51.384965     1
                          51.385398     1
                          51.385480     1
                          51.385933     1
                          51.387378     1
                          51.387545     1
                          51.387816     1
                          51.388088     1
                          51.389789     1
                          51.390973     1
                          51.391166    

In [8]:
# Data Munging
# The first and most important step in using TPOT on any data set is to rename the target class/response variable to class.

HRV.rename(columns={'datasetID': 'class'}, inplace=True)

In [9]:
# Data Munging
# At present, TPOT requires all the data to be in numerical format. 
# As we can see below, our data set has 1 categorical variables which contain non-numerical values

HRV.dtypes

MEAN_RR              float64
MEDIAN_RR            float64
SDRR                 float64
RMSSD                float64
SDSD                 float64
SDRR_RMSSD           float64
HR                   float64
pNN25                float64
pNN50                float64
SD1                  float64
SD2                  float64
KURT                 float64
SKEW                 float64
MEAN_REL_RR          float64
MEDIAN_REL_RR        float64
SDRR_REL_RR          float64
RMSSD_REL_RR         float64
SDSD_REL_RR          float64
SDRR_RMSSD_REL_RR    float64
KURT_REL_RR          float64
SKEW_REL_RR          float64
VLF                  float64
VLF_PCT              float64
LF                   float64
LF_PCT               float64
LF_NU                float64
HF                   float64
HF_PCT               float64
HF_NU                float64
TP                   float64
LF_HF                float64
HF_LF                float64
sampen               float64
higuci               float64
datasetId     

In [10]:
# Data Munging
# We then check the number of levels that each of the five categorical variables have.
for cat in ['condition']:
    print("Number of levels in category '{0}': \b {1:2.2f} ".format(cat, HRV[cat].unique().size))

Number of levels in category 'condition':  3.00 


In [11]:
# Data Munging
# As we can see, contact and poutcome have few levels. Let's find out what they are.
for cat in ['condition']:
    print("Levels for catgeory '{0}': {1}".format(cat, HRV[cat].unique()))

Levels for catgeory 'condition': ['no stress' 'interruption' 'time pressure']


In [27]:
# Data Munging
# We then code these levels manually into numerical values. 
HRV['condition'] = HRV['condition'].map({'no stress':-1,'interruption':0,'time pressure':1})
HRV['class'] = HRV['class'].map({'2':2})

KeyError: 'class'

In [13]:
# Data Munging
# For nan i.e. the missing values, we simply replace them with a placeholder value (-999). 
# In fact, we perform this replacement for the entire data set.
HRV = HRV.fillna(-999)
pd.isnull(HRV).any()

MEAN_RR              False
MEDIAN_RR            False
SDRR                 False
RMSSD                False
SDSD                 False
SDRR_RMSSD           False
HR                   False
pNN25                False
pNN50                False
SD1                  False
SD2                  False
KURT                 False
SKEW                 False
MEAN_REL_RR          False
MEDIAN_REL_RR        False
SDRR_REL_RR          False
RMSSD_REL_RR         False
SDSD_REL_RR          False
SDRR_RMSSD_REL_RR    False
KURT_REL_RR          False
SKEW_REL_RR          False
VLF                  False
VLF_PCT              False
LF                   False
LF_PCT               False
LF_NU                False
HF                   False
HF_PCT               False
HF_NU                False
TP                   False
LF_HF                False
HF_LF                False
sampen               False
higuci               False
datasetId            False
condition            False
dtype: bool

In [24]:
# Data Munging
# For other categorical variables, we encode the levels as digits using 
# Scikit-learn's MultiLabelBinarizer and treat them as new features.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

condition_Trans = mlb.fit_transform([{str(val)} for val in HRV['condition'].values])
HR_Trans = mlb.fit_transform([{str(val)} for val in HRV['HR'].values])
MEAN_RR_Trans = mlb.fit_transform([{str(val)} for val in HRV['MEAN_RR'].values])
MEDIAN_RR_Trans = mlb.fit_transform([{str(val)} for val in HRV['MEDIAN_RR'].values])

In [22]:
# Data Munging
# Drop the unused features from the dataset.
HRV_new = HRV.drop(['MEAN_REL_RR','MEDIAN_REL_RR','SDRR_REL_RR','RMSSD_REL_RR','SDSD_REL_RR','SDRR_RMSSD_REL_RR','KURT_REL_RR','SKEW_REL_RR'], axis=1)

In [25]:
# Data Munging
# We then add the encoded features to form the final dataset to be used with TPOT.
import numpy as np
HRV_new = np.hstack((HRV_new.values, HR_Trans, class, MEAN_RR_Trans, MEDIAN_RR_Trans))

SyntaxError: invalid syntax (<ipython-input-25-b668b13fdaec>, line 4)

In [None]:
# Data Munging
np.isnan(marketing_new).any()