In [14]:
path = "D:\\Rythm\\"

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import FastICA, PCA
import scipy as sc
from scipy.stats import kurtosis
from scipy.stats import skew
import pywt
from scipy import stats
from statsmodels.robust import stand_mad

# Functions

In [16]:
# Quality of hypnogram recording : if there is much than p percent of the record that is -1 -> bad recording

def hypnogram_quality(hypx , treshold):
    return 0 if hypx.count(-1)/len(hypx) > treshold else 1

In [17]:
def preprocessing_hypno(col):
    col = col.replace("'","").replace(",","")[1:-1].split(' ')
    res = [int(i) for i in col]
    return res

In [18]:
# Count number of breaks of slow wave sleep

def count_breaks_3(hypx):
    count = 0
    for i in range(0,len(hypx)-1):
        if hypx[i] !=3 & hypx[i+1] == 3:
            count = count + 1
    return count
            

In [19]:
def get_hypno(data):
    return data["HYPNOGRAM"]

In [20]:
def get_eeg(data):
    return data.ix[:,2:-1]

In [21]:
def get_eeg_features(EEG, age):
    age["mean_eeg"] = EEG.apply(np.mean,axis = 1)
    age["std_eeg"] = EEG.apply(np.std , axis = 1)
    age["kurt_eeg"] =  EEG.apply(sc.stats.kurtosis , axis =1)
    age["skew_eeg"] = EEG.apply(sc.stats.skew , axis =1)
    age["max_eeg"] = EEG.apply(np.max , axis = 1)
    age["min_eeg"] = EEG.apply(np.min, axis = 1)
    return age

In [22]:
def get_hypno_features(age,hyp):
    age["mean_hypno"] = hyp.apply(np.mean)
    age["std_hypno"] = hyp.apply(np.std)
    age["kurt_hypno"] = hyp.apply(sc.stats.kurtosis)
    age["skew_hypno"] = hyp.apply(sc.stats.skew)

    age["length_list"] = hyp.apply(lambda hypx : len(hypx))
    age["occurencies_-1"] = hyp.apply(lambda hypx : hypx.count(-1))
    age["occurencies_0"] = hyp.apply(lambda hypx : hypx.count(0))
    age["occurencies_1"] = hyp.apply(lambda hypx : hypx.count(1))
    age["occurencies_2"] = hyp.apply(lambda hypx : hypx.count(2))
    age["occurencies_3"] = hyp.apply(lambda hypx : hypx.count(3))
    age["occurencies_4"] = hyp.apply(lambda hypx : hypx.count(4))

    # Quality of recording : categorical variable
    quality_threshold = 0.1 #10% of well recorded time steps
    age["record_quality"] = hyp.apply(lambda hypx : hypnogram_quality(hypx,quality_threshold))

    # Percentage of Slow Wave Sleep (3)

    age["SlowWaveSleep_percentage"] = age["occurencies_3"] / age["length_list"]

    # Number of breaks in Slow Wave Sleep

    age["breaks"] = hyp.apply(count_breaks_3)
    return age

In [23]:
def get_device(age,data):
    age["device"] = dat2["DEVICE"]
    return age

In [41]:
def get_fft_length(EEG , age):
    length = []
    for j in range(0,len(EEG)):
        fft = np.fft.fft(EEG.ix[j,:])
        freqs = np.fft.fftfreq(len(fft) , d = 1/250) # frequence
        y = np.sqrt(np.square(fft.real) + np.square(fft.imag)) # amplitude
        z = y[(0 < freqs) & (freqs < 4)]
        length.append(len(z))
    age["fft_length"] = length
    return age
        

In [82]:
def get_fft_mean_ampl(EEG , age):
    res = []
    for j in range(0,len(EEG)):
        fft = np.fft.fft(EEG.ix[j,:])
        freqs = np.fft.fftfreq(len(fft) , d = 1/250) # frequence
        y = np.sqrt(np.square(fft.real) + np.square(fft.imag)) # amplitude
        z = y[(0 < freqs) & (freqs < 4)]
        res.append(np.mean(z))
    age["fft_amp"] = res
    return age

In [26]:
def get_wavelet(EEG):
    coef = EEG.apply(lambda x : pywt.wavedec(x, 'db8', level=8, mode='per'), axis =1)
    return coef

In [33]:
def get_wav_features(coef, age):
    max_coef = []
    min_coef = []
    mean_coef = []
    median_coef = []
    for j in range(0,len(coef)):
        temp_max = [np.max(coef[j][i]) for i in range(0,8)]
        max_coef.append(np.max(temp_max))
        temp_min = [np.min(coef[j][i]) for i in range(0,8)]
        min_coef.append(np.min(temp_min))
        temp_mean = [np.mean(coef[j][i]) for i in range(0,8)]
        mean_coef.append(np.mean(temp_mean))
        temp_median = [np.median(coef[j][i]) for i in range(0,8)]
        median_coef.append(np.median(temp_median))
    age["wav_max"] = max_coef
    age["wav_min"] = min_coef
    age["wav_mean"] = mean_coef
    age["wav_median"] = median_coef
    return age

In [28]:
def get_first_level(coef , age_wav):
  
    wav = []
    for i in range(0,len(coef[1][0])):
        wav.append("wav_"+str(i))

    bla = []
    for i in range(0,581):
        bla.append(coef[i][0])
    
    res = pd.DataFrame(bla)
    res.columns = wav
    
    return pd.concat([age_wav , res] , axis =1)

# Load data

In [87]:
dat2 = pd.read_csv(path+"train_input.csv" , header = 0 , sep =";")

In [88]:
age = pd.read_table(path+"agetrain.txt" , header = 0 , sep =";")
age_wav = pd.DataFrame.copy(age)
age_wav_feat = pd.DataFrame.copy(age)

# Features from EEG

In [56]:
EEG = get_eeg(dat2)
age = get_device(age,dat2)
age = get_eeg_features(EEG,age)

# Features from HYPNOGRAM

In [47]:
hypnogram = get_hypno(dat2)
hyp = hypnogram.apply(preprocessing_hypno)

In [48]:
age = get_hypno_features(age,hyp)
age = get_fft_length(EEG, age)
age = get_fft_mean_ampl(EEG,age)

In [49]:
age.head()

Unnamed: 0,ID,TARGET,device,mean_eeg,std_eeg,kurt_eeg,skew_eeg,max_eeg,min_eeg,mean_hypno,...,occurencies_0,occurencies_1,occurencies_2,occurencies_3,occurencies_4,record_quality,SlowWaveSleep_percentage,breaks,fft_length,fft_amp
0,0,32,0,0.001395848,26.81135,129.538478,-2.978869,370.873413,-584.737732,2.346035,...,89,7,443,233,177,1,0.239959,94,1199,30110.757853
1,1,29,0,-0.0006471835,11.881911,0.421025,-0.042748,58.905529,-66.786278,2.320659,...,78,19,379,198,115,1,0.250951,50,1199,12311.275042
2,2,36,0,0.01041976,97.740294,215.544923,3.324026,1944.06543,-2119.885986,2.267797,...,53,0,53,188,0,1,0.637288,31,1199,122016.340524
3,3,56,0,-0.007526106,17.135283,145.579363,4.544951,440.595032,-190.482651,2.741071,...,65,5,302,189,323,1,0.210938,56,1199,19151.960999
4,4,60,1,-5.648294e-08,3.4e-05,1.346067,-0.322313,0.000124,-0.000185,1.28328,...,463,58,143,239,36,1,0.254526,33,1199,0.038199


In [54]:
age.to_csv("feat.csv")

# Wavelet decomposition

In [None]:
age_wav = get_device(age_wav , dat2)

In [50]:
coef = get_wavelet(EEG)

In [51]:
age_wav = get_first_level(coef , age_wav)

In [52]:
age_wav.head()

Unnamed: 0,ID,TARGET,wav_0,wav_1,wav_2,wav_3,wav_4,wav_5,wav_6,wav_7,...,wav_283,wav_284,wav_285,wav_286,wav_287,wav_288,wav_289,wav_290,wav_291,wav_292
0,0,32,-26.160924,23.303999,-10.213803,6.906258,29.895347,-11.980105,22.45215,-94.052389,...,114.789948,-87.383952,45.343961,-35.58315,46.568365,-57.706383,46.303616,-19.056273,-43.239224,52.121363
1,1,29,-119.202402,115.511347,-31.042702,39.627518,75.966791,-116.779473,-31.488822,75.733471,...,36.964734,-30.638351,75.603686,-16.8208,-129.549361,163.803176,-44.749805,22.295864,-105.316364,49.4141
2,2,36,-162.166613,138.936718,-55.307023,-47.563567,25.383424,133.420173,-139.551223,163.09486,...,-125.434922,169.589339,-131.072328,92.223169,-106.177873,41.249881,4.889169,47.290513,-134.627912,192.335377
3,3,56,-107.109462,65.115314,-83.066183,76.750174,-40.364537,-35.582466,35.15805,-60.960912,...,71.868659,-56.068405,60.445541,-32.552461,-31.070541,35.879694,-0.042507,-16.144663,-67.805915,153.965081
4,4,60,0.000145,-1.8e-05,3.2e-05,7.1e-05,-0.000223,0.000303,-0.000114,6.6e-05,...,-0.000287,0.000242,-0.000404,0.000287,0.000239,-0.000326,6.3e-05,-0.000112,0.000358,-0.000451


In [53]:
age_wav.to_csv("wav.csv")

## Wavelet features


In [59]:
age_wav_feat = get_wav_features(coef,age_wav_feat)
age_wav_feat = get_device(age_wav_feat, dat2)

In [60]:
age_wav_feat.head()

Unnamed: 0,ID,TARGET,wav_max,wav_min,wav_mean,wav_median,device
0,0,32,2257.385063,-2681.838399,1.283108,0.0228532,0
1,1,29,245.921723,-239.334369,-0.527796,-0.3990815,0
2,2,36,6881.331512,-13321.069867,-1.721624,0.2481047,0
3,3,56,1499.878949,-1234.391089,-0.225711,0.004624934,0
4,4,60,0.001245,-0.00105,-3e-06,-3.472571e-09,1


In [61]:
age_wav_feat.to_csv("wavfeat.csv")

# Train a random forest

In [89]:
EEG = get_eeg(dat2)
age = get_device(age,dat2)
age = get_eeg_features(EEG,age)



In [90]:
EEG = get_eeg(dat2)
age = get_device(age,dat2)
age = get_eeg_features(EEG,age)

hypnogram = get_hypno(dat2)
hyp = hypnogram.apply(preprocessing_hypno)

age = get_hypno_features(age,hyp)

In [91]:
age.head()

Unnamed: 0,ID,TARGET,device,mean_eeg,std_eeg,kurt_eeg,skew_eeg,max_eeg,min_eeg,mean_hypno,...,length_list,occurencies_-1,occurencies_0,occurencies_1,occurencies_2,occurencies_3,occurencies_4,record_quality,SlowWaveSleep_percentage,breaks
0,0,32,0,0.001395848,26.81135,129.538478,-2.978869,370.873413,-584.737732,2.346035,...,971,22,89,7,443,233,177,1,0.239959,94
1,1,29,0,-0.0006471835,11.881911,0.421025,-0.042748,58.905529,-66.786278,2.320659,...,789,0,78,19,379,198,115,1,0.250951,50
2,2,36,0,0.01041976,97.740294,215.544923,3.324026,1944.06543,-2119.885986,2.267797,...,295,1,53,0,53,188,0,1,0.637288,31
3,3,56,0,-0.007526106,17.135283,145.579363,4.544951,440.595032,-190.482651,2.741071,...,896,12,65,5,302,189,323,1,0.210938,56
4,4,60,1,-5.648294e-08,3.4e-05,1.346067,-0.322313,0.000124,-0.000185,1.28328,...,939,0,463,58,143,239,36,1,0.254526,33


In [77]:
EEG.head()

Unnamed: 0,EEG_0,EEG_1,EEG_2,EEG_3,EEG_4,EEG_5,EEG_6,EEG_7,EEG_8,EEG_9,...,EEG_74990,EEG_74991,EEG_74992,EEG_74993,EEG_74994,EEG_74995,EEG_74996,EEG_74997,EEG_74998,EEG_74999
0,-41.473923,-45.176369,-48.871876,-52.430531,-55.698608,-58.338871,-59.919689,-60.365482,-59.915306,-58.619602,...,-39.994534,-40.737919,-41.243393,-41.563141,-41.477573,-40.68784,-39.171162,-37.272285,-35.384312,-33.797901
1,1.995628,1.97668,2.144622,2.3071,2.182211,1.700662,0.858995,-0.222909,-1.122924,-1.487486,...,4.151426,3.326329,2.46532,1.581539,0.663152,-0.33666,-1.501697,-2.669854,-3.677377,-4.576015
2,1.017115,1.65727,2.482397,3.356749,4.225514,5.210766,6.30964,7.299839,8.118546,8.81822,...,6.741571,5.831949,4.320967,2.68133,1.038642,-0.829033,-2.989935,-5.276758,-7.512856,-9.267806
3,11.683449,13.265381,15.650467,17.32884,16.866253,16.138889,15.232826,14.188766,12.432949,9.145377,...,-4.335334,-8.712267,-12.486134,-15.047755,-15.679938,-14.733905,-13.749776,-12.376304,-11.085991,-8.736834
4,17.023975,15.871809,14.530089,13.091481,11.45983,9.527679,7.319503,4.868694,2.31906,-0.03631,...,2.221342,1.686171,1.377594,1.298173,1.646075,2.646186,4.198915,5.949712,7.506319,8.67199


In [92]:
age = get_fft_mean_ampl(EEG,age)

coef = get_wavelet(EEG)
age = get_wav_features(coef,age)

age.head()

Unnamed: 0,ID,TARGET,device,mean_eeg,std_eeg,kurt_eeg,skew_eeg,max_eeg,min_eeg,mean_hypno,...,occurencies_3,occurencies_4,record_quality,SlowWaveSleep_percentage,breaks,fft_amp,wav_max,wav_min,wav_mean,wav_median
0,0,32,0,0.001395848,26.81135,129.538478,-2.978869,370.873413,-584.737732,2.346035,...,233,177,1,0.239959,94,30110.757853,2257.385063,-2681.838399,1.283108,0.0228532
1,1,29,0,-0.0006471835,11.881911,0.421025,-0.042748,58.905529,-66.786278,2.320659,...,198,115,1,0.250951,50,12311.275042,245.921723,-239.334369,-0.527796,-0.3990815
2,2,36,0,0.01041976,97.740294,215.544923,3.324026,1944.06543,-2119.885986,2.267797,...,188,0,1,0.637288,31,122016.340524,6881.331512,-13321.069867,-1.721624,0.2481047
3,3,56,0,-0.007526106,17.135283,145.579363,4.544951,440.595032,-190.482651,2.741071,...,189,323,1,0.210938,56,19151.960999,1499.878949,-1234.391089,-0.225711,0.004624934
4,4,60,1,-5.648294e-08,3.4e-05,1.346067,-0.322313,0.000124,-0.000185,1.28328,...,239,36,1,0.254526,33,0.038199,0.001245,-0.00105,-3e-06,-3.472571e-09


In [94]:
age.columns

Index(['ID', 'TARGET', 'device', 'mean_eeg', 'std_eeg', 'kurt_eeg', 'skew_eeg',
       'max_eeg', 'min_eeg', 'mean_hypno', 'std_hypno', 'kurt_hypno',
       'skew_hypno', 'length_list', 'occurencies_-1', 'occurencies_0',
       'occurencies_1', 'occurencies_2', 'occurencies_3', 'occurencies_4',
       'record_quality', 'SlowWaveSleep_percentage', 'breaks', 'fft_amp',
       'wav_max', 'wav_min', 'wav_mean', 'wav_median'],
      dtype='object')

In [93]:
age["record_quality"] = pd.Categorical(age["record_quality"] , ordered = False)
age["device"] = pd.Categorical(age["device"] , ordered = False)
dum = pd.get_dummies(age[["record_quality","device"]])

In [95]:
quantitative = age.ix[:,2:]

In [96]:
quantitative = quantitative.drop(["record_quality", "device"] , axis =1)

In [97]:
X_train = pd.concat([dum, quantitative], axis=1)

In [98]:
X_train.columns

Index(['record_quality_0', 'record_quality_1', 'device_0.0', 'device_1.0',
       'mean_eeg', 'std_eeg', 'kurt_eeg', 'skew_eeg', 'max_eeg', 'min_eeg',
       'mean_hypno', 'std_hypno', 'kurt_hypno', 'skew_hypno', 'length_list',
       'occurencies_-1', 'occurencies_0', 'occurencies_1', 'occurencies_2',
       'occurencies_3', 'occurencies_4', 'SlowWaveSleep_percentage', 'breaks',
       'fft_amp', 'wav_max', 'wav_min', 'wav_mean', 'wav_median'],
      dtype='object')

In [99]:
Y_train = age["TARGET"]

# TEST

In [100]:
dat2= pd.read_csv(path + "test_input.csv" , header = 0 , sep =";")
age = pd.DataFrame()

In [101]:
EEG = get_eeg(dat2)
age = get_device(age,dat2)
age = get_eeg_features(EEG,age)

hypnogram = get_hypno(dat2)
hyp = hypnogram.apply(preprocessing_hypno)

age = get_hypno_features(age,hyp)
age = get_fft_mean_ampl(EEG,age)

coef = get_wavelet(EEG)
age = get_wav_features(coef,age)

In [102]:
age.head()

Unnamed: 0,device,mean_eeg,std_eeg,kurt_eeg,skew_eeg,max_eeg,min_eeg,mean_hypno,std_hypno,kurt_hypno,...,occurencies_3,occurencies_4,record_quality,SlowWaveSleep_percentage,breaks,fft_amp,wav_max,wav_min,wav_mean,wav_median
0,0,-0.010803,39.670638,1.092408,-0.150298,210.285889,-266.636261,2.301815,1.051958,0.565742,...,375,94,1,0.358166,115,41504.785388,1193.492798,-1359.09296,1.793273,0.004136
1,0,0.008285,13.064128,1.720162,-0.40693,62.612057,-86.85981,2.359335,0.972793,0.682986,...,172,111,1,0.219949,59,14936.198693,381.107914,-318.570303,0.865945,3.6e-05
2,0,0.047561,46.866553,144.778221,-6.068087,419.809143,-985.156738,2.401235,1.055321,0.248593,...,126,121,1,0.194444,34,46839.038799,4367.784532,-4288.617221,0.555141,-0.002713
3,0,0.003095,11.212206,0.822795,-0.163347,46.615643,-58.285442,1.78186,1.612435,-0.961079,...,190,125,0,0.21814,165,11566.457527,256.509737,-264.927743,0.259609,-0.132295
4,0,-0.009169,17.592145,2.923748,-0.158121,91.47422,-108.499474,2.255196,0.973436,0.778938,...,310,49,1,0.357968,72,19681.453293,507.270915,-617.978831,-2.700893,0.008838


In [66]:
age.columns

Index(['device', 'mean_eeg', 'std_eeg', 'kurt_eeg', 'skew_eeg', 'max_eeg',
       'min_eeg', 'mean_hypno', 'std_hypno', 'kurt_hypno', 'skew_hypno',
       'length_list', 'occurencies_-1', 'occurencies_0', 'occurencies_1',
       'occurencies_2', 'occurencies_3', 'occurencies_4', 'record_quality',
       'SlowWaveSleep_percentage', 'breaks', 'fft_length', 'fft_amp',
       'wav_max', 'wav_min', 'wav_mean', 'wav_median'],
      dtype='object')

In [103]:

age = get_wav_features(coef,age)
age.head()

Unnamed: 0,device,mean_eeg,std_eeg,kurt_eeg,skew_eeg,max_eeg,min_eeg,mean_hypno,std_hypno,kurt_hypno,...,occurencies_3,occurencies_4,record_quality,SlowWaveSleep_percentage,breaks,fft_amp,wav_max,wav_min,wav_mean,wav_median
0,0,-0.010803,39.670638,1.092408,-0.150298,210.285889,-266.636261,2.301815,1.051958,0.565742,...,375,94,1,0.358166,115,41504.785388,1193.492798,-1359.09296,1.793273,0.004136
1,0,0.008285,13.064128,1.720162,-0.40693,62.612057,-86.85981,2.359335,0.972793,0.682986,...,172,111,1,0.219949,59,14936.198693,381.107914,-318.570303,0.865945,3.6e-05
2,0,0.047561,46.866553,144.778221,-6.068087,419.809143,-985.156738,2.401235,1.055321,0.248593,...,126,121,1,0.194444,34,46839.038799,4367.784532,-4288.617221,0.555141,-0.002713
3,0,0.003095,11.212206,0.822795,-0.163347,46.615643,-58.285442,1.78186,1.612435,-0.961079,...,190,125,0,0.21814,165,11566.457527,256.509737,-264.927743,0.259609,-0.132295
4,0,-0.009169,17.592145,2.923748,-0.158121,91.47422,-108.499474,2.255196,0.973436,0.778938,...,310,49,1,0.357968,72,19681.453293,507.270915,-617.978831,-2.700893,0.008838


In [104]:
age.to_csv("age_test.csv")
age_test = pd.DataFrame.copy(age)

In [105]:
age_test["record_quality"] = pd.Categorical(age_test["record_quality"] , ordered = False)
age_test["device"] = pd.Categorical(age_test["device"] , ordered = False)
dum = pd.get_dummies(age_test[["record_quality","device"]])

In [106]:
quantitative_test = age_test

In [107]:
quantitative_test = quantitative_test.drop(["record_quality", "device"] , axis =1)

In [108]:
X_test= pd.concat([dum, quantitative_test], axis=1)

In [110]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param=[{"max_features":list(range(4,len(X_train.columns),3)), "n_estimators" : list(range(50,500,50))}]
rf= GridSearchCV(RandomForestRegressor(),param,cv=5,n_jobs=-1)
rfOpt=rf.fit(X_train , Y_train)
# paramètres optimaux
print("Meilleur score = %f, nombre de features = %i, nombre d'arbres = %i" % (1. - rfOpt.best_score_,rfOpt.best_params_["max_features"], rfOpt.best_params_["n_estimators"]))

Meilleur score = 0.741388, nombre de features = 22, nombre d'arbres = 150


In [112]:
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor(max_features = 22 , n_estimators = 150)
rfFit = rf.fit(X_train, Y_train)
print("MSE=",mean_squared_error(Y_train,rfFit.predict(X_train)))

MSE= 13.9079202142


In [113]:
pred = np.round(rfFit.predict(X_test))

In [114]:
res = pd.DataFrame()
#res["ID"] = [i for i in range(581,830)]
res["TARGET"] = pred
res

Unnamed: 0,TARGET
0,39
1,42
2,41
3,36
4,38
5,32
6,58
7,35
8,41
9,55


In [115]:
res.to_csv("test_output2.csv")