In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import neurokit2 as nk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import re
import os

In [2]:
DATA_PATH = '/srv/scratch3/bruxit/bruxit_all_fnorm/'
DATA_PATH = "/Users/nanashuka/Document/Zurich/UZH/master_project/sensor-data-analysis-pipeline/back-end/data/"
original_sampling = 2000
selected_sampling = 1000

In [3]:
def resample_signal(signal, sampling_rate=2000, selected_sampling=1000):
    return nk.signal_resample(signal, sampling_rate=sampling_rate, desired_sampling_rate=selected_sampling, method="pandas")

In [5]:
existing_patients_recordings = {}
for folder in os.listdir(DATA_PATH):
        if not os.path.isdir(DATA_PATH + folder):
            continue
        if not re.search('p(.*?)_', folder):
            continue
        patient_id = re.search('p(.*?)_', folder).group(1)

        patient_week_folder = DATA_PATH + folder

        csv_files = [f for f in os.listdir(patient_week_folder) if f.endswith(".csv")]

        night_id_list= []
        night_id_recorder = {}
        for csv in csv_files:
            night_id = re.search("[0-9]+", csv).group(0)
            night_id_list.append(night_id)

            if "location_Bites" not in csv:
                recorder = re.search('.(?=F)', csv).group(0)
                night_id_recorder[night_id] = recorder

        if night_id_list:
            week = re.search('wk(.*)', folder).group(1)
            night_id_list = list(set(night_id_list))
            night_id_list = sorted(night_id_list)

            for night_id in night_id_list:
                if patient_id in existing_patients_recordings:
                    existing_patients_recordings[patient_id].append({
                        "week": week,
                        "night_id": night_id,
                        "recorder": night_id_recorder[night_id]
                    })
                else:
                    existing_patients_recordings[patient_id] = [{
                        "week": week,
                        "night_id": night_id,
                        "recorder": night_id_recorder[night_id]
                    }]
print(existing_patients_recordings)

{'3': [{'week': '21-22', 'night_id': '0100004', 'recorder': 'a'}, {'week': '5-6', 'night_id': '0809455', 'recorder': 'a'}, {'week': '5-6', 'night_id': '0816565', 'recorder': 'a'}, {'week': '5-6', 'night_id': '0902175', 'recorder': 'a'}, {'week': '3-4', 'night_id': '0602473', 'recorder': 'a'}, {'week': '3-4', 'night_id': '0612150', 'recorder': 'a'}, {'week': '13-14', 'night_id': '0100002', 'recorder': 'a'}], '2': [{'week': '1-2', 'night_id': '0107285', 'recorder': 'e'}, {'week': '1-2', 'night_id': '0123035', 'recorder': 'e'}, {'week': '1-2', 'night_id': '0204451', 'recorder': 'e'}, {'week': '1-2', 'night_id': '0213314', 'recorder': 'e'}, {'week': '5-6', 'night_id': '0521390', 'recorder': 'e'}, {'week': '3-4', 'night_id': '0315230', 'recorder': 'e'}, {'week': '13-14', 'night_id': '1722373', 'recorder': 'd'}], '1': [{'week': '21-22', 'night_id': '2020582', 'recorder': 'c'}, {'week': '1', 'night_id': '0901260', 'recorder': 'c'}, {'week': '1', 'night_id': '1022102', 'recorder': 'c'}, {'week

In [8]:
general_model = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic',
        eval_metric='logloss', subsample=0.6, max_depth=3, learning_rate=0.1, colsample_bytree=1.0)
first = True
for patient_id, recordings in existing_patients_recordings.items():
    print("Patient: ", patient_id)
    patient_model = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic',
        eval_metric='logloss', subsample=0.6, max_depth=3, learning_rate=0.1, colsample_bytree=1.0)
    df = pd.DataFrame({'MR': [], 'ML': [], 'Bites': []})
    for recording in recordings:
        try:
            week = recording["week"]
            night_id = recording["night_id"]
            recorder = recording["recorder"]
            print(f"p{patient_id}, wk{week}, {night_id}{recorder}")
            # Load data
            loc = pd.read_csv(DATA_PATH + "p" + patient_id + "_wk" + week + "/" + night_id + recorder + "location_Bites.csv")
            # data = pd.read_csv(DATA_PATH + "p" + patient_id + "_wk" + week + "/" + night_id + recorder + "Fnorm.csv")
            range_min = 0
            range_max = int(loc.iloc[2,1]) + 1
            dur_len = float(loc.iloc[0,2]) + float(loc.iloc[1,2]) + float(loc.iloc[2,2])
            if dur_len*original_sampling*2 > range_max:
                range_max = int(dur_len*2000*2)
            print('range_max: ', range_max)
            data_itr = pd.read_csv(f'{DATA_PATH}p{patient_id}_wk{week}/{night_id}{recorder}Fnorm.csv', chunksize=range_max, iterator=True, usecols=['MR','ML'])
            data = data_itr.get_chunk()
            print('data.shape: ', data.shape)
            # print('data: ', data.describe())
            
            bites = np.zeros(data.shape[0], dtype=int)
            data = data.dropna()
            # print('data.shape after drop: ', data.shape)
            print(f"label bites, wk{week}, {night_id}{recorder}")
            for i in range(0, data.shape[0]):
                if i < int(float(loc.iloc[0,0])) or (i > int(float(loc.iloc[0,1])) and i < int(float(loc.iloc[1,0]))) or (i > int(float(loc.iloc[1,1])) and i < int(float(loc.iloc[2,0]))) or i > int(float(loc.iloc[2,1])):
                    bites[i] = 0
                else:
                    bites[i] = 1
            bites = bites[data.index]
            MR = data['MR'].values.tolist()
            ML = data['ML'].values.tolist()
            if (len(bites) != len(MR)):
                print('/////////////////////////////////////')
                print('len(bites): ', len(bites))
                print('len(MR): ', len(MR))
                print('/////////////////////////////////////')
            # print('MR: ', MR[:10])
            # print('ML: ', ML[:10])
            MR = resample_signal(signal=MR, sampling_rate=original_sampling, selected_sampling=selected_sampling)
            ML = resample_signal(signal=ML, sampling_rate=original_sampling, selected_sampling=selected_sampling)
            bites = resample_signal(signal=bites, sampling_rate=original_sampling, selected_sampling=selected_sampling)
            # df.loc[:,'Bites'] = bites
            df_p = pd.DataFrame({'MR': MR, 'ML': ML, 'Bites': bites})
            print('df_p.shape: ', df_p.shape)
            print('df.shape: ', df.shape)
            # print('df: ', df_p.describe())
            
            df = df.append(df_p, ignore_index=True)
            print()
        except Exception as e:
            print(e)
            print()
            continue
    
    x = df.iloc[:,:2].copy()
    y = df.iloc[:,-1].copy()
    x = np.array(x.values.tolist())
    y = np.array(y.values.tolist())
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25) # Split data for test and training
    SC = StandardScaler()
    x_train = pd.DataFrame(SC.fit_transform(x_train))
    x_test = pd.DataFrame(SC.transform(x_test))
    print(f"fit model p{patient_id}")
    patient_model.fit(x_train, y_train)
    print(f"save model p{patient_id}")
    patient_model.save_model(DATA_PATH + '/models/' +f"p{patient_id}_model.json")

    print("fit general model")
    if first:
        general_model.fit(x_train, y_train)
        first = False
    else:
        general_model.fit(x_train, y_train, xgb_model=general_model)

general_model.save_model(DATA_PATH + '/models/' + "general_model.json")
print("save general model")

Patient:  3
p3, wk21-22, 0100004a
range_max:  489511
data.shape:  (489511, 2)
data:                    MR            ML
count  489511.000000  4.895110e+05
mean        0.000091 -2.335562e-07
std         3.849291  1.288763e+00
min       -71.793000 -1.757000e+02
25%        -0.302290 -3.872200e-02
50%         0.000733  3.616700e-05
75%         0.306670  3.927800e-02
max       108.550000  8.888100e+01
label bites, wk21-22, 0100004a
df_p.shape:  (244756, 3)
df.shape:  (0, 3)

p3, wk5-6, 0809455a
range_max:  27396
data.shape:  (27396, 2)
data:                   MR            ML
count  27396.000000  27396.000000
mean       0.000019      0.000442
std        0.682977      1.855528
min      -12.705000    -16.463000
25%       -0.085409     -0.327133
50%       -0.003090      0.029702
75%        0.080449      0.539615
max       25.175000     22.501000
label bites, wk5-6, 0809455a


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (13698, 3)
df.shape:  (244756, 3)

p3, wk5-6, 0816565a
range_max:  20829
data.shape:  (20829, 2)
data:                   MR            ML
count  20829.000000  20829.000000
mean      -0.000232      0.001881
std        1.825176      5.696589
min      -14.500000   -114.570000
25%       -0.612850     -2.403500
50%       -0.017522      0.086490
75%        0.554120      2.405300
max       17.400000    120.110000
label bites, wk5-6, 0816565a


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (10414, 3)
df.shape:  (258454, 3)

p3, wk5-6, 0902175a
range_max:  18216
data.shape:  (18216, 2)
data:                   MR            ML
count  18216.000000  18216.000000
mean       0.000264     -0.000039
std        3.783902      1.461487
min      -52.385000    -10.225000
25%       -0.705965     -0.550782
50%       -0.013310      0.061732
75%        0.639030      0.815513
max      103.440000      9.179900
label bites, wk5-6, 0902175a


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (9108, 3)
df.shape:  (268868, 3)

p3, wk3-4, 0602473a
range_max:  21273
data.shape:  (21273, 2)
data:                   MR            ML
count  21273.000000  21273.000000
mean       0.000061      0.002164
std        0.909057     11.866573
min      -19.247000   -325.720000
25%       -0.147190     -3.605900
50%       -0.006554      0.609150
75%        0.133690      4.772800
max        9.504000    235.240000
label bites, wk3-4, 0602473a


  df = df.append(df_p, ignore_index=True)


Invalid frequency: 2.000094L

p3, wk3-4, 0612150a
range_max:  15290
data.shape:  (15290, 2)
data:                   MR            ML
count  15290.000000  15290.000000
mean       0.000393      0.000207
std        0.666794      0.226058
min       -3.354800     -0.582520
25%       -0.441265     -0.203615
50%        0.002002     -0.001321
75%        0.453443      0.201080
max        3.798600      0.715390
label bites, wk3-4, 0612150a
df_p.shape:  (7645, 3)
df.shape:  (277976, 3)

p3, wk13-14, 0100002a
range_max:  490631
data.shape:  (490631, 2)
data:                    MR             ML
count  490631.000000  490631.000000
mean        0.000009      -0.000015
std         6.920259       8.027082
min      -349.910000    -433.870000
25%        -0.603855      -1.378650
50%        -0.001918       0.004328
75%         0.596505       1.406400
max       397.380000     364.570000
label bites, wk13-14, 0100002a


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (245316, 3)
df.shape:  (285621, 3)



  df = df.append(df_p, ignore_index=True)


fit model p3
save model p3
fit general model
Patient:  2
p2, wk1-2, 0107285e
range_max:  106111
data.shape:  (106111, 2)
data:                    MR             ML
count  106111.000000  106111.000000
mean       -0.000203       0.000021
std        15.168053       1.913643
min      -242.430000     -42.180000
25%        -1.764500      -0.017774
50%         0.007887      -0.000016
75%         1.793800       0.018096
max       332.330000      42.707000
label bites, wk1-2, 0107285e
df_p.shape:  (53056, 3)
df.shape:  (0, 3)

p2, wk1-2, 0123035e
range_max:  47314
data.shape:  (47314, 2)
data:                   MR            ML
count  47314.000000  47314.000000
mean      -0.000009      0.000371
std        0.213355      1.859459
min       -1.594700    -19.193000
25%       -0.026855     -0.178127
50%       -0.001603      0.008792
75%        0.019518      0.254890
max        2.998600     12.724000
label bites, wk1-2, 0123035e


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (23657, 3)
df.shape:  (53056, 3)

p2, wk1-2, 0204451e
range_max:  79454
data.shape:  (79454, 2)
data:                   MR            ML
count  79454.000000  79454.000000
mean       0.000034     -0.000065
std        1.109931      1.451143
min      -11.763000    -18.805000
25%       -0.104210     -0.108850
50%       -0.004954      0.003342
75%        0.086454      0.138738
max       14.880000     11.816000
label bites, wk1-2, 0204451e


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (39727, 3)
df.shape:  (76713, 3)

p2, wk1-2, 0213314e
range_max:  46460
data.shape:  (46460, 2)
data:                   MR            ML
count  46460.000000  46460.000000
mean      -0.000088     -0.001207
std        2.431369      1.895967
min     -123.660000    -24.868000
25%       -0.673620     -0.003952
50%        0.000939     -0.000018
75%        0.673403      0.004023
max       61.411000     21.732000
label bites, wk1-2, 0213314e


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (23230, 3)
df.shape:  (116440, 3)

p2, wk5-6, 0521390e
range_max:  54090
data.shape:  (54090, 2)
data:                   MR            ML
count  54090.000000  54089.000000
mean       0.000090      0.000011
std        0.154142      0.026851
min       -0.653410     -0.111980
25%       -0.101620     -0.017899
50%        0.000323     -0.000135
75%        0.101660      0.017639
max        2.000000      0.115830
label bites, wk5-6, 0521390e


  df = df.append(df_p, ignore_index=True)


Invalid frequency: 2.000037L

p2, wk3-4, 0315230e
range_max:  46410
data.shape:  (46410, 2)
data:                   MR            ML
count  46410.000000  46410.000000
mean       0.000068     -0.000168
std        2.282732      9.261159
min     -116.900000   -116.000000
25%       -0.675825     -0.726577
50%        0.001474      0.075343
75%        0.679740      0.748903
max       54.921000     82.100000
label bites, wk3-4, 0315230e
df_p.shape:  (23205, 3)
df.shape:  (139670, 3)

p2, wk13-14, 1722373d
range_max:  39605
'utf-8' codec can't decode byte 0x82 in position 11: invalid start byte

fit model p2


  df = df.append(df_p, ignore_index=True)


save model p2
fit general model
Patient:  1
p1, wk21-22, 2020582c
range_max:  198171
data.shape:  (198171, 2)
data:                    MR            ML
count  198171.000000  1.981650e+05
mean       -0.000005 -7.299130e-07
std         0.024706  3.007308e+00
min        -0.062052 -6.824600e+00
25%        -0.018987 -3.206000e+00
50%        -0.000096 -3.782600e-05
75%         0.018979  3.220300e+00
max         0.064868  7.067700e+00
label bites, wk21-22, 2020582c
df_p.shape:  (99082, 3)
df.shape:  (0, 3)

p1, wk1, 0901260c
range_max:  98692
data.shape:  (98692, 2)
data:                   MR            ML
count  98692.000000  98692.000000
mean      -0.008723     -0.001926
std        9.037443      4.272387
min     -117.350000    -81.854000
25%       -1.479100     -0.515870
50%        0.002658     -0.010820
75%        1.552625      0.498565
max       84.228000     95.417000
label bites, wk1, 0901260c


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (49346, 3)
df.shape:  (99082, 3)

p1, wk1, 1022102c
range_max:  38255
data.shape:  (38255, 2)
data:                   MR            ML
count  38255.000000  38255.000000
mean      -0.000004      0.000014
std        0.573150      0.928574
min      -10.111000     -6.510200
25%       -0.104775     -0.313710
50%       -0.003646      0.012859
75%        0.076491      0.454295
max       20.053000      5.087400
label bites, wk1, 1022102c


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (19128, 3)
df.shape:  (148428, 3)

p1, wk1, 1222325c
range_max:  47678
data.shape:  (47678, 2)
data:                   MR            ML
count  47678.000000  47678.000000
mean      -0.000047     -0.000013
std        0.122124      1.381749
min       -1.478200    -17.247000
25%       -0.043328     -0.574365
50%        0.002210      0.004575
75%        0.047989      0.650145
max        1.734100      8.678600
label bites, wk1, 1222325c


  df = df.append(df_p, ignore_index=True)


df_p.shape:  (23839, 3)
df.shape:  (167556, 3)

fit model p1


  df = df.append(df_p, ignore_index=True)


save model p1
fit general model
save general model
