# Определение активности субъекта

## Чтение данных с одного субъекта

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
# from skimage import transform

In [2]:
# Names for every column of a subject file, in file order: three leading
# columns, then the same 17 IMU channels repeated for each sensor placement.
col_names = ['timestamp', 'activityID', 'heart rate'] + [
    f'{channel} {placement}'
    for placement in ('hand', 'chest', 'ankle')
    for channel in (
        'temperature',
        '3Da_x scale_16', '3Da_y scale_16', '3Da_z scale_16',
        '3Da_x scale_6', '3Da_y scale_6', '3Da_z scale_6',
        '3Dg_x', '3Dg_y', '3Dg_z',
        '3Dm_x', '3Dm_y', '3Dm_z',
        'orientation_0', 'orientation_1', 'orientation_2', 'orientation_3',
    )
]

In [3]:
# Subset of col_names that is kept for analysis: the activity label, heart
# rate and, per sensor placement, temperature plus the scale_16 accelerometer,
# gyroscope and magnetometer axes (no timestamp, scale_6 or orientation).
good_cols = ['activityID', 'heart rate'] + [
    f'{channel} {placement}'
    for placement in ('hand', 'chest', 'ankle')
    for channel in (
        'temperature',
        '3Da_x scale_16', '3Da_y scale_16', '3Da_z scale_16',
        '3Dg_x', '3Dg_y', '3Dg_z',
        '3Dm_x', '3Dm_y', '3Dm_z',
    )
]

In [4]:
def get_good_data(fname, delete_zero_activity = True):
    """Read one subject file and keep only the columns listed in good_cols.

    Args:
        fname: Path of a space-separated data file without a header row; its
            columns are named according to col_names.
        delete_zero_activity: When true, rows whose activityID equals 0 are
            dropped from the result.

    Returns:
        pd.DataFrame restricted to good_cols (optionally without the
        activityID == 0 rows).
    """
    raw = pd.read_csv(fname, sep = ' ', names = col_names)
    kept = raw[good_cols]
    if not delete_zero_activity:
        return kept
    return kept[kept.activityID != 0]

In [5]:
# Load the recording of subject 101; get_good_data drops activityID == 0 rows by default.
subj_fname = 'PAMAP2_Dataset/Protocol/subject101.dat'
df = get_good_data(subj_fname)

In [6]:
# Peek at one randomly chosen row of the loaded table.
df.sample()

Unnamed: 0,activityID,heart rate,temperature hand,3Da_x scale_16 hand,3Da_y scale_16 hand,3Da_z scale_16 hand,3Dg_x hand,3Dg_y hand,3Dg_z hand,3Dm_x hand,...,temperature ankle,3Da_x scale_16 ankle,3Da_y scale_16 ankle,3Da_z scale_16 ankle,3Dg_x ankle,3Dg_y ankle,3Dg_z ankle,3Dm_x ankle,3Dm_y ankle,3Dm_z ankle
187363,12,,33.8125,-18.0909,3.28924,-1.66602,-3.09525,2.10417,4.34945,54.2716,...,34.9375,11.4464,-2.80516,-0.190104,-0.098531,-0.419308,-1.22948,-40.1482,9.47885,53.1066


### Выделяем активности

In [7]:
def data_fill_na(data):
    """Return a copy of ``data`` with missing values filled down each column.

    Gaps are first linearly interpolated along the rows. Anything still
    missing afterwards (for example values at the edges of a column) is
    back-filled from the next valid value and then forward-filled from the
    previous valid value.

    Args:
        data: pd.DataFrame of numeric columns; it is not modified.

    Returns:
        A new pd.DataFrame of the same shape with the gaps filled.
    """
    data_nonans = data.interpolate(axis = 0, method='linear')
    # bfill()/ffill() are the supported spelling of the deprecated
    # fillna(method='bfill'/'ffill') calls.
    return data_nonans.bfill(axis = 0).ffill(axis = 0)

In [8]:
def get_activity(data, activityID, with_fill = True):
    """Extract the signals recorded for one activity as a 2-D array.

    Args:
        data: pd.DataFrame with an ``activityID`` column; it is left untouched.
        activityID: Label of the activity whose rows are selected.
        with_fill: When true, missing values of the selected rows are filled
            with data_fill_na before conversion.

    Returns:
        np.ndarray with one row per remaining column (``activityID`` itself is
        removed) and one column per selected sample.
    """
    is_wanted = data.activityID == activityID
    channels = data.loc[is_wanted].drop(columns='activityID')
    if with_fill:
        channels = data_fill_na(channels)
    # Transpose so that the first axis runs over channels, not samples.
    return np.array(channels).T

In [9]:
# Number of samples in the first channel returned for activityID 5.
len(get_activity(df, 5)[0])

21265

### Собираем один DataFrame

In [10]:
def get_df(subject_ids = range(1, 10)):
    """Build one table with a row per (subject, activity) recording.

    Every subject file is loaded with get_good_data and split by activity
    with get_activity, so each sensor cell of a row holds the whole 1-D signal
    of that activity for that subject.

    Args:
        subject_ids: Numbers ``i`` used to form the file name
            ``PAMAP2_Dataset/Protocol/subject10{i}.dat``. Defaults to 1..9.

    Returns:
        pd.DataFrame with columns good_cols (object dtype) and a fresh
        0..n-1 index; ``activityID`` holds the label, the other columns hold
        arrays.
    """
    rows = []
    for i in tqdm(subject_ids):
        subj_fname = f'PAMAP2_Dataset/Protocol/subject10{i}.dat'
        subj_df = get_good_data(subj_fname)
        # Take the activity list from the subject being processed rather than
        # from an unrelated module-level frame.
        for act in subj_df['activityID'].unique().tolist():
            signals = list(get_activity(subj_df, act))
            if len(signals[0]):
                rows.append([act] + signals)

    # One construction at the end avoids repeated concatenation and already
    # yields a consecutive integer index.
    return pd.DataFrame(rows, columns=good_cols, dtype = object)

In [11]:
# Assemble the per-activity table for all subjects and show one random row.
all_df = get_df()
all_df.sample()

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:27<00:00,  3.02s/it]


Unnamed: 0,activityID,heart rate,temperature hand,3Da_x scale_16 hand,3Da_y scale_16 hand,3Da_z scale_16 hand,3Dg_x hand,3Dg_y hand,3Dg_z hand,3Dm_x hand,...,temperature ankle,3Da_x scale_16 ankle,3Da_y scale_16 ankle,3Da_z scale_16 ankle,3Dg_x ankle,3Dg_y ankle,3Dg_z ankle,3Dm_x ankle,3Dm_y ankle,3Dm_z ankle
83,12,"[95.0, 95.0, 95.0, 95.0, 95.0, 95.0, 95.0, 95....","[34.375, 34.375, 34.375, 34.375, 34.375, 34.37...","[-9.57112, -9.68663, -9.46283, -9.49661, -9.53...","[-1.68575, -1.79903, -1.83939, -1.83809, -1.76...","[1.05703, 1.0172, 0.943713, 1.05866, 0.903824,...","[-0.0138066, -0.00408345, -0.0367714, -0.03786...","[0.0332071, -0.00593475, 0.0217486, -0.0167512...","[-0.01224, -0.012541, -0.018759, -0.0191805, 0...","[40.0712, 40.8207, 41.2038, 41.0672, 41.1921, ...",...,"[34.3125, 34.3125, 34.3125, 34.3125, 34.3125, ...","[9.59919, 9.67683, 9.60288, 9.67923, 9.60469, ...","[-0.269223, -0.156434, -0.194249, -0.119107, -...","[-2.46298, -2.3857, -2.34764, -2.30873, -2.269...","[0.0142413, -0.035898, -0.0187516, -0.00029558...","[0.00970597, 0.011012, 0.0390591, 0.00420348, ...","[-0.0301642, -0.00192537, 0.0172885, 0.011496,...","[-29.8306, -29.3119, -29.2017, -30.0841, -29.5...","[5.29258, 4.7509, 5.64704, 4.91732, 5.0214, 4....","[-11.0422, -11.3234, -11.1822, -11.61, -11.324..."


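Сохранение в CSV не работает: ячейки таблицы содержат массивы, которые при записи превращаются в строки, поэтому после чтения исходные сигналы не восстанавливаются.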

In [95]:
# `adf` exists only inside get_df(); the assembled table is bound to all_df here.
all_df.to_csv("PAMAP2_Dataset/protocol_all_subj.csv", index_label='Index')

In [96]:
# NOTE(review): this rebinds all_df to the result of the CSV round trip. The
# cells of the in-memory table hold arrays, which presumably come back as
# plain strings -- confirm before running the cells below on this frame.
all_df = pd.read_csv("PAMAP2_Dataset/protocol_all_subj.csv", index_col='Index')

### Смотрим на длину активностей

In [12]:
def sep_by_len(df, min_act_len, ret_min_len = False) -> (pd.DataFrame, np.ndarray):
    """Select the recordings that are at least ``min_act_len`` samples long.

    The length of a recording is the length of its ``heart rate`` cell.

    Args:
        df: Table with an ``activityID`` column and a ``heart rate`` column
            whose cells are sequences.
        min_act_len: Minimum length a recording needs to be selected.
        ret_min_len: Chooses what is returned (see below).

    Returns:
        If ``ret_min_len`` is false, the selected rows of ``df``.
        Otherwise an int64 array with one entry per distinct activity, in
        order of first appearance in ``df``: the shortest selected length for
        that activity, or -1 when none of its recordings was selected.
    """
    activity_order = df['activityID'].unique().tolist()
    slot_of = {act: pos for pos, act in enumerate(activity_order)}
    shortest = np.full(len(activity_order), -1, dtype = np.int64)
    kept_rows = []

    for row_pos, (act, signal) in enumerate(zip(df['activityID'], df['heart rate'])):
        n_samples = len(signal)
        if n_samples < min_act_len:
            continue
        kept_rows.append(row_pos)
        slot = slot_of[act]
        # -1 marks "nothing recorded yet" for this activity.
        if shortest[slot] == -1 or n_samples < shortest[slot]:
            shortest[slot] = n_samples

    return shortest if ret_min_len else df.iloc[kept_rows]

Минимальные длины

In [13]:
# With a threshold of 0 every recording is selected, so this yields the
# shortest recording length per activity.
lens = sep_by_len(all_df, 0, ret_min_len=True)
lens

array([22044, 12282, 20533, 23573, 20037, 10389,  9655, 22253, 20265,
       20486,     1,   256], dtype=int64)

Выбираем записи активностей длиной не менее 20000 отсчётов

In [14]:
# Keep only the recordings with at least 20000 samples and summarise them.
activities = sep_by_len(all_df, 20000)
activities.describe()

Unnamed: 0,activityID,heart rate,temperature hand,3Da_x scale_16 hand,3Da_y scale_16 hand,3Da_z scale_16 hand,3Dg_x hand,3Dg_y hand,3Dg_z hand,3Dm_x hand,...,temperature ankle,3Da_x scale_16 ankle,3Da_y scale_16 ankle,3Da_z scale_16 ankle,3Dg_x ankle,3Dg_y ankle,3Dg_z ankle,3Dm_x ankle,3Dm_y ankle,3Dm_z ankle
count,64,64,64,64,64,64,64,64,64,64,...,64,64,64,64,64,64,64,64,64,64
unique,9,64,64,64,64,64,64,64,64,64,...,64,64,64,64,64,64,64,64,64,64
top,1,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100...","[30.375, 30.375, 30.375, 30.375, 30.375, 30.37...","[2.2153, 2.29196, 2.2909, 2.218, 2.30106, 2.07...","[8.27915, 7.67288, 7.1424, 7.14365, 7.25857, 7...","[5.58753, 5.74467, 5.82342, 5.8993, 6.09259, 6...","[-0.00475004, -0.17171, -0.238241, -0.192912, ...","[0.0375788, 0.0254788, 0.0112136, 0.0190534, -...","[-0.011145, -0.00953821, 0.000830722, 0.013374...","[8.932, 9.583, 9.05516, 9.92698, 9.15626, 8.60...",...,"[30.75, 30.75, 30.75, 30.75, 30.75, 30.75, 30....","[9.73855, 9.69762, 9.69633, 9.6637, 9.77578, 9...","[-1.84761, -1.88438, -1.92203, -1.84714, -1.88...","[0.0951561, -0.0208042, -0.0591734, 0.0943855,...","[0.00290826, 0.020882, -0.0353922, -0.0325136,...","[-0.0277138, 0.000944724, -0.0524217, -0.01884...","[0.00175228, 0.00600704, -0.00488214, 0.026949...","[-61.1081, -60.8916, -60.3407, -60.7646, -60.2...","[-36.8636, -36.3197, -35.7842, -37.1028, -37.1...","[-58.3696, -58.3656, -58.6119, -57.8799, -57.8..."
freq,8,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [15]:
def cut_act(df, cut_len, count=-1, random_start = False) -> pd.DataFrame:
    '''
    Cut every recording in ``df`` into consecutive, non-overlapping windows.

    The first column of ``df`` is treated as the label and copied to each
    window; every other cell is a sequence that is sliced. The length of a
    recording is the length of its ``heart rate`` cell.

    Args:
        df: Table with one recording per row (label first, then signals).
        cut_len: Window length in samples; must be positive. Recordings
            shorter than ``cut_len`` produce no windows.
        count: Maximum number of windows taken from one recording; a negative
            value (default -1) means as many as fit.
        random_start: If true, the first window starts at a random offset
            chosen so that all requested windows still fit in the recording;
            otherwise windows start at offset 0.

    Returns:
        pd.DataFrame with the columns of ``df`` (object dtype), one row per
        window and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``cut_len`` is not positive.
    '''
    if cut_len <= 0:
        raise ValueError('cut_len must be a positive integer')

    windows = []
    for _, row in df.iterrows():
        total = len(row['heart rate'])
        n_fit = total // cut_len
        n_take = n_fit if count < 0 else min(count, n_fit)
        if n_take == 0:
            continue
        # Samples left over once n_take windows are reserved; the random
        # offset may use at most this much so the last window stays in range.
        slack = total - n_take * cut_len
        start = np.random.randint(0, slack + 1) if random_start else 0
        label, signals = row.iloc[0], row.iloc[1:]
        for w in range(n_take):
            lo = start + w * cut_len
            windows.append([label] + [sig[lo:lo + cut_len] for sig in signals])

    # Building the frame once replaces the removed DataFrame.append and
    # already gives a consecutive integer index.
    return pd.DataFrame(windows, columns=list(df.columns), dtype = object)

In [16]:
# Cut the selected recordings into 20000-sample windows and check the length
# of the first signal cell of the first window.
cut_df = cut_act(activities, 20000)
# Explicit positional access; integer keys on a labelled Series are deprecated.
len(cut_df.iloc[0, 1])

20000

In [81]:
# NOTE(review): normalize_df is not defined in the visible part of this
# notebook, and the recorded run of this cell ends in a ValueError.
n_df = normalize_df(cut_df)
n_df.sample()

[1, 2, 3, 17, 16, 4, 7, 6, 5, 1, 2, 3, 17, 16, 4, 7, 6, 1, 2, 3, 17, 16, 4, 1, 2, 3, 17, 16, 4, 7, 6, 1, 2, 3, 17, 16, 4, 7, 6, 5, 1, 2, 3, 17, 16, 4, 7, 6, 5, 1, 3, 17, 16, 4, 7, 6, 1, 2, 3, 17, 16, 4, 7, 6]


ValueError: could not broadcast input array from shape (1,1,64) into shape (64,1,20000)