# 数据预处理

### 数据对齐


In [1]:
import pandas as pd
import numpy as np
import re
import glob
import os
import time
from copy import deepcopy
from tqdm import tqdm
from datetime import datetime, timedelta
from angle_calculator import angle_cal_frame

In [2]:
def time_in_period(time1, time2, n_time):
    """Return whether ``n_time`` lies in the closed interval ``[time1, time2]``.

    Both end points are included, as the interval notation of the original
    comment states. The previous strict comparison excluded them, so a time
    that coincided exactly with a boundary matched no interval at all.

    Args:
        time1: Lower bound of the interval.
        time2: Upper bound of the interval.
        n_time: Value to test; must be comparable with the bounds.

    Returns:
        ``True`` when ``time1 <= n_time <= time2``, otherwise ``False``.
    """
    return time1 <= n_time <= time2

def get_start_time(optical_data):
    """Parse the start time stored in the top-left cell of ``optical_data``.

    Args:
        optical_data: DataFrame whose cell at row 0, column 0 holds a string
            formatted as ``'%Y-%m-%d %I.%M.%S %p'`` (12-hour clock with an
            AM/PM marker, dots between hour, minute and second).

    Returns:
        The parsed ``datetime``.
    """
    first_cell = optical_data.iloc[0, 0]
    return datetime.strptime(first_cell, '%Y-%m-%d %I.%M.%S %p')

### sensor_data 写入时间戳

In [3]:
def sensor_data_processing(sensor_data, start_time=None,
                           time_offset=timedelta(minutes=23, seconds=45)):
    """Timestamp every sensor row and parse its readings into floats.

    A row whose first space-separated token starts with ``[`` is an
    *anchor*: the token carries a ``[HH:MM:SS.ffffff]`` time.  The last row
    index is always added as a final anchor, even without a time of its own.
    Rows strictly between two anchors get linearly interpolated times, except
    in the final segment (after the second-to-last anchor), where each row
    advances by the mean per-row interval measured from the first anchor to
    the second-to-last anchor.

    Args:
        sensor_data: DataFrame whose first column holds the raw sensor line
            as a string; bracketed/parenthesised/braced parts are stripped
            and the rest is split on commas (the piece after the last comma
            is discarded).
        start_time: ``datetime`` whose year, month and day are stamped onto
            every parsed time.  When omitted, the module-level
            ``start_time`` is used, which is how the original
            implementation obtained it.
        time_offset: ``timedelta`` added to every parsed sensor time before
            the date is applied (the original comment calls this "time
            alignment"; the origin of the 23 min 45 s default is not
            visible in this file).

    Returns:
        ``(sensor_data_list, pair)`` where ``sensor_data_list[i]`` is the
        list of floats parsed from row ``i`` and ``pair[i]`` is
        ``[timestamp_i, timestamp_{i+1}]`` for consecutive rows.

    Raises:
        NameError: ``start_time`` was not passed and no module-level
            ``start_time`` exists.
        ValueError: the first row carries no timestamp, or there are too few
            timestamped rows to measure the mean interval.
    """
    if start_time is None:
        # Backward compatibility with callers that set a module-level global.
        try:
            start_time = globals()['start_time']
        except KeyError:
            raise NameError(
                "name 'start_time' is not defined; pass start_time=... "
                "or define it at module level"
            ) from None

    rows = sensor_data.values.tolist()
    last_index = len(rows) - 1

    # Raw strings: '\[' in a plain literal is an invalid escape sequence.
    bracket_re = re.compile(r'\[(.*?)\]', re.S)
    strip_re = re.compile(r'\(.*?\)|\{.*?}|\[.*?]')

    timestamp = {}
    key = []
    for i, row in enumerate(rows):
        token = row[0].split(' ')[0]
        if not token.startswith('['):
            continue
        parsed = datetime.strptime(bracket_re.findall(token)[0], '%H:%M:%S.%f')
        # The offset is applied before the date is overwritten, so a time
        # pushed past midnight keeps start_time's date.
        timestamp[i] = (parsed + time_offset).replace(
            year=start_time.year, month=start_time.month, day=start_time.day
        )
        key.append(i)

    if not key or key[0] != 0:
        raise ValueError(
            'the first sensor row must start with a "[HH:MM:SS.ffffff]" timestamp'
        )
    if key[-1] != last_index:
        key.append(last_index)
    if len(key) < 2 or key[-2] == key[0]:
        raise ValueError(
            'not enough timestamped sensor rows to estimate the mean row interval'
        )

    # Mean per-row interval, used to fill the final segment.
    mean_step = (timestamp[key[-2]] - timestamp[key[0]]) / (key[-2] - key[0])

    # Every segment except the last: interpolate between its two anchors.
    for left, right in zip(key[:-2], key[1:-1]):
        step = (timestamp[right] - timestamp[left]) / (right - left)
        for offset, j in enumerate(range(left + 1, right), start=1):
            timestamp[j] = timestamp[left] + step * offset

    # Final segment: advance by the mean interval.  A last row that carries
    # its own time keeps it; a last row without one is extrapolated too
    # (previously it was skipped because its index had been added to `key`).
    tail = key[-2]
    for offset, j in enumerate(range(tail + 1, key[-1] + 1), start=1):
        if j in timestamp:
            continue
        timestamp[j] = timestamp[tail] + mean_step * offset

    ordered = [timestamp[i] for i in sorted(timestamp)]
    pair = [[earlier, later] for earlier, later in zip(ordered, ordered[1:])]

    # Parse the readings: drop the bracketed parts, split on commas and
    # discard the piece after the final comma.
    sensor_data_list = []
    for row in rows:
        fields = strip_re.sub('', row[0]).split(',')[:-1]
        sensor_data_list.append([float(field.strip()) for field in fields])

    return sensor_data_list, pair

### optical_data 写入时间戳

In [4]:
def optical_data_processing(optical_data, sensor_data_list, pair, is_save=False,
                            save_path=None):
    """Write timestamps, joint angles and matched sensor readings into ``optical_data``.

    The frame is modified in place: its first row (index label 0, which
    holds the start time) and every row containing NaN are dropped, and the
    index is reset.  For each remaining row ``i >= 1`` the function

    * converts the ``Time`` value (seconds) into an absolute timestamp
      string relative to the start time,
    * stores the nine angle columns returned by
      ``angle_cal_frame(optical_data, i - 1)``,
    * copies the five readings of the first sensor interval in ``pair``
      that ``time_in_period`` accepts for the row's timestamp.

    NOTE(review): the loop starts at row 1, so row 0 of the cleaned frame is
    left untouched, and the angles written to row ``i`` come from frame
    ``i - 1``.  Whether this is intended depends on ``angle_cal_frame``,
    which is not visible here -- confirm before changing.

    Args:
        optical_data: DataFrame of the optical capture with a ``Time``
            column; cell (0, 0) must hold the start time string.
        sensor_data_list: Per-row sensor readings; ``sensor_data_list[k]``
            is used for the interval ``pair[k]``.
        pair: List of ``[start, end]`` timestamp pairs.
        is_save: When true, write the result to CSV.
        save_path: Destination CSV path.  When omitted, it is derived from
            the module-level ``optical_csv_path`` by replacing
            ``'learning'`` with ``'processing'``, as the original
            implementation did.

    Returns:
        The same ``optical_data`` object, after modification.
    """
    angle_columns = (
        'AA_SN_X', 'AA_SN_Y', 'AA_SN_Z',
        'GH_AA_X', 'GH_AA_Y', 'GH_AA_Z',
        'GH_SN_X', 'GH_SN_Y', 'GH_SN_Z',
    )
    sensor_columns = ('sensor_0', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4')

    num_sample = optical_data.shape[0]
    # Must be read before row 0 is dropped below.
    start_time = get_start_time(optical_data)
    optical_data.drop([0], inplace=True)
    optical_data.dropna(inplace=True)
    optical_data.reset_index(drop=True, inplace=True)
    print(f'清洗数据：({optical_data.shape[0]}/{num_sample})')

    for i in tqdm(range(1, optical_data.shape[0])):
        row_time = timedelta(seconds=optical_data.loc[i, 'Time']) + start_time
        # Replace the relative time with an absolute timestamp string.
        optical_data.loc[i, 'Time'] = row_time.strftime('%Y-%m-%d %H:%M:%S.%f')

        # Angle columns (created on first assignment, in this order).
        angle_result = angle_cal_frame(optical_data, i - 1)
        for column in angle_columns:
            optical_data.loc[i, column] = angle_result[column].to_numpy()[0]

        # Copy the readings of the first matching sensor interval, if any.
        for idx, period in enumerate(pair):
            if time_in_period(period[0], period[1], row_time):
                for channel, column in enumerate(sensor_columns):
                    optical_data.loc[i, column] = sensor_data_list[idx][channel]
                break

    if is_save:
        if save_path is None:
            # Legacy behaviour: rely on the path set by the driver loop.
            save_path = optical_csv_path.replace('learning', 'processing')
        optical_data.to_csv(save_path)
        print(f'处理结束，保存路径:{save_path}')

    return optical_data

## Main

In [8]:
# Process every file under ./data/learning whose name matches "*_s*"
# together with the optical file obtained by deleting "_s" from its path.
sensor_csv_paths = glob.glob('./data/learning/*_s*')
for sensor_csv_path in sensor_csv_paths:
    # optical_csv_path is also read as a module-level name by
    # optical_data_processing when it derives the save location.
    optical_csv_path = re.sub('[_][s]', '', sensor_csv_path)
    print(f'正在处理 {sensor_csv_path} & {optical_csv_path}')
    print('=' * 20)

    # The sensor file has no header row; the optical file does.
    optical_data = pd.read_csv(optical_csv_path)
    sensor_data = pd.read_csv(sensor_csv_path, header=None)

    # start_time is read as a module-level name by sensor_data_processing,
    # so it has to be assigned before that call.
    start_time = get_start_time(optical_data)
    sensor_data_list, pair = sensor_data_processing(sensor_data)
    optical_data = optical_data_processing(
        optical_data, sensor_data_list, pair, is_save=True
    )

  1%|          | 5/401 [00:00<00:08, 48.42it/s]

正在处理 ./data/learning/Waizhan_s.csv & ./data/learning/Waizhan.csv
清洗数据：(402/3750)


100%|██████████| 401/401 [00:07<00:00, 51.73it/s]
  0%|          | 5/2253 [00:00<00:47, 47.68it/s]

处理结束，保存路径:./data/processing/Waizhan.csv
正在处理 ./data/learning/WaizhanElevation_s.csv & ./data/learning/WaizhanElevation.csv
清洗数据：(2254/4342)


100%|██████████| 2253/2253 [00:40<00:00, 55.57it/s]
  1%|          | 6/596 [00:00<00:10, 56.72it/s]

处理结束，保存路径:./data/processing/WaizhanElevation.csv
正在处理 ./data/learning/WaizhanElevationForward_s.csv & ./data/learning/WaizhanElevationForward.csv
清洗数据：(597/4323)


100%|██████████| 596/596 [00:10<00:00, 56.08it/s]
  0%|          | 6/3710 [00:00<01:07, 54.66it/s]

处理结束，保存路径:./data/processing/WaizhanElevationForward.csv
正在处理 ./data/learning/InitialData_s.csv & ./data/learning/InitialData.csv
清洗数据：(3711/3713)


100%|██████████| 3710/3710 [01:05<00:00, 56.91it/s]
  1%|          | 6/966 [00:00<00:16, 59.46it/s]

处理结束，保存路径:./data/processing/InitialData.csv
正在处理 ./data/learning/Forward_s.csv & ./data/learning/Forward.csv
清洗数据：(967/3713)


100%|██████████| 966/966 [00:16<00:00, 57.62it/s]
  0%|          | 6/2018 [00:00<00:36, 55.40it/s]

处理结束，保存路径:./data/processing/Forward.csv
正在处理 ./data/learning/Depression_s.csv & ./data/learning/Depression.csv
清洗数据：(2019/3720)


100%|██████████| 2018/2018 [00:36<00:00, 54.98it/s]
  0%|          | 6/3706 [00:00<01:11, 51.46it/s]

处理结束，保存路径:./data/processing/Depression.csv
正在处理 ./data/learning/Backward_s.csv & ./data/learning/Backward.csv
清洗数据：(3707/3737)


100%|██████████| 3706/3706 [01:11<00:00, 52.02it/s]
  0%|          | 6/3739 [00:00<01:04, 57.85it/s]

处理结束，保存路径:./data/processing/Backward.csv
正在处理 ./data/learning/HoushenBackward_s.csv & ./data/learning/HoushenBackward.csv
清洗数据：(3740/3741)


100%|██████████| 3739/3739 [01:13<00:00, 51.17it/s]
  3%|▎         | 6/206 [00:00<00:03, 51.23it/s]

处理结束，保存路径:./data/processing/HoushenBackward.csv
正在处理 ./data/learning/Houshen_s.csv & ./data/learning/Houshen.csv
清洗数据：(207/3748)


100%|██████████| 206/206 [00:03<00:00, 52.72it/s]
  0%|          | 4/4386 [00:00<01:50, 39.80it/s]

处理结束，保存路径:./data/processing/Houshen.csv
正在处理 ./data/learning/Qianqu_s.csv & ./data/learning/Qianqu.csv
清洗数据：(4387/5382)


100%|██████████| 4386/4386 [01:18<00:00, 56.04it/s]
  0%|          | 6/3609 [00:00<01:00, 60.00it/s]

处理结束，保存路径:./data/processing/Qianqu.csv
正在处理 ./data/learning/Elevation_s.csv & ./data/learning/Elevation.csv
清洗数据：(3610/3746)


100%|██████████| 3609/3609 [01:01<00:00, 59.04it/s]
  0%|          | 6/3762 [00:00<01:02, 59.69it/s]

处理结束，保存路径:./data/processing/Elevation.csv
正在处理 ./data/learning/QianquForward_s.csv & ./data/learning/QianquForward.csv
清洗数据：(3763/3803)


100%|██████████| 3762/3762 [01:03<00:00, 59.71it/s]


处理结束，保存路径:./data/processing/QianquForward.csv
