# <font color='orange'>Make Dataset</font>

---

In [1]:
import pandas as pd
from glob import glob

---

### Create accelerometer and gyroscope dataframes

In [2]:
# Glob pattern matching every raw MetaMotion CSV export (machine-specific absolute path).
RAW_CSV_PATTERN = "E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/raw/MetaMotion/*.csv"

# Normalise Windows path separators to '/', then sort: glob() returns matches in
# arbitrary order, and the 'set' numbers assigned by read_data_from_files depend
# on the position of each file in this list.
files = sorted(f.replace('\\', '/') for f in glob(RAW_CSV_PATTERN))

files

['E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/raw/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv',
 'E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/raw/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv',
 'E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/raw/MetaMotion/A-bench-heavy2_MetaWear_2019-01-14T14.27.00.784_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv',
 'E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/raw/MetaMotion/A-bench-heavy2_MetaWear_2019-01-14T14.27.00.784_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv',
 'E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/raw/MetaMotion/A-bench-heavy3-rpe8_MetaWear_2019-01-11T16.14.45.178_C42732BE255C_Accelerometer_12.5

In [3]:
def read_data_from_files(files):
    '''
    Read MetaMotion CSV exports into one accelerometer and one gyroscope dataframe.

    Each filename is expected to look like
    '<participant>-<label>-<category><n>[-rpe<k>]_MetaWear_..._<Sensor>_....csv'.
    Participant, label and category are parsed from the filename and added as
    columns. Files whose name contains 'Accelerometer' or 'Gyroscope' also get a
    running 'set' number, counted separately per sensor in the order the files
    are given. Files matching neither sensor are read but not included.

    Parameters
    ----------
    files : list of str
        CSV paths using '/' as the separator. Every file must contain the
        columns 'epoch (ms)', 'time (01:00)' and 'elapsed (s)'.

    Returns
    -------
    (acc_df, gyr_df) : tuple of pandas.DataFrame
        Both frames are indexed by the datetime parsed from 'epoch (ms)'
        (interpreted as milliseconds); the three raw time columns are dropped.

    Raises
    ------
    KeyError
        If no file of one sensor kind is present, because the resulting empty
        frame has no 'epoch (ms)' column.
    '''
    acc_frames = []
    gyr_frames = []

    acc_set = 1
    gyr_set = 1

    for file in files:
        file_name = file.split('/')[-1]

        # Extract features from the filename
        parts = file_name.split('-')
        participant = parts[0]
        label = parts[1]
        # Cut at the literal '_MetaWear' marker before removing the trailing set
        # digit. str.rstrip() takes a *set of characters*, not a suffix, so
        # rstrip('_MetaWear_2019') would also eat trailing letters of the
        # category itself (e.g. 'moderate' -> 'mod').
        category = parts[2].split('_MetaWear')[0].rstrip('123')

        df = pd.read_csv(file)
        df['participant'] = participant
        df['label'] = label
        df['category'] = category

        if 'Accelerometer' in file_name:
            df['set'] = acc_set
            acc_set += 1
            acc_frames.append(df)
        elif 'Gyroscope' in file_name:
            df['set'] = gyr_set
            gyr_set += 1
            gyr_frames.append(df)

    sensor_frames = []
    for frames in (acc_frames, gyr_frames):
        # Concatenate once per sensor instead of growing a frame inside the loop.
        sensor_df = pd.concat(frames, axis = 0) if frames else pd.DataFrame()

        # Use the epoch (milliseconds) as a datetime index
        sensor_df.index = pd.to_datetime(sensor_df['epoch (ms)'], unit = 'ms')

        # The remaining time columns are redundant once the index is set
        sensor_frames.append(
            sensor_df.drop(columns = ['epoch (ms)', 'time (01:00)', 'elapsed (s)'])
        )

    acc_df, gyr_df = sensor_frames
    return acc_df, gyr_df

# Load every CSV listed in `files` into one accelerometer and one gyroscope dataframe.
acc_df, gyr_df = read_data_from_files(files)

In [4]:
# Display the combined accelerometer readings (datetime index, three axes plus metadata columns).
acc_df

Unnamed: 0_level_0,x-axis (g),y-axis (g),z-axis (g),participant,label,category,set
epoch (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-11 15:10:08.431,0.010,0.964,-0.087,A,bench,heavy,1
2019-01-11 15:10:08.511,0.000,0.961,-0.069,A,bench,heavy,1
2019-01-11 15:10:08.591,0.001,0.974,-0.087,A,bench,heavy,1
2019-01-11 15:10:08.671,-0.012,0.971,-0.084,A,bench,heavy,1
2019-01-11 15:10:08.751,-0.013,0.954,-0.094,A,bench,heavy,1
...,...,...,...,...,...,...,...
2019-01-16 19:14:23.089,0.012,0.596,0.815,E,squat,heavy,94
2019-01-16 19:14:23.169,0.009,0.528,0.821,E,squat,heavy,94
2019-01-16 19:14:23.249,0.015,0.554,0.746,E,squat,heavy,94
2019-01-16 19:14:23.329,0.006,0.574,0.824,E,squat,heavy,94


In [5]:
# Display the combined gyroscope readings (datetime index, three axes plus metadata columns).
gyr_df

Unnamed: 0_level_0,x-axis (deg/s),y-axis (deg/s),z-axis (deg/s),participant,label,category,set
epoch (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-11 15:10:08.351,0.122,-5.488,-3.841,A,bench,heavy,1
2019-01-11 15:10:08.391,2.195,-9.695,-0.610,A,bench,heavy,1
2019-01-11 15:10:08.431,2.622,-8.110,-4.024,A,bench,heavy,1
2019-01-11 15:10:08.471,1.951,-4.695,-4.634,A,bench,heavy,1
2019-01-11 15:10:08.511,1.524,-2.561,-2.500,A,bench,heavy,1
...,...,...,...,...,...,...,...
2019-01-16 19:14:23.314,-1.707,-0.671,1.585,E,squat,heavy,93
2019-01-16 19:14:23.354,0.915,0.305,2.988,E,squat,heavy,93
2019-01-16 19:14:23.394,5.854,-2.561,1.463,E,squat,heavy,93
2019-01-16 19:14:23.434,4.268,0.549,-0.305,E,squat,heavy,93


---

### Merging datasets - to get the accelerometer and gyroscope values at each timestamp

In [6]:
# Only the three axis columns are taken from the accelerometer frame; the
# participant/label/category/set columns come from the gyroscope frame so that
# they appear once in the merged result.
acc_axes = acc_df.iloc[:, :3]

# Align both sensors side by side on their datetime index. A timestamp that
# exists for only one sensor leaves NaN in the other sensor's columns.
df_merged = pd.concat([acc_axes, gyr_df], axis = 1)

# Short, sensor-prefixed column names replace the unit-suffixed originals.
df_merged.columns = (
    ['acc_x', 'acc_y', 'acc_z']
    + ['gyr_x', 'gyr_y', 'gyr_z']
    + ['participant', 'label', 'category', 'set']
)

In [7]:
# Display the merged frame; rows where only one sensor has a reading contain NaN for the other.
df_merged

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyr_x,gyr_y,gyr_z,participant,label,category,set
epoch (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-11 15:08:04.950,,,,-10.671,-1.524,5.976,B,bench,heavy,30.0
2019-01-11 15:08:04.990,,,,-8.720,-2.073,3.171,B,bench,heavy,30.0
2019-01-11 15:08:05.030,,,,0.488,-3.537,-4.146,B,bench,heavy,30.0
2019-01-11 15:08:05.070,,,,0.244,-5.854,3.537,B,bench,heavy,30.0
2019-01-11 15:08:05.110,,,,-0.915,0.061,-2.805,B,bench,heavy,30.0
...,...,...,...,...,...,...,...,...,...,...
2019-01-20 17:35:13.382,-0.060,-1.021,-0.058,,,,,,,
2019-01-20 17:35:13.462,-0.035,-1.037,-0.026,,,,,,,
2019-01-20 17:35:13.542,-0.045,-1.029,-0.033,,,,,,,
2019-01-20 17:35:13.622,-0.039,-1.027,-0.039,,,,,,,


---

### Resample data (frequency conversion) - to make the accelerometer and gyroscope frequencies match

#### Hz = number of cycles per second

In [8]:
# Sensor sampling rates:
#   Accelerometer: 12.500 Hz
#   Gyroscope:     25.000 Hz

# Aggregation applied to each resampling bin: sensor axes are averaged, while
# the metadata columns keep the last value seen in the bin.
sampling = dict.fromkeys(['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z'], 'mean')
sampling.update(dict.fromkeys(['participant', 'label', 'category', 'set'], 'last'))

# Split the merged frame into one frame per calendar day and resample each day
# separately (presumably so that no 200 ms bins are generated across the gaps
# between recording days - confirm).
days = [day_frame for _, day_frame in df_merged.groupby(pd.Grouper(freq = 'D'))]

# Resampling to 200 ms bins puts both sensors on a common frequency; bins that
# still contain a missing value after aggregation are dropped.
resampled_days = [
    day_frame.resample(rule = '200ms').agg(sampling).dropna()
    for day_frame in days
]
df_resampled = pd.concat(resampled_days)

In [9]:
# Display the resampled frame (one row per 200 ms bin that has values in every column).
df_resampled

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyr_x,gyr_y,gyr_z,participant,label,category,set
epoch (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-11 15:08:05.200,0.013500,0.977000,-0.071000,-1.8904,2.4392,0.9388,B,bench,heavy,30.0
2019-01-11 15:08:05.400,-0.001500,0.970500,-0.079500,-1.6826,-0.8904,2.1708,B,bench,heavy,30.0
2019-01-11 15:08:05.600,0.001333,0.971667,-0.064333,2.5608,-0.2560,-1.4146,B,bench,heavy,30.0
2019-01-11 15:08:05.800,-0.024000,0.957000,-0.073500,8.0610,-4.5244,-2.0730,B,bench,heavy,30.0
2019-01-11 15:08:06.000,-0.028000,0.957667,-0.115000,2.4390,-1.5486,-3.6098,B,bench,heavy,30.0
...,...,...,...,...,...,...,...,...,...,...
2019-01-20 17:33:27.000,-0.048000,-1.041500,-0.076500,1.4146,-5.6218,0.2926,E,row,medium,90.0
2019-01-20 17:33:27.200,-0.037000,-1.030333,-0.053333,-2.7684,-0.5854,2.2440,E,row,medium,90.0
2019-01-20 17:33:27.400,-0.060000,-1.031000,-0.082000,2.8416,-5.1342,-0.1220,E,row,medium,90.0
2019-01-20 17:33:27.600,-0.038667,-1.025667,-0.044667,-0.2318,0.2562,1.1220,E,row,medium,90.0


In [10]:
# Summarise dtypes and non-null counts; at this point 'set' is still float64.
df_resampled.info()

# 'set' is a counter, so store it as an integer again.
df_resampled = df_resampled.astype({'set': int})

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acc_x        9009 non-null   float64
 1   acc_y        9009 non-null   float64
 2   acc_z        9009 non-null   float64
 3   gyr_x        9009 non-null   float64
 4   gyr_y        9009 non-null   float64
 5   gyr_z        9009 non-null   float64
 6   participant  9009 non-null   object 
 7   label        9009 non-null   object 
 8   category     9009 non-null   object 
 9   set          9009 non-null   float64
dtypes: float64(7), object(3)
memory usage: 774.2+ KB


---

### Export dataset

In [11]:
# Common path (without extension) of the exported interim dataset.
interim_path_stem = 'E:/PW Skills - Data Science/04_Machine_Learning/Week_ML_Projects/04_Fitness_Tracker/data/interim/01_processed_data'

# Write the same frame twice: CSV for inspection, pickle to keep dtypes and the datetime index.
df_resampled.to_csv(interim_path_stem + '.csv')
df_resampled.to_pickle(interim_path_stem + '.pkl')

---