1.
Download dataset and put in → data/raw

2.
Open VS Code workspace

3.
Create a new Conda environment from the environment.yml file

Make sure to also activate the environment in your VS Code workspace (I forgot to show this in the video)

4.
Open file src/data/make_dataset.py

5.
Understanding the CSV files (measurement, participant, exercise, intensity)

6.
Understanding the data transformation (supervised learning)

7.
Terminology — sets, reps, intensity

8.
Read single CSV file

9.
Extract features from filename

10.
Read all files

11.
Working with datetimes

12.
Creating a custom function

13.
Merge datasets

14.
Resample data (frequency conversion)

15.
Export intermediate dataset

In [2]:
!unzip /content/MetaMotion.zip

Archive:  /content/MetaMotion.zip
   creating: MetaMotion/
  inflating: __MACOSX/._MetaMotion   
  inflating: MetaMotion/B-ohp-heavy2-rpe7_MetaWear_2019-01-11T16.42.43.398_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv  
  inflating: __MACOSX/MetaMotion/._B-ohp-heavy2-rpe7_MetaWear_2019-01-11T16.42.43.398_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv  
  inflating: MetaMotion/A-squat-heavy_MetaWear_2019-01-15T20.09.06.903_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv  
  inflating: __MACOSX/MetaMotion/._A-squat-heavy_MetaWear_2019-01-15T20.09.06.903_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv  
  inflating: MetaMotion/C-squat-heavy_MetaWear_2019-01-15T20.11.55.634_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv  
  inflating: __MACOSX/MetaMotion/._C-squat-heavy_MetaWear_2019-01-15T20.11.55.634_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv  
  inflating: MetaMotion/E-ohp-heavy_MetaWear_2019-01-14T14.49.46.484_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv  
  inflating: __MACOSX/MetaMotion/._E-ohp-he

In [3]:
import pandas as pd
from glob import glob

# --------------------------------------------------------------
# Read single CSV file
# --------------------------------------------------------------
f = pd.read_csv("/content/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv")

# --------------------------------------------------------------
# List all data in data/raw/MetaMotion
# --------------------------------------------------------------
from os import listdir
from os.path import isfile, join
mypath = "/content/MetaMotion"
onlyfiles = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]
filnames = [f.split("/")[-1][0:-4] for f in listdir(mypath) if isfile(join(mypath, f))]
print(onlyfiles)

['/content/MetaMotion/A-bench-heavy2_MetaWear_2019-01-14T14.27.00.784_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv', '/content/MetaMotion/A-bench-heavy2-rpe8_MetaWear_2019-01-11T16.10.08.270_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv', '/content/MetaMotion/C-bench-heavy1_MetaWear_2019-01-14T14.29.37.418_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv', '/content/MetaMotion/A-ohp-heavy3-rpe7_MetaWear_2019-01-11T16.44.00.801_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv', '/content/MetaMotion/A-ohp-heavy_MetaWear_2019-01-14T14.53.06.282_C42732BE255C_Accelerometer_12.500Hz_1.4.4.csv', '/content/MetaMotion/A-bench-heavy2_MetaWear_2019-01-14T14.27.00.784_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv', '/content/MetaMotion/A-rest-standing_MetaWear_2019-01-18T18.25.39.382_C42732BE255C_Gyroscope_25.000Hz_1.4.41.csv', '/content/MetaMotion/A-squat-medium3-rpe7_MetaWear_2019-01-11T17.19.34.896_C42732BE255C_Gyroscope_25.000Hz_1.4.4.csv', '/content/MetaMotion/E-squat-heavy_MetaWear_2019-01-15T20.09.0

In [4]:
# --------------------------------------------------------------
# Extract features from filename
# Extract participant, excercise label and set intensity from filename
# Read all files
# --------------------------------------------------------------
df_acc = pd.DataFrame()
df_gyro = pd.DataFrame()
acc_ind = 1
gyro_ind = 1


for f in onlyfiles:
  temp = f.split("/")[-1][:-4].split("_")
  sensor = temp[-3]
  temp = temp[0].split("-")
  participant = temp[0]
  excercise = temp[1]
  intensity = temp[2]

  df = pd.read_csv(f)
  df["participant"] = participant
  df["excercise"] = excercise
  df["intensity"] = intensity

  if sensor == "Gyroscope":
    df["ind"] = gyro_ind
    gyro_ind += 1
    df_gyro = pd.concat([df_gyro, df])
  else:
    df["ind"] = acc_ind
    acc_ind += 1
    df_acc = pd.concat([df_acc, df])

In [5]:
# --------------------------------------------------------------
# Working with datetimes
# --------------------------------------------------------------
df_acc.index = pd.to_datetime(df_acc['epoch (ms)'], unit='ms')
df_gyro.index = pd.to_datetime(df_gyro['epoch (ms)'], unit='ms')

del df_acc["epoch (ms)"]
del df_acc['elapsed (s)']
del df_acc['time (01:00)']

del df_gyro["epoch (ms)"]
del df_gyro['elapsed (s)']
del df_gyro['time (01:00)']

In [6]:
# --------------------------------------------------------
# Turn into function
# --------------------------------------------------------------
from os import listdir
from os.path import isfile, join
mypath = "/content/MetaMotion"
onlyfiles = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]

def df_from_files(onlyfiles):
  df_acc = pd.DataFrame()
  df_gyro = pd.DataFrame()
  acc_ind = 1
  gyro_ind = 1


  for f in onlyfiles:
    temp = f.split("/")[-1][:-4].split("_")
    sensor = temp[-3]
    temp = temp[0].split("-")
    participant = temp[0]
    excercise = temp[1]
    intensity = temp[2]

    df = pd.read_csv(f)
    df["participant"] = participant
    df["excercise"] = excercise
    df["intensity"] = intensity


    if sensor == "Gyroscope":
      df["ind"] = gyro_ind
      gyro_ind += 1
      df_gyro = pd.concat([df_gyro, df])
    else:
      df["ind"] = acc_ind
      acc_ind += 1
      df_acc = pd.concat([df_acc, df])

  df_acc.index = pd.to_datetime(df_acc['epoch (ms)'], unit='ms')
  df_gyro.index = pd.to_datetime(df_gyro['epoch (ms)'], unit='ms')

  del df_acc["epoch (ms)"]
  del df_acc['elapsed (s)']
  del df_acc['time (01:00)']

  del df_gyro["epoch (ms)"]
  del df_gyro['elapsed (s)']
  del df_gyro['time (01:00)']

  return df_gyro, df_acc

df_gyro, df_acc = df_from_files(onlyfiles)

In [26]:
print(df_gyro.shape, df_acc.shape)

(47218, 7) (23578, 7)


In [48]:
# --------------------------------------------------------------
# Merging datasets
# --------------------------------------------------------------
df = pd.concat([df_acc.iloc[:,0:3], df_gyro], axis = 1)

#  Since most data is not going to be synced to the millisecond since, the sensos are not synced
df.dropna() # We see that most data is lost

# Change column names
df.columns = [
    "acc_x",
    "acc_y",
    'acc_z',
    "gyro_x",
    "gyro_y",
    "gyro_z",
    "participant",
    "excercise",
    "intensity",
    "ind"
]

display(df.head())


Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,participant,excercise,intensity,ind
epoch (ms),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-11 15:08:04.950,,,,-10.671,-1.524,5.976,B,bench,heavy1,28.0
2019-01-11 15:08:04.990,,,,-8.72,-2.073,3.171,B,bench,heavy1,28.0
2019-01-11 15:08:05.030,,,,0.488,-3.537,-4.146,B,bench,heavy1,28.0
2019-01-11 15:08:05.070,,,,0.244,-5.854,3.537,B,bench,heavy1,28.0
2019-01-11 15:08:05.110,,,,-0.915,0.061,-2.805,B,bench,heavy1,28.0


In [67]:
# --------------------------------------------------------------
# Resample data (frequency conversion)
# --------------------------------------------------------------

# Accelerometer:    12.500HZ
# Gyroscope:        25.000Hz

# Create a function for resampling
def resampling(df):
  sampling_rule = {"acc_x":"mean",
                   "acc_y":"mean",
                   'acc_z':"mean",
                   "gyro_x":"mean",
                   "gyro_y":"mean",
                   "gyro_z":"mean",
                   "participant":"last",
                   "excercise":"last",
                   "intensity":"last",
                   "ind":"last"
                   }
  # Split the data by days
  days = [g for n, g in df.groupby(pd.Grouper(freq="D"))]
  df = pd.concat([day.resample(rule="200ms").apply(sampling_rule).dropna() for day in days])
  return(df)

df_resampled = resampling(df)
df_resampled.info()
# Changing ind to int

df_resampled["ind"] = df_resampled["ind"].astype("int")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acc_x        9009 non-null   float64
 1   acc_y        9009 non-null   float64
 2   acc_z        9009 non-null   float64
 3   gyro_x       9009 non-null   float64
 4   gyro_y       9009 non-null   float64
 5   gyro_z       9009 non-null   float64
 6   participant  9009 non-null   object 
 7   excercise    9009 non-null   object 
 8   intensity    9009 non-null   object 
 9   ind          9009 non-null   float64
dtypes: float64(7), object(3)
memory usage: 774.2+ KB


In [69]:
# --------------------------------------------------------------
# Export dataset
# --------------------------------------------------------------
df_resampled.info()
df_resampled.to_pickle("01_data_processed.pkl")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2019-01-11 15:08:05.200000 to 2019-01-20 17:33:27.800000
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   acc_x        9009 non-null   float64
 1   acc_y        9009 non-null   float64
 2   acc_z        9009 non-null   float64
 3   gyro_x       9009 non-null   float64
 4   gyro_y       9009 non-null   float64
 5   gyro_z       9009 non-null   float64
 6   participant  9009 non-null   object 
 7   excercise    9009 non-null   object 
 8   intensity    9009 non-null   object 
 9   ind          9009 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 774.2+ KB
