# Data Cleaning
The issues at hand are the different methods of encoding data which must be manipulated properly.
Current process in mind is to import data as "grid"(possibly a df with header=False).
Then use said grid to visually note where to make "cuts" in how data is saved.

Setup, Importing, & splitting Data

In [6]:
#Packages
import pandas as pd # Used for data manipulation

In [7]:
# Import data
raw = pd.read_csv("../data/raw/Gym - Brandon Miner - Sheet1.csv", header=None)

# Split raw data into segments
raw1 = raw[:77]
raw2 = raw[77:215]
raw3 = raw[215:]


Baseline Cleaning of Each split

In [8]:
# Define a function for standard data processing
def process_data(df, col_names, drop_cols):
    """Cleans and standardizes data with given column names and columns to drop."""
    df = df.drop(columns=drop_cols, inplace=False)
    df.iloc[0] = col_names  # Rename columns
    df.columns = df.iloc[0]  # Set first row as header
    df = df[1:].dropna()  # Drop empty rows
    return df

In [9]:
# Process datasets
processed1 = process_data(
    raw1,
    ["Date", "Workout", "Weight1", "rep1", "Weight2", "rep2", "Weight3", "rep3"],
    range(8, 15)
)
processed2 = process_data(
    raw2, 
    ["Date", "Workout", "Weight", "Reps", "Sets"], 
    range(5, 15)
)
processed3 = process_data(
    raw3,
    ["Date", "Workout", "Weight", "rep1", "rep2", "rep3", "rep4"],
    range(7, 15)
)

In [11]:
# Additional cleaning for processed3
processed2[["Weight", "Reps", "Sets"]].replace("-", 0, inplace=True)
processed3 = processed3[processed3["Workout"] != "Bicycle Warm-up"]
processed3["rep1", "rep2", "rep3", "rep4"].replace("-", 0, inplace=True)

# Standardize numeric columns
for df in [processed1, processed2, processed3]:
    df["Date"] = pd.to_datetime(df["Date"])
    df[["rep1", "rep2", "rep3", "rep4"]] = df[["rep1", "rep2", "rep3", "rep4"]].apply(pd.to_numeric)

# Add new columns
processed1["Weight"] = processed1["Weight3"]
processed1["Reps"] = processed1["rep3"]
processed1["Sets"] = 1

processed3["Reps"] = processed3[["rep1", "rep2", "rep3", "rep4"]].mean(axis=1)
processed3["Sets"] = 4

# Drop unnecessary columns
processed1.drop(columns=["Weight1", "rep1", "Weight2", "rep2", "Weight3", "rep3"], inplace=True)
processed3.drop(columns=["rep1", "rep2", "rep3", "rep4"], inplace=True)

# Concatenate datasets
master = pd.concat([processed1, processed2, processed3], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed2[["Weight", "Reps", "Sets"]].replace("-", 0, inplace=True)


KeyError: ('rep1', 'rep2', 'rep3', 'rep4')

In [None]:
# Manual fixes
fixes = [
    (0, 85, 5),
    (6, 50, 8),
    (8, 30, 8),
    (17, None, 7),
    (18, 70, 10),
]
for idx, weight, reps in fixes:
    if weight is not None:
        master.loc[idx, "Weight"] = weight
    if reps is not None:
        master.loc[idx, "Reps"] = reps


In [None]:
master["Workout"] = master["Workout"].apply(str.strip)
master = master[master["Workout"] != "Box Jumps"]
master = master[master["Workout"] != "Stairmaster"] 

In [None]:
# Normalize workout names
replacements = {
    "Seated Dip": "Seated Dips",
    "Bench Press": "Flat Bench Press",
    "Dual Pulley Pulldown Mchn": "Pulldowns",
    "Row Mchn": "Rows",
    "Seated Dips Mchn": "Seated Dips",
    "Seated Shoulder Press Mchn": "Seated Shoulder Press",
    "Hip Abduction Mchn": "Hip Abduction",
    "Seated Calf Extention Mchn": "Seated Calf Extention",
    "Ab Curl Mchn": "Ab Curl",
    "Seated Leg Press Mchn": "Seated Leg Press",
    "Inner Thigh Mchn": "Hip Adduction",
    "Lateral Raise Mchn": "Lateral Raise",
    "Bicep Curl Mchn": "Bicep Curls",
    "Chest Press Mchn": "Chest Press",
    "Lateral Raises": "Lateral Raise",
    "Pulldown": "Pulldowns",
    "Tricep Cable Pulls": "Cable Pulls",
    "Cable Pulldowns": "Cable Pulls",
    "Ab Curl Mchn": "Ab Curls",
    "Barbell Curls (90°)": "Barbell Curls",
    "Flat Bench Smith Machine": "Smith Machine Flat Bench Press",
    "Laying Leg Curl": "Laying Leg Curls",
    "Bicep Curls": "Barbell Curls",
    "Laying Leg Curls": "Prone Leg Curls",
    "Laying Sit-Ups": "Sit-Ups",
    "Smith Machine Barbell Squats": "Smith Machine Squats",
    "Front Raises": "Dumbell Front Raises",
    "Side Raises": "Dumbell Side Raises",
    "Seated Overhead Dumbell Press": "Overhead Dumbell Press",
    "Seated Bicep Curls": "Barbell Curls",
    "Seated Overhead Press": "Overhead Press",
    "Cable Rows": "Rows",
    "Ab Curl": "Ab Curls",
    "Assisted Dips": "Dips",
    "Incline Bench Press": "Incline Bench Barbell Press",
    "Decline Bench Press": "Decline Bench Barbell Press",
    "Incline Bench Smith Machine": "Smith Machine Incline Bench Press",
    "Seated Shoulder Press": "Shoulder Press",
    "Closed Leg Press": "Leg Press",
    "Seated Leg Press": "Leg Press",
    "Decline Barbell Chest Press": "Decline Bench Barbell Press",
    "Seated Calf Extentions": "Leg Extentions",
    "Overhead Pulls": "Pulldowns",
    "Cable Pulls": "Tricep Cable Pulls",
    "Prone Leg Curl": "Prone Leg Curls",
    "Deadlift": "Barbell Deadlifts",
    "Romanian Deadlift": "Romanian Deadlifts",
    "Chest Press": "Seated Chest Press",
    "Seated Calf Extention": "Leg Extentions",
    "Flat Bench Press": "Flat Bench Barbell Press",
    "Overhead Press": "Overhead Barbell Press",
    "Shoulder Press": "Overhead Barbell Press",
    "Seated Overheard Press": "Overheard Barbell Press",
}

master["Workout"] = master["Workout"].str.title().replace(replacements)

# Drop unwanted workouts
master = master[~master["Workout"].isin(["Box Jumps", "Stairmaster"])]

# Convert columns to numeric
master[["Weight", "Reps", "Sets"]] = master[["Weight", "Reps", "Sets"]].apply(pd.to_numeric)
master[["Reps", "Sets"]] = master[["Reps", "Sets"]].round()


In [None]:
# Export to CSV
master.to_cs()

In [None]:
master

Unnamed: 0,Date,Workout,Weight,Reps,Sets
0,2024-02-19 00:00:00,Incline Bench Barbell Press,85.0,5.0,1.0
1,2024-02-19 00:00:00,Seated Dips,120.0,15.0,1.0
2,2024-02-19 00:00:00,Shoulder Press,32.5,15.0,1.0
3,2024-02-21 00:00:00,Hip Abduction,100.0,15.0,1.0
4,2024-02-21 00:00:00,Leg Press,230.0,15.0,1.0
...,...,...,...,...,...
157,2024-10-29 00:00:00,Prone Leg Curls,70.0,10.0,4.0
158,2024-10-29 00:00:00,Leg Extentions,130.0,10.0,4.0
159,2024-11-11 00:00:00,Smith Machine Flat Bench Press,135.0,10.0,4.0
160,2024-11-11 00:00:00,Smith Machine Incline Bench Press,95.0,7.0,4.0
