<a href="https://colab.research.google.com/github/CH2-PS020-FitSync/CH2-PS020-ML/blob/main/model/Workout-Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    from google.colab import drive

    # gymvisual-cleaned-2.json
    !gdown "1K1rc0tBiqdSs7ZUkSZTkUCnixvsaF0Dx"
    # work-hist.json (DUMMY, SHOULD RETRIEVE history & users' data FROM DATABASE)
    !gdown "1SlgWerOrAqgBdaE4Hhzb8XHVBhPKzKwj"

    workout_json = './gymvisual-cleaned-2.json'
    hist_json = './work-hist.json'
except ImportError:
    workout_json = '../data/gymvisual-cleaned-2.json'
    hist_json = '../data/work-hist.json'

In [2]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (Dense, Embedding, Flatten, Input, concatenate)
from tensorflow.keras.models import Model

In [3]:
with open(workout_json, 'r') as f:
    workout_f = json.load(f)

df_workout = pd.json_normalize(workout_f)
# df_hist = pd.read_json(hist_json)

df_workout.drop(
    ['desc', 'jpg', 'gif', 'duration.desc', 'duration.min', 'duration.rep', 'duration.set', 'duration.sec'],
    axis=1, inplace=True
)

In [4]:
from faker import Faker
from random import choice, random, randint

gender = ['Male', 'Female']
level = ['Beginner', 'Intermediate', 'Expert']

faker = Faker()
df_user = pd.DataFrame([
    {
        'name': faker.name(),
        'gender': choice(gender),
        'weight': round(random(), 1) + randint(40, 70),
        'height': randint(150, 180),
        'age': randint(15, 30),
        'level': choice(level)
    } for _ in range(100)
])

df_hist = []

for _ in range(100):
    user = df_user.sample(1)
    u_level = level.index(user.level.values[0])
    u_gender = user.gender.values[0]

    for _ in range(randint(20, 100)):
        workout = df_workout[df_workout.gender == u_gender].sample(1)
        w_level = level.index(workout.level.values[0])
        diff = abs(u_level - w_level)
        rating = randint(5, 10) - (randint(2, 5) if diff > 1 else randint(1, 3) if diff else randint(0, 1))

        df_hist.append(
            {
                'name': user.name.values[0],
                'gender': user.gender.values[0],
                'title': workout.title.values[0],
                'level': user.level.values[0],
                'rating': rating
            }
        )

df_hist = pd.DataFrame(df_hist)
df_hist.head()

Unnamed: 0,name,gender,title,level,rating
0,Nancy Flores,Female,Side Bridge with Straight Legs (female),Intermediate,6
1,Nancy Flores,Female,Hanging Leg Hip Raise (female),Intermediate,3
2,Nancy Flores,Female,Lunge with Leg Lift (female),Intermediate,5
3,Nancy Flores,Female,Reverse Shoulder Stretch,Intermediate,5
4,Nancy Flores,Female,Single Leg Bodyweight Deadlift with Arm and Le...,Intermediate,7


# EDA

In [5]:
df_workout.head()

Unnamed: 0,title,type,body_part,gender,level
0,3/4 Sit-up,Strength,Waist,Male,Beginner
1,Air bike,Strength,Waist,Male,Intermediate
2,Air Twisting Crunch,Strength,Waist,Female,Beginner
3,Alternate Heel Touchers,Strength,Waist,Male,Beginner
4,Alternate Lying Floor Leg Raise,Strength,Waist,Female,Beginner


In [6]:
df_workout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1866 entries, 0 to 1865
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      1866 non-null   object
 1   type       1865 non-null   object
 2   body_part  1856 non-null   object
 3   gender     1866 non-null   object
 4   level      1866 non-null   object
dtypes: object(5)
memory usage: 73.0+ KB


In [7]:
print(
    df_workout.type.value_counts(),
    df_workout.body_part.value_counts()[:10],
    df_workout.gender.value_counts(),
    df_workout.level.value_counts(),
    sep='\n\n'
)

type
Strength      969
Stretching    633
Aerobic       263
Name: count, dtype: int64

body_part
Waist          338
Hips           291
Back           181
Thighs         168
Plyometrics    150
Stretching     102
Cardio          88
Calves          88
Chest           84
Upper Arms      69
Name: count, dtype: int64

gender
Male      1200
Female     666
Name: count, dtype: int64

level
Beginner        1661
Intermediate     124
Expert            81
Name: count, dtype: int64


# Merged Encoding 

In [8]:
df_workout_copy = df_workout.copy()
df_hist_copy = df_hist.copy()

In [9]:
pd.merge(df_hist_copy, df_workout_copy, on='title')

Unnamed: 0,name,gender_x,title,level_x,rating,type,body_part,gender_y,level_y
0,Nancy Flores,Female,Side Bridge with Straight Legs (female),Intermediate,6,Strength,Waist,Female,Beginner
1,Kristen Carpenter,Female,Side Bridge with Straight Legs (female),Intermediate,8,Strength,Waist,Female,Beginner
2,Wayne Davis,Female,Side Bridge with Straight Legs (female),Expert,2,Strength,Waist,Female,Beginner
3,Nancy Flores,Female,Hanging Leg Hip Raise (female),Intermediate,3,Strength,"Hips, Waist",Female,Beginner
4,Wayne Davis,Female,Hanging Leg Hip Raise (female),Expert,4,Strength,"Hips, Waist",Female,Beginner
...,...,...,...,...,...,...,...,...,...
5974,Joanna Frazier,Male,Jumping Jack High Knee (male),Beginner,6,Aerobic,Plyometrics,Male,Beginner
5975,Joanna Frazier,Male,Lying Toe Touch,Beginner,8,Strength,Waist,Male,Beginner
5976,Joanna Frazier,Male,Seated Toe Flexor And Foot Everter Stretch,Beginner,7,Stretching,Calves,Male,Beginner
5977,Joanna Frazier,Male,Hip Circles Stretch (male),Beginner,10,Stretching,Hips,Male,Beginner


In [10]:
# df_workout_copy.loc[
#     df_workout_copy.title.isin(
#         ['3/4 Sit-up', 'Alternate Heel Touchers', 'Bench Dip (knees bent)', 'One Arm Dip', 'Overhead Triceps Stretch']
#     )
# ] # When using work-hist.json

In [11]:
workout_col = list(df_workout_copy.select_dtypes(exclude=[np.number]))
hist_col = list(df_hist_copy.select_dtypes(exclude=[np.number]))

columns_to_encode = set(workout_col + hist_col)
le = {
    col: LabelEncoder().fit(df_workout_copy[col]
                            if col in df_workout_copy.columns
                            else df_hist_copy[col]) for col in columns_to_encode
}

for col in columns_to_encode:

    if col in df_workout_copy.columns:
        df_workout_copy[col] = le[col].transform(df_workout_copy[col])

    if col in df_hist_copy.columns:
        df_hist_copy[col] = le[col].transform(df_hist_copy[col])

In [12]:
pd.merge(df_hist_copy, df_workout_copy, on='title')

Unnamed: 0,name,gender_x,title,level_x,rating,type,body_part,gender_y,level_y
0,46,0,1356,2,6,1,50,0,0
1,37,0,1356,2,8,1,50,0,0
2,58,0,1356,1,2,1,50,0,0
3,46,0,601,2,3,1,39,0,0
4,58,0,601,1,4,1,39,0,0
...,...,...,...,...,...,...,...,...,...
5974,26,1,745,0,6,0,42,1,0
5975,26,1,974,0,8,1,50,1,0
5976,26,1,1308,0,7,2,17,1,0
5977,26,1,645,0,10,2,34,1,0


In [13]:
# df_workout_copy.loc[
#     df_workout_copy.title.isin(
#         [6, 40, 108, 1025, 1045]
#     )
# ] # When using work-hist.json

# Building Model

In [24]:
def train(workout_data, model_path, history_data=None, user_data=None):
    if history_data is not None and len(history_data.title.unique()) >= 5:
        features = ['name', 'gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']

        merged_data = pd.merge(history_data, workout_data, on='title').dropna()
        X_train, X_test, Y_train, Y_test = train_test_split(merged_data[features], merged_data['rating'], test_size=0.2)
        # merged_data = merged_data.drop_duplicates(subset=['title'], keep='last')

        model = tf.keras.Sequential([
            tf.keras.layers.Dense(30, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1, activation='linear'),
        ])

        model.compile(
            loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['mse', 'mae']
        )

        history = model.fit(
            X_train, Y_train,
            epochs=10,
            validation_data=(X_test, Y_test)
        )

        loss = model.evaluate(X_test)
        print(f"Test loss: {loss}")

        return model
    elif user_data is not None:
        pass

In [25]:
model = train(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: [0.0, 0.0, 0.0]


In [69]:
df_hist_copy.iloc[0]

name        46
gender       0
title     1356
level        2
rating       6
Name: 0, dtype: int64

In [53]:
features = ['name', 'gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']

test_user = df_hist_copy.iloc[0]
encoded_name = df_hist_copy[0]
gender = df_hist_copy[1]

name = le['name'].inverse_transform([encoded_name])[0]

history = df_hist[df_hist.name == name]

dummy_user = df_user.copy()[df_user.name == name]
dummy_gender_workout = df_workout_copy[
    (df_workout_copy.gender == gender) & (~df_workout.title.isin(history.title))
]

dummy_gender_workout.info()

<class 'pandas.core.frame.DataFrame'>
Index: 591 entries, 2 to 1863
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   title      591 non-null    int32
 1   type       591 non-null    int32
 2   body_part  591 non-null    int32
 3   gender     591 non-null    int32
 4   level      591 non-null    int32
dtypes: int32(5)
memory usage: 16.2 KB


In [54]:
for col in columns_to_encode:

    if col in dummy_user.columns:
        dummy_user[col] = le[col].transform(dummy_user[col])

dummy_user_merge = pd.merge(dummy_gender_workout, dummy_user, how='cross')
dummy_user_merge.shape

(591, 11)

In [57]:
result = model.predict(dummy_user_merge[features])
result[:10]



array([[3.9074173],
       [4.049181 ],
       [4.64514  ],
       [3.9456964],
       [3.946216 ],
       [3.3078923],
       [3.9560914],
       [2.6677046],
       [3.959209 ],
       [1.8160149]], dtype=float32)

In [58]:
result_df = pd.DataFrame(result)
result_df.value_counts()

1.677377    1
4.334143    1
4.303591    1
4.304660    1
4.306899    1
           ..
3.815433    1
3.815952    1
3.820630    1
3.821004    1
5.610106    1
Name: count, Length: 591, dtype: int64

In [61]:
df_hist[df_hist.name == name]

Unnamed: 0,name,gender,title,level,rating
0,Nancy Flores,Female,Side Bridge with Straight Legs (female),Intermediate,6
1,Nancy Flores,Female,Hanging Leg Hip Raise (female),Intermediate,3
2,Nancy Flores,Female,Lunge with Leg Lift (female),Intermediate,5
3,Nancy Flores,Female,Reverse Shoulder Stretch,Intermediate,5
4,Nancy Flores,Female,Single Leg Bodyweight Deadlift with Arm and Le...,Intermediate,7
...,...,...,...,...,...
81,Nancy Flores,Female,Cross Twisting Sit-up (VERSION 2) (female),Intermediate,9
82,Nancy Flores,Female,Curl-up (female),Intermediate,8
83,Nancy Flores,Female,Swimmer Kicks (VERSION 2) (female),Intermediate,6
84,Nancy Flores,Female,Bodyweight Standing Close-grip One Arm Row (fe...,Intermediate,7


In [65]:
df_workout[df_workout.title == 'Bench Pull-ups']

Unnamed: 0,title,type,body_part,gender,level
728,Bench Pull-ups,Strength,Back,Male,Expert


In [63]:
le['title'].inverse_transform([result.argmax()])

array(['Bench Pull-ups'], dtype=object)

In [None]:
check = df_hist[df_hist.name == df_hist.name[0]].sort_values('title')
check_work = df_workout[df_workout.title.isin(check.title)].sort_values('title')

In [None]:
df_workout.iloc[result.argmax()]

# AutoEncoder

workout_dataset:

`workout_id	title	workout_type	body_part	gender	level`

user_dataset:

`user_id	name	gender	weight	height	age	level`

In [90]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def auto(workout_data, model_path, history_data=None, user_data=None):
    features = ['name', 'gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']

    merged_data = pd.merge(history_data, workout_data, on='title')[features]
    X_train, X_valid = train_test_split(merged_data, test_size=0.2, random_state=42)

    input_dim = len(merged_data.columns)
    encoding_dim = 32

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    decoder = Dense(input_dim, activation="sigmoid")(encoder)

    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')

    autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_valid, X_valid))

    # Encoder for rec
    encoder_model = tf.keras.models.Model(inputs=input_layer, outputs=encoder)

    return encoder_model

In [91]:
encoder_model = auto(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [92]:
features = ['name', 'gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']

dummy_user = df_user.copy()[df_user.name == df_hist.name[0]]
dummy_gender_workout = df_workout_copy[
    (df_workout_copy.gender == df_hist_copy.gender[0])
]

for col in columns_to_encode:

    if col in dummy_user.columns:
        dummy_user[col] = le[col].transform(dummy_user[col])


dummy_user_merge = pd.merge(dummy_gender_workout, dummy_user, how='cross')
dummy_user_merge.head()

Unnamed: 0,title,type,body_part,gender_x,level_x,name,gender_y,weight,height,age,level_y
0,34,1,50,0,0,40,0,53.0,178,19,0
1,46,1,50,0,0,40,0,53.0,178,19,0
2,140,1,50,0,0,40,0,53.0,178,19,0
3,143,2,50,0,0,40,0,53.0,178,19,0
4,239,1,50,0,0,40,0,53.0,178,19,0


In [93]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_workouts(model, user_data, workout_data, k=5):
    workout_representations = model.predict(workout_data)
    user_representation = model.predict(user_data)

    print("user_representation shape:", user_representation.shape)
    print("workout_representations shape:", workout_representations.shape)

    # Calculate cosine similarity between the user and all workouts
    similarities = cosine_similarity(user_representation.reshape(1, -1), workout_representations)

    # Get top k similar workouts's idx
    similar_workout_indices = np.argsort(similarities[0])[::-1][:k]

    # Get workout data top k similar workouts
    similar_workouts = workout_data.iloc[similar_workout_indices]

    return similar_workouts

In [105]:
name = le['name'].inverse_transform([df_hist_copy.iloc[0][0]])[0]
test_user = df_user[df_user.name == name]
col_encode = list(test_user.select_dtypes(exclude=[np.number]))

for col in col_encode:
    test_user[col] = le[col].transform(test_user[col])

test_user

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_user[col] = le[col].transform(test_user[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_user[col] = le[col].transform(test_user[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_user[col] = le[col].transform(test_user[col])


Unnamed: 0,name,gender,weight,height,age,level
93,40,0,53.0,178,19,0


In [106]:
# Find similar workouts in latent space
similar_workouts = find_similar_workouts(encoder_model, test_user, dummy_user_merge[features], k=5)



ValueError: in user code:

    File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\src\engine\training.py", line 2341, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\src\engine\training.py", line 2327, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\src\engine\training.py", line 2315, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\src\engine\training.py", line 2283, in predict_step
        return self(x, training=False)
    File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_14" is incompatible with the layer: expected shape=(None, 8), found shape=(None, 6)
