<a href="https://colab.research.google.com/github/CH2-PS020-FitSync/CH2-PS020-ML/blob/main/model/Workout-Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    from google.colab import drive

    !pip install faker

    # gymvisual-cleaned-2.json
    !gdown "1iFDqbXWFs3CM5mgXpsN4f5NTbZdzdMiX"
    # work-hist.json (DUMMY, SHOULD RETRIEVE history & users' data FROM DATABASE)
    !gdown "1SlgWerOrAqgBdaE4Hhzb8XHVBhPKzKwj"

    workout_json = './gymvisual-cleaned-2.json'
    hist_json = './work-hist.json'
except ImportError:
    workout_json = '../data/gymvisual-cleaned-2.json'
    hist_json = '../data/work-hist.json'

Downloading...
From: https://drive.google.com/uc?id=1iFDqbXWFs3CM5mgXpsN4f5NTbZdzdMiX
To: /content/gymvisual-cleaned-2.json
100% 1.20M/1.20M [00:00<00:00, 101MB/s]
Downloading...
From: https://drive.google.com/uc?id=1SlgWerOrAqgBdaE4Hhzb8XHVBhPKzKwj
To: /content/work-hist.json
100% 789/789 [00:00<00:00, 2.52MB/s]


In [2]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (Dense, Embedding, Flatten, Input, concatenate)
from tensorflow.keras.models import Model

In [3]:
with open(workout_json, 'r') as f:
    workout_f = json.load(f)

df_workout = pd.json_normalize(workout_f)
# df_hist = pd.read_json(hist_json)

df_workout.drop(
    ['desc', 'jpg', 'gif', 'duration.desc', 'duration.min', 'duration.rep', 'duration.set', 'duration.sec'],
    axis=1, inplace=True
)

df_workout.drop(
    df_workout[df_workout.level == 'Beginner'].sample(frac=.8).index,
    inplace=True
)


In [4]:
# Dummy User

from faker import Faker
from random import choice, random, randint

gender = ['Male', 'Female']
level = ['Beginner', 'Intermediate', 'Expert']

faker = Faker()
df_user = pd.DataFrame([
    {
        'name': faker.name(),
        'gender': choice(gender),
        'weight': round(random(), 1) + randint(40, 70),
        'height': randint(150, 180),
        'age': randint(15, 30),
        'level': choice(level)
    } for _ in range(100)
])

df_hist = []

for name in df_user.name:
    user = df_user[df_user.name == name]
    u_level = level.index(user.level.values[0])
    u_gender = user.gender.values[0]

    for _ in range(randint(20, 100)):
        workout_det_level = df_workout[(df_workout.gender == 'Female') & (random() < 0.4 or df_workout.level == 'Expert')]
        workout = workout_det_level.sample(1)
        w_level = level.index(workout.level.values[0])
        diff = abs(u_level - w_level)
        rating = max(0, randint(5, 10) - (randint(3, 6) if diff > 1 else randint(2, 4) if diff else randint(0, 1)))

        df_hist.append(
            {
                'name': user.name.values[0],
                'gender': user.gender.values[0],
                'title': workout.title.values[0],
                'level': user.level.values[0],
                'rating': rating,
                'diff': diff
            }
        )

df_hist = pd.DataFrame(df_hist)
df_hist.head()

Unnamed: 0,name,gender,title,level,rating,diff
0,Sandy Thomas,Female,Two Legged Inverted Staff Pose Dwi Pada Vipari...,Expert,7,0
1,Sandy Thomas,Female,Two Legged Inverted Staff Pose Dwi Pada Vipari...,Expert,6,0
2,Sandy Thomas,Female,Scissors (advanced) (female),Expert,5,0
3,Sandy Thomas,Female,Scissors (advanced) (female),Expert,8,0
4,Sandy Thomas,Female,Static Position Seated Back with Pad (female),Expert,6,2


In [5]:
FEATURES = ['gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']
LABEL_ENCODER = dict()

In [6]:
def get_col_to_encode(*dataframes):
    cols = set()

    for dataframe in dataframes:
        dataframe_cols = dataframe.select_dtypes(exclude=[np.number])
        cols.update(dataframe_cols)

        for col in dataframe_cols:
            LABEL_ENCODER[col] = LABEL_ENCODER.get(col, LabelEncoder().fit(dataframe[col]))

    return cols

# EDA

In [7]:
df_workout.head()

Unnamed: 0,title,type,body_part,gender,level
1,Air bike,Strength,Waist,Male,Intermediate
8,Boat Stretch,Stretching,Waist,Female,Beginner
9,Bridge (on knees),Strength,Waist,Female,Beginner
15,Cat Stretch,Stretching,Back,Female,Beginner
17,Chest Dip,Strength,Chest,Male,Intermediate


In [8]:
df_workout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 537 entries, 1 to 1865
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      537 non-null    object
 1   type       537 non-null    object
 2   body_part  534 non-null    object
 3   gender     537 non-null    object
 4   level      537 non-null    object
dtypes: object(5)
memory usage: 25.2+ KB


In [9]:
print(
    df_workout.type.value_counts(),
    df_workout.body_part.value_counts()[:10],
    df_workout.gender.value_counts(),
    df_workout.level.value_counts(),
    sep='\n\n'
)

Strength      283
Stretching    154
Aerobic       100
Name: type, dtype: int64

Waist          89
Hips           79
Plyometrics    67
Back           58
Thighs         40
Chest          34
Cardio         28
Calves         24
Stretching     22
Upper Arms     18
Name: body_part, dtype: int64

Male      381
Female    156
Name: gender, dtype: int64

Beginner        332
Intermediate    124
Expert           81
Name: level, dtype: int64


# Encoding

In [10]:
def encode_hist_work(df_workout, df_hist):
    encoded_df_workout = df_workout.copy()
    encoded_df_hist = df_hist.copy()

    columns_to_encode = get_col_to_encode(encoded_df_workout, encoded_df_hist) # Inplace encode

    for col in columns_to_encode:

        if col in encoded_df_workout.columns:
            encoded_df_workout[col] = LABEL_ENCODER[col].transform(encoded_df_workout[col])

        if col in encoded_df_hist.columns:
            encoded_df_hist[col] = LABEL_ENCODER[col].transform(encoded_df_hist[col])


    return encoded_df_workout, encoded_df_hist

df_workout_copy, df_hist_copy = encode_hist_work(df_workout, df_hist)

In [11]:
for le in LABEL_ENCODER:
    print(repr(dict(zip(LABEL_ENCODER[le].classes_, LABEL_ENCODER[le].transform(LABEL_ENCODER[le].classes_))))[-200:])

Wrist Push-up (male)': 532, 'Wrist Radial Deviator And Flexor Stretch': 533, 'Wrist Ulnar Deviator And Flexor Stretch': 534, 'Yoga Vajrasana Thunderbolt Diamond Pose (male)': 535, 'ZigZag Hopes': 536}
{'Aerobic': 0, 'Strength': 1, 'Stretching': 2}
, 'Plyometrics': 21, 'Shoulders': 22, 'Shoulders, Thighs, Waist': 23, 'Stretching': 24, 'Thighs': 25, 'Upper Arms': 26, 'Upper Arms, Waist': 27, 'Waist': 28, 'Weightlifting': 29, 'Yoga': 30, None: 31}
{'Female': 0, 'Male': 1}
{'Beginner': 0, 'Expert': 1, 'Intermediate': 2}
: 90, 'Tammy Roach': 91, 'Thomas Miller': 92, 'Timothy Bass': 93, 'Timothy Johnson': 94, 'Travis Vasquez': 95, 'Victoria Reyes': 96, 'Victoria Robinson': 97, 'William Molina': 98, 'Zachary Patel': 99}


# Building Model

In [12]:
def train(workout_data, model_path, history_data=None, user_data=None):
    if history_data is not None and len(history_data.title.unique()) >= 5:
        merged_data = pd.merge(history_data, workout_data, on='title').dropna()
        X_train, X_test, Y_train, Y_test = train_test_split(merged_data[FEATURES], merged_data['rating'], test_size=0.2)
        # merged_data = merged_data.drop_duplicates(subset=['title'], keep='last')

        model = tf.keras.Sequential([
            tf.keras.layers.Dense(30, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1, activation='linear'),
        ])

        model.compile(
            loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['mse', 'mae']
        )

        history = model.fit(
            X_train, Y_train,
            epochs=20,
            validation_data=(X_test, Y_test)
        )

        loss = model.evaluate(X_test, Y_test)
        print(f"Test loss: {loss}")

        return model
    elif user_data is not None:
        pass

model = train(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: [8.214831352233887, 8.214831352233887, 2.349087953567505]


In [13]:
def predict_n(n, name, gender_workout, df_hist, df_user):
    user = df_user.copy()[df_user.name == name]
    history = df_hist.copy()[df_hist.name == name]

    gender_workout = gender_workout.copy()

    columns_to_encode = get_col_to_encode(user, gender_workout)

    for col in columns_to_encode:

        if col in user.columns:
            user[col] = LABEL_ENCODER[col].transform(user[col])

        if col in gender_workout.columns:
            gender_workout[col] = LABEL_ENCODER[col].transform(gender_workout[col])

    user_merge = pd.merge(gender_workout, user, how='cross')

    result = model.predict(user_merge[FEATURES])

    top_n_index = np.argpartition(-result[:, 0], n)[:n] # Top n max values index
    sorted_top_n_index = top_n_index[np.argsort(-result[top_n_index][:, 0])] # Sorted from max to min

    top_n_recommended = gender_workout.iloc[sorted_top_n_index]
    top_n_recommended_workout = LABEL_ENCODER['title'].inverse_transform(top_n_recommended.title)

    return top_n_recommended_workout

In [21]:
user = df_user.loc[df_user.level == 'Intermediate'].sample(1)
name = user.name.values[0]
gender_work = gender_workout = df_workout[
    (df_workout.gender == user.gender.values[0]) & (~df_workout.title.isin(df_hist[df_hist.name == name].title))
]
n = 10

top_n_prediction = predict_n(n, name, gender_work, df_hist, df_user)

top_n_prediction



array(['Backward Abdominal Stretch', 'Biceps Stretch Behind The Back',
       'Body Extension', 'Bench Dip on floor (VERSION 2)',
       'Barbell Full Squat (Back POV)', 'Armless Prayer Stretch (male)',
       'Bar Close Grip Biceps Curl', 'Bodyweight Single Leg Deadlift',
       'Warming-up in Lunge (five)', 'Bench dip on floor'], dtype=object)

In [22]:
user_hist = df_hist[df_hist.name == name]
user_hist

Unnamed: 0,name,gender,title,level,rating,diff
723,Kevin Brown,Male,Scissors (advanced) (female),Intermediate,6,1
724,Kevin Brown,Male,Frog Pose Mandukasana (female),Intermediate,8,1
725,Kevin Brown,Male,Lying Leg Raise to Side (female),Intermediate,4,1
726,Kevin Brown,Male,Two Legged Inverted Staff Pose Dwi Pada Vipari...,Intermediate,8,1
727,Kevin Brown,Male,Air Bike (VERSION 2) (female),Intermediate,3,1
...,...,...,...,...,...,...
790,Kevin Brown,Male,Lying Simultaneous Alternating Leg Raise,Intermediate,6,1
791,Kevin Brown,Male,Hip External Rotator Stretch,Intermediate,5,1
792,Kevin Brown,Male,Reaching Up Shoulder Stretch,Intermediate,5,1
793,Kevin Brown,Male,Hip Swirls (female),Intermediate,5,1


In [23]:
pd.merge(df_workout.loc[df_workout.title.isin(user_hist.title)], user_hist, on='title').sort_values('rating')

Unnamed: 0,title,type,body_part,gender_x,level_x,name,gender_y,level_y,rating,diff
70,Standing Foot Muscles Activation (female),Strength,Feet,Female,Beginner,Kevin Brown,Male,Intermediate,1,1
30,Starfish Crunch (advanced) (female),Strength,Waist,Female,Expert,Kevin Brown,Male,Intermediate,1,1
35,Starfish Crunch (advanced) (female),Strength,Waist,Female,Expert,Kevin Brown,Male,Intermediate,2,1
39,Scissors (advanced) (female),Strength,Waist,Female,Expert,Kevin Brown,Male,Intermediate,2,1
22,Air Bike (VERSION 2) (female),Strength,Waist,Female,Expert,Kevin Brown,Male,Intermediate,2,1
...,...,...,...,...,...,...,...,...,...,...
59,Frog Pose Mandukasana (female),Stretching,Stretching,Female,Beginner,Kevin Brown,Male,Intermediate,8,1
12,Standing Toe Flexor Stretch,Stretching,Calves,Female,Beginner,Kevin Brown,Male,Intermediate,8,1
9,Standing Hip Adduction Stretch,Stretching,Hips,Female,Beginner,Kevin Brown,Male,Intermediate,8,1
60,Two Legged Inverted Staff Pose Dwi Pada Vipari...,Stretching,Stretching,Female,Expert,Kevin Brown,Male,Intermediate,8,1


In [24]:
df_prediction = gender_work.set_index('title').loc[top_n_prediction].reset_index()
df_prediction

Unnamed: 0,title,type,body_part,gender,level
0,Backward Abdominal Stretch,Stretching,Waist,Male,Beginner
1,Biceps Stretch Behind The Back,Stretching,Upper Arms,Male,Beginner
2,Body Extension,Stretching,Thighs,Male,Beginner
3,Bench Dip on floor (VERSION 2),Strength,Upper Arms,Male,Expert
4,Barbell Full Squat (Back POV),Strength,Thighs,Male,Expert
5,Armless Prayer Stretch (male),Stretching,Stretching,Male,Beginner
6,Bar Close Grip Biceps Curl,Strength,Upper Arms,Male,Beginner
7,Bodyweight Single Leg Deadlift,Strength,Thighs,Male,Intermediate
8,Warming-up in Lunge (five),Stretching,"Back, Hips, Thighs",Male,Beginner
9,Bench dip on floor,Strength,Upper Arms,Male,Beginner


# AutoEncoder

workout_dataset:

`workout_id	title	workout_type	body_part	gender	level`

user_dataset:

`user_id	name	gender	weight	height	age	level`

In [18]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def auto(workout_data, model_path, history_data=None, user_data=None):
    merged_data = pd.merge(history_data, workout_data, on='title')[FEATURES]
    X_train, X_valid = train_test_split(merged_data, test_size=0.2, random_state=42)

    input_dim = len(merged_data.columns)
    encoding_dim = 32

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    decoder = Dense(input_dim, activation="sigmoid")(encoder)

    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')

    autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_valid, X_valid))

    # Encoder for rec
    encoder_model = tf.keras.models.Model(inputs=input_layer, outputs=encoder)

    return encoder_model

In [19]:
encoder_model = auto(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
dummy_user = df_user.copy()[df_user.name == df_hist.name[0]]
dummy_gender_workout = df_workout_copy[
    (df_workout_copy.gender == df_hist_copy.gender[0])
]

for col in columns_to_encode:

    if col in dummy_user.columns:
        dummy_user[col] = le[col].transform(dummy_user[col])


dummy_user_merge = pd.merge(dummy_gender_workout, dummy_user, how='cross')
dummy_user_merge.head()

NameError: ignored

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_workouts(model, user_data, workout_data, k=5):
    workout_representations = model.predict(workout_data)
    user_representation = model.predict(user_data)

    print("user_representation shape:", user_representation.shape)
    print("workout_representations shape:", workout_representations.shape)

    # Calculate cosine similarity between the user and all workouts
    similarities = cosine_similarity(user_representation.reshape(1, -1), workout_representations)

    # Get top k similar workouts's idx
    similar_workout_indices = np.argsort(similarities[0])[::-1][:k]

    # Get workout data top k similar workouts
    similar_workouts = workout_data.iloc[similar_workout_indices]

    return similar_workouts

In [None]:
name = le['name'].inverse_transform([df_hist_copy.iloc[0][0]])[0]
test_user = df_user.copy()[df_user.name == name]
col_encode = list(test_user.select_dtypes(exclude=[np.number]))

for col in col_encode:
    test_user[col] = le[col].transform(test_user[col])

test_user

In [None]:
# Find similar workouts in latent space
similar_workouts = find_similar_workouts(encoder_model, test_user, dummy_user_merge[features], k=5)