<a href="https://colab.research.google.com/github/CH2-PS020-FitSync/CH2-PS020-ML/blob/main/model/Workout-Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    from google.colab import drive

    # gymvisual-cleaned-2.json
    !gdown "1K1rc0tBiqdSs7ZUkSZTkUCnixvsaF0Dx"
    # work-hist.json (DUMMY, SHOULD RETRIEVE history & users' data FROM DATABASE)
    !gdown "1SlgWerOrAqgBdaE4Hhzb8XHVBhPKzKwj"

    workout_json = './gymvisual-cleaned-2.json'
    hist_json = './work-hist.json'
except ImportError:
    workout_json = '../data/gymvisual-cleaned-2.json'
    hist_json = '../data/work-hist.json'

In [2]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (Dense, Embedding, Flatten, Input, concatenate)
from tensorflow.keras.models import Model

In [3]:
with open(workout_json, 'r') as f:
    workout_f = json.load(f)

df_workout = pd.json_normalize(workout_f)
# df_hist = pd.read_json(hist_json)

df_workout.drop(
    ['desc', 'jpg', 'gif', 'duration.desc', 'duration.min', 'duration.rep', 'duration.set', 'duration.sec'],
    axis=1, inplace=True
)

In [4]:
# Dummy User

from faker import Faker
from random import choice, random, randint

gender = ['Male', 'Female']
level = ['Beginner', 'Intermediate', 'Expert']

faker = Faker()
df_user = pd.DataFrame([
    {
        'name': faker.name(),
        'gender': choice(gender),
        'weight': round(random(), 1) + randint(40, 70),
        'height': randint(150, 180),
        'age': randint(15, 30),
        'level': choice(level)
    } for _ in range(100)
])

df_hist = []

for name in df_user.name:
    user = df_user[df_user.name == name]
    u_level = level.index(user.level.values[0])
    u_gender = user.gender.values[0]

    for _ in range(randint(20, 100)):
        workout = df_workout[df_workout.gender == u_gender].sample(1)
        w_level = level.index(workout.level.values[0])
        diff = abs(u_level - w_level)
        rating = randint(5, 10) - (randint(2, 5) if diff > 1 else randint(1, 3) if diff else randint(0, 1))

        df_hist.append(
            {
                'name': user.name.values[0],
                'gender': user.gender.values[0],
                'title': workout.title.values[0],
                'level': user.level.values[0],
                'rating': rating
            }
        )

df_hist = pd.DataFrame(df_hist)
df_hist.head()

Unnamed: 0,name,gender,title,level,rating
0,Charles Taylor,Female,Half Frog Pose Ardha Bhekasana (female),Intermediate,9
1,Charles Taylor,Female,Sumo Squat,Intermediate,6
2,Charles Taylor,Female,Chaturanga Dandasana (Four Limbed Staff Pose) ...,Intermediate,4
3,Charles Taylor,Female,Static Position Lying Back with Pad (female),Intermediate,7
4,Charles Taylor,Female,Lying (prone) Abdominal Stretch,Intermediate,9


In [5]:
FEATURES = ['gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']
LABEL_ENCODER = dict()

In [6]:
def get_col_to_encode(*dataframes):
    cols = set()

    for dataframe in dataframes:
        dataframe_cols = dataframe.select_dtypes(exclude=[np.number])
        cols.update(dataframe_cols)

        for col in dataframe_cols:
            LABEL_ENCODER[col] = LABEL_ENCODER.get(col, LabelEncoder().fit(dataframe[col]))
    
    return cols

# EDA

In [7]:
df_workout.head()

Unnamed: 0,title,type,body_part,gender,level
0,3/4 Sit-up,Strength,Waist,Male,Beginner
1,Air bike,Strength,Waist,Male,Intermediate
2,Air Twisting Crunch,Strength,Waist,Female,Beginner
3,Alternate Heel Touchers,Strength,Waist,Male,Beginner
4,Alternate Lying Floor Leg Raise,Strength,Waist,Female,Beginner


In [8]:
df_workout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1866 entries, 0 to 1865
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      1866 non-null   object
 1   type       1865 non-null   object
 2   body_part  1856 non-null   object
 3   gender     1866 non-null   object
 4   level      1866 non-null   object
dtypes: object(5)
memory usage: 73.0+ KB


In [9]:
print(
    df_workout.type.value_counts(),
    df_workout.body_part.value_counts()[:10],
    df_workout.gender.value_counts(),
    df_workout.level.value_counts(),
    sep='\n\n'
)

type
Strength      969
Stretching    633
Aerobic       263
Name: count, dtype: int64

body_part
Waist          338
Hips           291
Back           181
Thighs         168
Plyometrics    150
Stretching     102
Cardio          88
Calves          88
Chest           84
Upper Arms      69
Name: count, dtype: int64

gender
Male      1200
Female     666
Name: count, dtype: int64

level
Beginner        1661
Intermediate     124
Expert            81
Name: count, dtype: int64


# Encoding 

In [10]:
def encode_hist_work(df_workout, df_hist):
    encoded_df_workout = df_workout.copy()
    encoded_df_hist = df_hist.copy()

    columns_to_encode = get_col_to_encode(encoded_df_workout, encoded_df_hist) # Inplace encode
    
    for col in columns_to_encode:

        if col in encoded_df_workout.columns:
            encoded_df_workout[col] = LABEL_ENCODER[col].transform(encoded_df_workout[col])

        if col in encoded_df_hist.columns:
            encoded_df_hist[col] = LABEL_ENCODER[col].transform(encoded_df_hist[col])


    return encoded_df_workout, encoded_df_hist
    
df_workout_copy, df_hist_copy = encode_hist_work(df_workout, df_hist)

In [11]:
for le in LABEL_ENCODER:
    print(repr(dict(zip(LABEL_ENCODER[le].classes_, LABEL_ENCODER[le].transform(LABEL_ENCODER[le].classes_))))[-200:])

Wrist Ulnar Deviator And Extensor Stretch': 1861, 'Wrist Ulnar Deviator And Flexor Stretch': 1862, 'X Drill (male)': 1863, 'Yoga Vajrasana Thunderbolt Diamond Pose (male)': 1864, 'ZigZag Hopes': 1865}
{'Aerobic': 0, 'Strength': 1, 'Stretching': 2, None: 3}
'Shoulders': 43, 'Shoulders, Thighs, Waist': 44, 'Stretching': 45, 'Thighs': 46, 'Thighs, Waist': 47, 'Upper Arms': 48, 'Upper Arms, Waist': 49, 'Waist': 50, 'Weightlifting': 51, 'Yoga': 52, None: 53}
{'Female': 0, 'Male': 1}
{'Beginner': 0, 'Expert': 1, 'Intermediate': 2}
': 90, 'Terri Khan': 91, 'Thomas Phillips': 92, 'Tiffany Baker': 93, 'Travis Haynes': 94, 'Tricia Franklin': 95, 'Tyler Jimenez': 96, 'Victor Mccarthy': 97, 'William Miller': 98, 'William Weaver': 99}


# Building Model

In [12]:
def train(workout_data, model_path, history_data=None, user_data=None):
    if history_data is not None and len(history_data.title.unique()) >= 5:
        merged_data = pd.merge(history_data, workout_data, on='title').dropna()
        X_train, X_test, Y_train, Y_test = train_test_split(merged_data[FEATURES], merged_data['rating'], test_size=0.2)
        # merged_data = merged_data.drop_duplicates(subset=['title'], keep='last')

        model = tf.keras.Sequential([
            tf.keras.layers.Dense(30, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1, activation='linear'),
        ])

        model.compile(
            loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['mse', 'mae']
        )

        history = model.fit(
            X_train, Y_train,
            epochs=20,
            validation_data=(X_test, Y_test)
        )

        loss = model.evaluate(X_test, Y_test)
        print(f"Test loss: {loss}")

        return model
    elif user_data is not None:
        pass

model = train(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: [6.970854759216309, 6.970854759216309, 2.1306352615356445]


In [30]:
def predict_n(n, name, df_workout, df_hist, df_user):
    user = df_user.copy()[df_user.name == name]
    history = df_hist.copy()[df_hist.name == name]

    gender_workout = df_workout.copy()[
        (df_workout.gender == user.gender.values[0]) & (~df_workout.title.isin(history.title))
    ]

    columns_to_encode = get_col_to_encode(user, gender_workout)

    for col in columns_to_encode:

        if col in user.columns:
            user[col] = LABEL_ENCODER[col].transform(user[col])

        if col in gender_workout.columns:
            gender_workout[col] = LABEL_ENCODER[col].transform(gender_workout[col])

    user_merge = pd.merge(gender_workout, user, how='cross')

    result = model.predict(user_merge[FEATURES])

    top_n_index = np.argpartition(-result[:, 0], n)[:n] # Top n max values index
    sorted_top_n_index = top_n_index[np.argsort(-result[top_n_index][:, 0])] # Sorted from max to min

    top_n_recommended = gender_workout.iloc[sorted_top_n_index]
    top_n_recommended_workout = LABEL_ENCODER['title'].inverse_transform(top_n_recommended.title)

    return top_n_recommended_workout

In [37]:
name = df_user.iloc[:1].name.values[0]
n = 10

top_n_prediction = predict_n(n, name, df_workout, df_hist, df_user)

top_n_prediction



array(['Wrist Ulnar Deviator And Flexor Stretch',
       'Wrist Ulnar Deviator And Extensor Stretch',
       'Wrist Radial Deviator And Flexor Stretch',
       'Wrist Radial Deviator And Extensor Stretch',
       'Wrist Flexor Stretch', 'Wrist Extensor Stretch',
       'Wide legged Forward Bend Prasarita Padottanasana',
       'Warrior Pose I Virabhadrasana I', 'Warrior Pose II (female)',
       'Warrior III Pose (female)'], dtype=object)

In [43]:
user_hist = df_hist[df_hist.name == name]
user_hist

Unnamed: 0,name,gender,title,level,rating
0,Charles Taylor,Female,Half Frog Pose Ardha Bhekasana (female),Intermediate,9
1,Charles Taylor,Female,Sumo Squat,Intermediate,6
2,Charles Taylor,Female,Chaturanga Dandasana (Four Limbed Staff Pose) ...,Intermediate,4
3,Charles Taylor,Female,Static Position Lying Back with Pad (female),Intermediate,7
4,Charles Taylor,Female,Lying (prone) Abdominal Stretch,Intermediate,9
...,...,...,...,...,...
63,Charles Taylor,Female,Triceps Dip (female),Intermediate,2
64,Charles Taylor,Female,Burpee (female),Intermediate,5
65,Charles Taylor,Female,Seated Rotation Stretch (female),Intermediate,8
66,Charles Taylor,Female,Sitting Bent Over Back Stretch (female),Intermediate,8


In [40]:
df_workout.loc[df_workout.title.isin(top_n_prediction)]

Unnamed: 0,title,type,body_part,gender,level
222,Warrior Pose I Virabhadrasana I,Stretching,Yoga,Female,Beginner
223,Wide legged Forward Bend Prasarita Padottanasana,Stretching,Yoga,Female,Beginner
404,Wrist Ulnar Deviator And Extensor Stretch,Stretching,Forearms,Female,Beginner
405,Wrist Radial Deviator And Extensor Stretch,Stretching,Forearms,Female,Beginner
406,Wrist Radial Deviator And Flexor Stretch,Stretching,Forearms,Female,Beginner
407,Wrist Ulnar Deviator And Flexor Stretch,Stretching,Forearms,Female,Beginner
430,Wrist Extensor Stretch,Stretching,Forearms,Female,Beginner
431,Wrist Flexor Stretch,Stretching,Forearms,Female,Beginner
1653,Warrior III Pose (female),Stretching,Stretching,Female,Beginner
1744,Warrior Pose II (female),Stretching,Stretching,Female,Beginner


In [44]:
pd.merge(df_workout.loc[df_workout.title.isin(user_hist.title)], user_hist[['title', 'rating']], on='title')

Unnamed: 0,title,type,body_part,gender,level,rating
0,Hip Crunch (knees bent),Strength,Waist,Female,Beginner,5
1,Jack knife Floor,Strength,Waist,Female,Beginner,6
2,Narrow Leg Bench Bridge,Strength,Waist,Female,Beginner,8
3,Squat,Strength,Thighs,Female,Beginner,2
4,Sumo Squat,Strength,Thighs,Female,Beginner,6
...,...,...,...,...,...,...
63,Dancer Pose Natarajasana (female),Stretching,Stretching,Female,Beginner,3
64,Balance Board (female) (VERSION 2),Stretching,Hips,Female,Beginner,4
65,Balance Board (female) (VERSION 2),Stretching,Hips,Female,Beginner,4
66,Forward Bend Back Stretch (female),Stretching,Stretching,Female,Beginner,4


# AutoEncoder

workout_dataset:

`workout_id	title	workout_type	body_part	gender	level`

user_dataset:

`user_id	name	gender	weight	height	age	level`

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def auto(workout_data, model_path, history_data=None, user_data=None):
    merged_data = pd.merge(history_data, workout_data, on='title')[features]
    X_train, X_valid = train_test_split(merged_data, test_size=0.2, random_state=42)

    input_dim = len(merged_data.columns)
    encoding_dim = 32

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    decoder = Dense(input_dim, activation="sigmoid")(encoder)

    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')

    autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_valid, X_valid))

    # Encoder for rec
    encoder_model = tf.keras.models.Model(inputs=input_layer, outputs=encoder)

    return encoder_model

In [None]:
encoder_model = auto(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

In [None]:
dummy_user = df_user.copy()[df_user.name == df_hist.name[0]]
dummy_gender_workout = df_workout_copy[
    (df_workout_copy.gender == df_hist_copy.gender[0])
]

for col in columns_to_encode:

    if col in dummy_user.columns:
        dummy_user[col] = le[col].transform(dummy_user[col])


dummy_user_merge = pd.merge(dummy_gender_workout, dummy_user, how='cross')
dummy_user_merge.head()

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_workouts(model, user_data, workout_data, k=5):
    workout_representations = model.predict(workout_data)
    user_representation = model.predict(user_data)

    print("user_representation shape:", user_representation.shape)
    print("workout_representations shape:", workout_representations.shape)

    # Calculate cosine similarity between the user and all workouts
    similarities = cosine_similarity(user_representation.reshape(1, -1), workout_representations)

    # Get top k similar workouts's idx
    similar_workout_indices = np.argsort(similarities[0])[::-1][:k]

    # Get workout data top k similar workouts
    similar_workouts = workout_data.iloc[similar_workout_indices]

    return similar_workouts

In [None]:
name = le['name'].inverse_transform([df_hist_copy.iloc[0][0]])[0]
test_user = df_user.copy()[df_user.name == name]
col_encode = list(test_user.select_dtypes(exclude=[np.number]))

for col in col_encode:
    test_user[col] = le[col].transform(test_user[col])

test_user

In [None]:
# Find similar workouts in latent space
similar_workouts = find_similar_workouts(encoder_model, test_user, dummy_user_merge[features], k=5)