<a href="https://colab.research.google.com/github/CH2-PS020-FitSync/CH2-PS020-ML/blob/main/model/Workout-Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    from google.colab import drive

    !pip install faker

    # gymvisual-cleaned-2.json
    !gdown "1iFDqbXWFs3CM5mgXpsN4f5NTbZdzdMiX"
    # work-hist.json (DUMMY, SHOULD RETRIEVE history & users' data FROM DATABASE)
    !gdown "1SlgWerOrAqgBdaE4Hhzb8XHVBhPKzKwj"

    workout_json = './gymvisual-cleaned-2.json'
    hist_json = './work-hist.json'
except ImportError:
    workout_json = '../data/gymvisual-cleaned-2.json'
    hist_json = '../data/work-hist.json'

In [2]:
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (Dense, Embedding, Flatten, Input, concatenate)
from tensorflow.keras.models import Model

In [3]:
with open(workout_json, 'r') as f:
    workout_f = json.load(f)

df_workout = pd.json_normalize(workout_f)
# df_hist = pd.read_json(hist_json)

df_workout.drop(
    ['desc', 'jpg', 'gif', 'duration.desc', 'duration.min', 'duration.rep', 'duration.set', 'duration.sec'],
    axis=1, inplace=True
)

df_workout.drop(
    df_workout[df_workout.level == 'Beginner'].sample(frac=.8).index,
    inplace=True
)


In [4]:
# Dummy User

from faker import Faker
from random import choice, random, randint

gender = ['Male', 'Female']
level = ['Beginner', 'Intermediate', 'Expert']

faker = Faker()
df_user = pd.DataFrame([
    {
        'name': faker.name(),
        'gender': choice(gender),
        'weight': round(random(), 1) + randint(40, 70),
        'height': randint(150, 180),
        'age': randint(15, 30),
        'level': choice(level)
    } for _ in range(100)
])

df_hist = []

for name in df_user.name:
    user = df_user[df_user.name == name]
    u_level = level.index(user.level.values[0])
    u_gender = user.gender.values[0]

    for _ in range(randint(20, 100)):
        workout_det_level = df_workout[(df_workout.gender == 'Female') & (random() < 0.4 or df_workout.level == 'Expert')]
        workout = workout_det_level.sample(1)
        w_level = level.index(workout.level.values[0])
        diff = abs(u_level - w_level)
        rating = max(0, randint(5, 10) - (randint(3, 6) if diff > 1 else randint(2, 4) if diff else randint(0, 1)))

        df_hist.append(
            {
                'name': user.name.values[0],
                'gender': user.gender.values[0],
                'title': workout.title.values[0],
                'level': user.level.values[0],
                'rating': rating,
                'diff': diff
            }
        )

df_hist = pd.DataFrame(df_hist)
df_hist.head()

Unnamed: 0,name,gender,title,level,rating,diff
0,John Ramos,Female,Decline Push-up (Kneeling) (female),Expert,2,2
1,John Ramos,Female,Two Legged Inverted Staff Pose Dwi Pada Vipari...,Expert,5,0
2,John Ramos,Female,Two Legged Inverted Staff Pose Dwi Pada Vipari...,Expert,6,0
3,John Ramos,Female,Scissors (advanced) (female),Expert,5,0
4,John Ramos,Female,Air Bike (VERSION 2) (female),Expert,8,0


In [5]:
FEATURES = ['gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']
LABEL_ENCODER = dict()

In [6]:
def get_col_to_encode(*dataframes):
    cols = set()

    for dataframe in dataframes:
        dataframe_cols = dataframe.select_dtypes(exclude=[np.number])
        cols.update(dataframe_cols)

        for col in dataframe_cols.columns:
            if col != 'name':
                LABEL_ENCODER[col] = LABEL_ENCODER.get(col, LabelEncoder().fit(dataframe[col]))

    if 'name' in cols:
        cols.remove('name')

    return cols

# EDA

In [7]:
df_workout.head()

Unnamed: 0,title,type,body_part,gender,level
1,Air bike,Strength,Waist,Male,Intermediate
5,Bench Dip (knees bent),Strength,Upper Arms,Male,Beginner
17,Chest Dip,Strength,Chest,Male,Intermediate
23,Cross Body Twisting Crunch,Strength,Waist,Female,Beginner
32,Donkey Calf Raise,Strength,Calves,Male,Beginner


In [8]:
df_workout.info()

<class 'pandas.core.frame.DataFrame'>
Index: 537 entries, 1 to 1865
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      537 non-null    object
 1   type       537 non-null    object
 2   body_part  532 non-null    object
 3   gender     537 non-null    object
 4   level      537 non-null    object
dtypes: object(5)
memory usage: 25.2+ KB


In [9]:
print(
    df_workout.type.value_counts(),
    df_workout.body_part.value_counts()[:10],
    df_workout.gender.value_counts(),
    df_workout.level.value_counts(),
    sep='\n\n'
)

type
Strength      289
Stretching    149
Aerobic        99
Name: count, dtype: int64

body_part
Hips           81
Waist          80
Plyometrics    69
Back           55
Thighs         44
Chest          27
Cardio         26
Calves         22
Shoulders      20
Stretching     20
Name: count, dtype: int64

gender
Male      399
Female    138
Name: count, dtype: int64

level
Beginner        332
Intermediate    124
Expert           81
Name: count, dtype: int64


# Encoding

In [10]:
def encode_hist_work(df_workout, df_hist):
    encoded_df_workout = df_workout.copy()
    encoded_df_hist = df_hist.copy()

    columns_to_encode = get_col_to_encode(encoded_df_workout, encoded_df_hist) # Inplace encode

    for col in columns_to_encode:

        if col in encoded_df_workout.columns:
            encoded_df_workout[col] = LABEL_ENCODER[col].transform(encoded_df_workout[col])

        if col in encoded_df_hist.columns:
            encoded_df_hist[col] = LABEL_ENCODER[col].transform(encoded_df_hist[col])


    return encoded_df_workout, encoded_df_hist

df_workout_copy, df_hist_copy = encode_hist_work(df_workout, df_hist)

In [11]:
df_workout_copy, df_hist_copy

(      title  type  body_part  gender  level
 1         7     1         26       1      2
 5        24     1         24       1      0
 17       71     1          9       1      2
 23       83     1         26       0      0
 32      100     1          7       1      0
 ...     ...   ...        ...     ...    ...
 1850    314     1          0       1      2
 1852    459     0         20       1      0
 1860    212     0          8       1      1
 1864    252     1         23       1      0
 1865    248     1         23       1      2
 
 [537 rows x 5 columns],
                 name  gender  title  level  rating  diff
 0         John Ramos       0     92      1       2     2
 1         John Ramos       0    518      1       5     0
 2         John Ramos       0    518      1       6     0
 3         John Ramos       0    372      1       5     0
 4         John Ramos       0      4      1       8     0
 ...              ...     ...    ...    ...     ...   ...
 6199  Lucas Cummings      

In [12]:
for le in LABEL_ENCODER:
    print(repr(dict(zip(LABEL_ENCODER[le].classes_, LABEL_ENCODER[le].transform(LABEL_ENCODER[le].classes_))))[-200:])

rints': 531, 'World Greatest Stretch': 532, 'Wrist Circles (female)': 533, 'Wrist Ulnar Deviator And Extensor Stretch': 534, 'Yoga Vajrasana Thunderbolt Diamond Pose (male)': 535, 'ZigZag Hopes': 536}
{'Aerobic': 0, 'Strength': 1, 'Stretching': 2}
7, 'Hips, Waist': 18, 'Neck': 19, 'Plyometrics': 20, 'Shoulders': 21, 'Stretching': 22, 'Thighs': 23, 'Upper Arms': 24, 'Upper Arms, Waist': 25, 'Waist': 26, 'Weightlifting': 27, 'Yoga': 28, None: 29}
{'Female': 0, 'Male': 1}
{'Beginner': 0, 'Expert': 1, 'Intermediate': 2}


# Building Model

In [13]:
def train(workout_data, model_path, train=True, history_data=None, user_data=None):
    if history_data is not None and len(history_data.title.unique()) >= 5:
        merged_data = pd.merge(history_data, workout_data, on='title').dropna()
        X_train, X_test, Y_train, Y_test = train_test_split(merged_data[FEATURES], merged_data['rating'], test_size=0.2)
        # merged_data = merged_data.drop_duplicates(subset=['title'], keep='last')

        model = tf.keras.Sequential([
            tf.keras.layers.Dense(30, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1, activation='linear'),
        ])

        model.compile(
            loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['mse', 'mae']
        )

        history = model.fit(
            X_train, Y_train,
            epochs=500,
            validation_data=(X_test, Y_test),
            verbose=0
        )

        loss = model.evaluate(X_test, Y_test)
        print(f"Test loss: {loss}")

        model.save(model_path)

        return model
    elif user_data is not None:
        pass

model = train(df_workout_copy, './saved_model/dummy_workout_recommend.h5', history_data=df_hist_copy)

Test loss: [4.703448295593262, 4.703448295593262, 1.7811813354492188]


  saving_api.save_model(


In [14]:
df_workout_copy

Unnamed: 0,title,type,body_part,gender,level
1,7,1,26,1,2
5,24,1,24,1,0
17,71,1,9,1,2
23,83,1,26,0,0
32,100,1,7,1,0
...,...,...,...,...,...
1850,314,1,0,1,2
1852,459,0,20,1,0
1860,212,0,8,1,1
1864,252,1,23,1,0


In [15]:
def predict_n(n, name, gender_workout, df_hist, df_user):
    user = df_user.copy()[df_user.name == name]
    history = df_hist.copy()[df_hist.name == name]

    gender_workout = gender_workout.copy()

    columns_to_encode = get_col_to_encode(user, gender_workout)
    print(columns_to_encode)

    for col in columns_to_encode:

        if col in user.columns:
            print(col)
            user[col] = LABEL_ENCODER[col].transform(user[col])

        if col in gender_workout.columns:
            gender_workout[col] = LABEL_ENCODER[col].transform(gender_workout[col])

    user_merge = pd.merge(gender_workout, user, how='cross')

    result = model.predict(user_merge[FEATURES])

    top_n_index = np.argpartition(-result[:, 0], n)[:n] # Top n max values index
    sorted_top_n_index = top_n_index[np.argsort(-result[top_n_index][:, 0])] # Sorted from max to min

    top_n_recommended = gender_workout.iloc[sorted_top_n_index]
    top_n_recommended_workout = LABEL_ENCODER['title'].inverse_transform(top_n_recommended.title)

    return top_n_recommended_workout

In [17]:
# user = df_user.loc[df_user.level == 'Intermediate'].sample(1)
user = pd.DataFrame({
    'name': {1000000: 'Williamd Krause'},
    'gender': {1000000: 'Female'},
    'weight': {1000000: 57.1},
    'height': {1000000: 163},
    'age': {1000000: 21},
    'level': {1000000: 'Intermediate'}
})

name = user.name.values[0]
gender_work = df_workout[
    (df_workout.gender == user.gender.values[0]) & (~df_workout.title.isin(df_hist[df_hist.name == name].title))
]
n = 10

print(user)

top_n_prediction = predict_n(n, name, gender_work, df_hist, df_user)

top_n_prediction

                    name  gender  weight  height  age         level
1000000  Williamd Krause  Female    57.1     163   21  Intermediate
{'body_part', 'level', 'title', 'type', 'gender'}
level
gender


array(['Air Bike (VERSION 2) (female)', 'Alternate Leg Raise',
       'Standing Gastrocnemius Calf Stretch (female)',
       'Side Bridge with Bent Leg (female)',
       'Intermediate Hip Flexor and Quad Stretch (female)',
       'Neck Side Stretch (female)', 'Seated Calf Stretch (female)',
       'Sitting Bent Over Back Stretch (female)',
       'Forward to Side to Rear Lunge (female)',
       'Lying Leg Raise (female)'], dtype=object)

In [18]:
user_hist = df_hist[df_hist.name == name]
user_hist

Unnamed: 0,name,gender,title,level,rating,diff


In [19]:
pd.merge(df_workout.loc[df_workout.title.isin(user_hist.title)], user_hist, on='title').sort_values('rating')

Unnamed: 0,type,body_part,gender_x,level_x,name,gender_y,title,level_y,rating,diff


In [20]:
df_prediction = gender_work.set_index('title').loc[top_n_prediction].reset_index()
df_prediction

Unnamed: 0,title,type,body_part,gender,level
0,Air Bike (VERSION 2) (female),Strength,Waist,Female,Expert
1,Alternate Leg Raise,Strength,Hips,Female,Beginner
2,Standing Gastrocnemius Calf Stretch (female),Stretching,Calves,Female,Beginner
3,Side Bridge with Bent Leg (female),Strength,Waist,Female,Beginner
4,Intermediate Hip Flexor and Quad Stretch (female),Stretching,Thighs,Female,Intermediate
5,Neck Side Stretch (female),Stretching,Neck,Female,Beginner
6,Seated Calf Stretch (female),Stretching,Calves,Female,Beginner
7,Sitting Bent Over Back Stretch (female),Stretching,Back,Female,Beginner
8,Forward to Side to Rear Lunge (female),Strength,"Hips, Thighs",Female,Intermediate
9,Lying Leg Raise (female),Strength,Waist,Female,Beginner


# AutoEncoder

workout_dataset:

`workout_id	title	workout_type	body_part	gender	level`

user_dataset:

`user_id	name	gender	weight	height	age	level`

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def auto(workout_data, model_path, history_data=None, user_data=None):
    merged_data = pd.merge(history_data, workout_data, on='title')[FEATURES]
    X_train, X_valid = train_test_split(merged_data, test_size=0.2, random_state=42)

    input_dim = len(merged_data.columns)
    encoding_dim = 32

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    decoder = Dense(input_dim, activation="sigmoid")(encoder)

    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')

    autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_valid, X_valid))

    # Encoder for rec
    encoder_model = tf.keras.models.Model(inputs=input_layer, outputs=encoder)

    return encoder_model

In [None]:
encoder_model = auto(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
dummy_user = df_user.copy()[df_user.name == df_hist.name[0]]
dummy_gender_workout = df_workout_copy[
    (df_workout_copy.gender == df_hist_copy.gender[0])
]

for col in columns_to_encode:

    if col in dummy_user.columns:
        dummy_user[col] = le[col].transform(dummy_user[col])


dummy_user_merge = pd.merge(dummy_gender_workout, dummy_user, how='cross')
dummy_user_merge.head()

NameError: ignored

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_workouts(model, user_data, workout_data, k=5):
    workout_representations = model.predict(workout_data)
    user_representation = model.predict(user_data)

    print("user_representation shape:", user_representation.shape)
    print("workout_representations shape:", workout_representations.shape)

    # Calculate cosine similarity between the user and all workouts
    similarities = cosine_similarity(user_representation.reshape(1, -1), workout_representations)

    # Get top k similar workouts's idx
    similar_workout_indices = np.argsort(similarities[0])[::-1][:k]

    # Get workout data top k similar workouts
    similar_workouts = workout_data.iloc[similar_workout_indices]

    return similar_workouts

In [None]:
name = le['name'].inverse_transform([df_hist_copy.iloc[0][0]])[0]
test_user = df_user.copy()[df_user.name == name]
col_encode = list(test_user.select_dtypes(exclude=[np.number]))

for col in col_encode:
    test_user[col] = le[col].transform(test_user[col])

test_user

In [None]:
# Find similar workouts in latent space
similar_workouts = find_similar_workouts(encoder_model, test_user, dummy_user_merge[features], k=5)