<a href="https://colab.research.google.com/github/CH2-PS020-FitSync/CH2-PS020-ML/blob/main/model/Workout-Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    from google.colab import drive

    !pip install faker

    # gymvisual-cleaned-2.json
    !gdown "1iFDqbXWFs3CM5mgXpsN4f5NTbZdzdMiX"
    # dummy_user.json
    !gdown "1iN6QQD2QVTjRj_I_DKE0hRjcVMHgCqsb"
    # dummy_user_act.json
    !gdown "1Ob3I8LiJGb5TESmQ4UuBZLIBTf09A5DU"
    # work-hist.json (DUMMY, SHOULD RETRIEVE history & users' data FROM DATABASE)
    !gdown "1SlgWerOrAqgBdaE4Hhzb8XHVBhPKzKwj"

    workout_json = './gymvisual-cleaned-2.json'
    user_json = './dummy_user.json'
    user_act_json = './dummy_user_act.json'
    hist_json = './work-hist.json'
except ImportError:
    workout_json = '../data/gymvisual-cleaned-2.json'
    user_json = '../data/dummy_user.json'
    user_act_json = '../data/dummy_user_act.json'
    hist_json = '../data/work-hist.json'

In [2]:
import joblib
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import (Dense, Embedding, Flatten, Input, concatenate)
from tensorflow.keras.models import Model

In [3]:
with open(workout_json, 'r') as f:
    workout_f = json.load(f)

df_workout = pd.json_normalize(workout_f)
df_user = pd.read_json(user_json)
df_hist = pd.read_json(user_act_json)
# df_hist = pd.read_json(hist_json)

df_workout.drop(
    ['desc', 'jpg', 'gif', 'duration.desc', 'duration.min', 'duration.rep', 'duration.set', 'duration.sec'],
    axis=1, inplace=True
)

In [4]:
FEATURES = ['gender_x', 'level_x', 'title', 'type', 'body_part', 'gender_y', 'level_y']
LABEL_ENCODER = dict()

In [5]:
def get_col_to_encode(*dataframes, output_path=None):
    cols = set()

    for dataframe in dataframes:
        dataframe_cols = dataframe.select_dtypes(exclude=[np.number])
        cols.update(dataframe_cols)

        for col in dataframe_cols.columns:
            if col != 'name':
                LABEL_ENCODER[col] = LABEL_ENCODER.get(col, LabelEncoder().fit(dataframe[col]))

    if 'name' in cols:
        cols.remove('name')

    if output_path is not None:
        joblib.dump(LABEL_ENCODER, '../' + output_path + '.joblib')

    return cols

# EDA

In [6]:
df_workout.head()

Unnamed: 0,title,type,body_part,gender,level
0,3/4 Sit-up,Strength,Waist,Male,Beginner
1,Air bike,Strength,Waist,Male,Intermediate
2,Air Twisting Crunch,Strength,Waist,Female,Beginner
3,Alternate Heel Touchers,Strength,Waist,Male,Beginner
4,Alternate Lying Floor Leg Raise,Strength,Waist,Female,Beginner


In [7]:
df_workout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1866 entries, 0 to 1865
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      1866 non-null   object
 1   type       1865 non-null   object
 2   body_part  1856 non-null   object
 3   gender     1866 non-null   object
 4   level      1866 non-null   object
dtypes: object(5)
memory usage: 73.0+ KB


In [8]:
print(
    df_workout.type.value_counts(),
    df_workout.body_part.value_counts()[:10],
    df_workout.gender.value_counts(),
    df_workout.level.value_counts(),
    sep='\n\n'
)

type
Strength      969
Stretching    633
Aerobic       263
Name: count, dtype: int64

body_part
Waist          338
Hips           291
Back           181
Thighs         168
Plyometrics    150
Stretching     102
Cardio          88
Calves          88
Chest           84
Upper Arms      69
Name: count, dtype: int64

gender
Male      1200
Female     666
Name: count, dtype: int64

level
Beginner        1661
Intermediate     124
Expert            81
Name: count, dtype: int64


# Encoding

In [9]:
def encode_hist_work(df_workout, df_hist):
    encoded_df_workout = df_workout.copy()
    encoded_df_hist = df_hist.copy()

    columns_to_encode = get_col_to_encode(encoded_df_workout, encoded_df_hist, output_path='workout_hist_label') # Inplace encode

    for col in columns_to_encode:

        if col in encoded_df_workout.columns:
            encoded_df_workout[col] = LABEL_ENCODER[col].transform(encoded_df_workout[col])

        if col in encoded_df_hist.columns:
            encoded_df_hist[col] = LABEL_ENCODER[col].transform(encoded_df_hist[col])


    return encoded_df_workout, encoded_df_hist

df_workout_copy, df_hist_copy = encode_hist_work(df_workout, df_hist)

In [10]:
df_workout_copy, df_hist_copy

(      title  type  body_part  gender  level
 0         6     1         50       1      0
 1        36     1         50       1      2
 2        34     1         50       0      0
 3        40     1         50       1      0
 4        46     1         50       0      0
 ...     ...   ...        ...     ...    ...
 1861    966     2         45       1      0
 1862   1420     2         45       1      0
 1863    484     2         45       0      0
 1864    815     1         46       1      0
 1865    810     1         46       1      2
 
 [1866 rows x 5 columns],
                     name  gender  title  level  rating  diff
 0           Keith Dudley       1   1667      1       8     0
 1           Keith Dudley       1   1667      1       7     0
 2           Keith Dudley       1   1247      1       6     0
 3           Keith Dudley       1   1247      1       9     0
 4           Keith Dudley       1   1247      1       8     0
 ...                  ...     ...    ...    ...     ...   ..

In [11]:
for le in LABEL_ENCODER:
    print(repr(dict(zip(LABEL_ENCODER[le].classes_, LABEL_ENCODER[le].transform(LABEL_ENCODER[le].classes_))))[-200:])

Wrist Ulnar Deviator And Extensor Stretch': 1861, 'Wrist Ulnar Deviator And Flexor Stretch': 1862, 'X Drill (male)': 1863, 'Yoga Vajrasana Thunderbolt Diamond Pose (male)': 1864, 'ZigZag Hopes': 1865}
{'Aerobic': 0, 'Strength': 1, 'Stretching': 2, None: 3}
'Shoulders': 43, 'Shoulders, Thighs, Waist': 44, 'Stretching': 45, 'Thighs': 46, 'Thighs, Waist': 47, 'Upper Arms': 48, 'Upper Arms, Waist': 49, 'Waist': 50, 'Weightlifting': 51, 'Yoga': 52, None: 53}
{'Female': 0, 'Male': 1}
{'Beginner': 0, 'Expert': 1, 'Intermediate': 2}


# Building Model

In [12]:
def train(workout_data, model_path, train=True, history_data=None, user_data=None):
    if history_data is not None and len(history_data.title.unique()) >= 5:
        merged_data = pd.merge(history_data, workout_data, on='title').dropna()
        X_train, X_test, Y_train, Y_test = train_test_split(merged_data[FEATURES], merged_data['rating'], test_size=0.2)
        # merged_data = merged_data.drop_duplicates(subset=['title'], keep='last')

        model = tf.keras.Sequential([
            tf.keras.layers.Dense(30, activation='relu'),
            tf.keras.layers.Dense(10, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1, activation='linear'),
        ])

        model.compile(
            loss=tf.keras.losses.MeanSquaredError(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['mse', 'mae']
        )

        history = model.fit(
            X_train, Y_train,
            epochs=500,
            validation_data=(X_test, Y_test),
            verbose=0
        )

        loss = model.evaluate(X_test, Y_test)
        print(f"Test loss: {loss}")

        model.save(model_path)

        return model
    elif user_data is not None:
        pass

model = train(df_workout_copy, './saved_model/dummy_workout_recommend.h5', history_data=df_hist_copy)

Test loss: [4.565776824951172, 4.565776824951172, 1.7372322082519531]


  saving_api.save_model(


In [13]:
df_workout_copy

Unnamed: 0,title,type,body_part,gender,level
0,6,1,50,1,0
1,36,1,50,1,2
2,34,1,50,0,0
3,40,1,50,1,0
4,46,1,50,0,0
...,...,...,...,...,...
1861,966,2,45,1,0
1862,1420,2,45,1,0
1863,484,2,45,0,0
1864,815,1,46,1,0


In [14]:
def predict_n(n, name, gender_workout, df_hist, df_user):
    user = df_user.copy()[df_user.name == name]
    history = df_hist.copy()[df_hist.name == name]

    gender_workout = gender_workout.copy()

    columns_to_encode = get_col_to_encode(user, gender_workout)
    print(columns_to_encode)

    for col in columns_to_encode:

        if col in user.columns:
            print(col)
            user[col] = LABEL_ENCODER[col].transform(user[col])

        if col in gender_workout.columns:
            gender_workout[col] = LABEL_ENCODER[col].transform(gender_workout[col])

    user_merge = pd.merge(gender_workout, user, how='cross')

    result = model.predict(user_merge[FEATURES])

    top_n_index = np.argpartition(-result[:, 0], n)[:n] # Top n max values index
    sorted_top_n_index = top_n_index[np.argsort(-result[top_n_index][:, 0])] # Sorted from max to min

    top_n_recommended = gender_workout.iloc[sorted_top_n_index]
    top_n_recommended_workout = LABEL_ENCODER['title'].inverse_transform(top_n_recommended.title)

    return top_n_recommended_workout

In [15]:
# user = df_user.loc[df_user.level == 'Intermediate'].sample(1)
user = pd.DataFrame({
    'name': {1: 'Williamd Krause'},
    'gender': {1: 'Female'},
    'weight': {1: 57.1},
    'height': {1: 163},
    'age': {1: 21},
    'level': {1: 'Intermediate'}
})

name = user.name.values[0]
gender_work = df_workout[
    (df_workout.gender == user.gender.values[0]) & (~df_workout.title.isin(df_hist[df_hist.name == name].title))
]
n = 10

print(user)

top_n_prediction = predict_n(n, name, gender_work, df_hist, user) # For now use `user` as dummy new user as the database is not updated in realtime

top_n_prediction

              name  gender  weight  height  age         level
1  Williamd Krause  Female    57.1     163   21  Intermediate
{'body_part', 'level', 'gender', 'type', 'title'}
level
gender


array(['Air Bike (VERSION 2) (female)', '45 degrees Side Bend (female)',
       '45 degree Bycicle Twisting Crunch',
       '45 degree one leg hyperextension (arms in front of chest)',
       '45 degree twisting hyperextension', 'Ab Mat Sit-up (female)',
       'Ab Roller Crunch', '3 Leg Dog Pose (female)',
       '3 4 Sit up (female)', '3 Leg Chatarunga Pose (female)'],
      dtype=object)

In [16]:
user_hist = df_hist[df_hist.name == name]
user_hist

Unnamed: 0,name,gender,title,level,rating,diff


In [17]:
pd.merge(df_workout.loc[df_workout.title.isin(user_hist.title)], user_hist, on='title').sort_values('rating')

Unnamed: 0,type,body_part,gender_x,level_x,name,gender_y,title,level_y,rating,diff


In [18]:
df_prediction = gender_work.set_index('title').loc[top_n_prediction].reset_index()
df_prediction

Unnamed: 0,title,type,body_part,gender,level
0,Air Bike (VERSION 2) (female),Strength,Waist,Female,Expert
1,45 degrees Side Bend (female),Strength,Waist,Female,Beginner
2,45 degree Bycicle Twisting Crunch,Strength,Waist,Female,Beginner
3,45 degree one leg hyperextension (arms in fron...,Strength,Hips,Female,Beginner
4,45 degree twisting hyperextension,Strength,Hips,Female,Beginner
5,Ab Mat Sit-up (female),Strength,Waist,Female,Beginner
6,Ab Roller Crunch,Strength,Waist,Female,Beginner
7,3 Leg Dog Pose (female),Stretching,Stretching,Female,Beginner
8,3 4 Sit up (female),Strength,Waist,Female,Beginner
9,3 Leg Chatarunga Pose (female),Stretching,Stretching,Female,Beginner


# AutoEncoder

workout_dataset:

`workout_id	title	workout_type	body_part	gender	level`

user_dataset:

`user_id	name	gender	weight	height	age	level`

In [19]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def auto(workout_data, model_path, history_data=None, user_data=None):
    merged_data = pd.merge(history_data, workout_data, on='title')[FEATURES]
    X_train, X_valid = train_test_split(merged_data, test_size=0.2, random_state=42)

    input_dim = len(merged_data.columns)
    encoding_dim = 32

    input_layer = Input(shape=(input_dim,))
    encoder = Dense(encoding_dim, activation="relu")(input_layer)
    decoder = Dense(input_dim, activation="sigmoid")(encoder)

    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='mse')

    autoencoder.fit(X_train, X_train, epochs=10, batch_size=32, shuffle=True, validation_data=(X_valid, X_valid))

    # Encoder for rec
    encoder_model = tf.keras.models.Model(inputs=input_layer, outputs=encoder)

    return encoder_model

In [20]:
encoder_model = auto(df_workout_copy, './saved_model/workout.h5', df_hist_copy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
dummy_user = df_user.copy()[df_user.name == df_hist.name[0]]
dummy_gender_workout = df_workout_copy[
    (df_workout_copy.gender == df_hist_copy.gender[0])
]

for col in columns_to_encode:

    if col in dummy_user.columns:
        dummy_user[col] = le[col].transform(dummy_user[col])


dummy_user_merge = pd.merge(dummy_gender_workout, dummy_user, how='cross')
dummy_user_merge.head()

NameError: name 'columns_to_encode' is not defined

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_workouts(model, user_data, workout_data, k=5):
    workout_representations = model.predict(workout_data)
    user_representation = model.predict(user_data)

    print("user_representation shape:", user_representation.shape)
    print("workout_representations shape:", workout_representations.shape)

    # Calculate cosine similarity between the user and all workouts
    similarities = cosine_similarity(user_representation.reshape(1, -1), workout_representations)

    # Get top k similar workouts's idx
    similar_workout_indices = np.argsort(similarities[0])[::-1][:k]

    # Get workout data top k similar workouts
    similar_workouts = workout_data.iloc[similar_workout_indices]

    return similar_workouts

In [None]:
name = le['name'].inverse_transform([df_hist_copy.iloc[0][0]])[0]
test_user = df_user.copy()[df_user.name == name]
col_encode = list(test_user.select_dtypes(exclude=[np.number]))

for col in col_encode:
    test_user[col] = le[col].transform(test_user[col])

test_user

In [None]:
# Find similar workouts in latent space
similar_workouts = find_similar_workouts(encoder_model, test_user, dummy_user_merge[features], k=5)