# Dependencies

In [None]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import re
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn import metrics

# Data Preparation

In [None]:
# Mount Google Drive under /content/gdrive so the dataset CSV stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Using dataset: [Gym Exercise Dataset | Kaggle](https://www.kaggle.com/datasets/niharika41298/gym-exercise-data)

In [None]:
# Location of the gym exercise CSV inside the mounted Drive folder.
file_path = '/content/gdrive/MyDrive/FAIZ/Dataset/Workout recom/megaGymDataset.csv'

In [None]:
# Load the exercise table and preview its first rows.
data = pd.read_csv(file_path)
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,Rating,RatingDesc
0,0,Partner plank band row,The partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate,0.0,
1,1,Banded crunch isometric hold,The banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate,,
2,2,FYR Banded Plank Jack,The banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate,,
3,3,Banded crunch,The banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate,,
4,4,Crunch,The crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate,,


In [None]:
# Column dtypes and non-null counts of the raw table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2918 entries, 0 to 2917
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2918 non-null   int64  
 1   Title       2918 non-null   object 
 2   Desc        1368 non-null   object 
 3   Type        2918 non-null   object 
 4   BodyPart    2918 non-null   object 
 5   Equipment   2918 non-null   object 
 6   Level       2918 non-null   object 
 7   Rating      1031 non-null   float64
 8   RatingDesc  862 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 205.3+ KB


In [None]:
# Remove the columns that play no part in the recommendation
data = data.drop(columns=['Unnamed: 0', 'Rating', 'RatingDesc'])

In [None]:
# Keep only exercises that have a description: similarity is computed from that text
data = data[data['Desc'].notna()]

In [None]:
# Re-check dtypes and non-null counts after dropping columns and rows without a description.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1368 entries, 0 to 2916
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      1368 non-null   object
 1   Desc       1368 non-null   object
 2   Type       1368 non-null   object
 3   BodyPart   1368 non-null   object
 4   Equipment  1368 non-null   object
 5   Level      1368 non-null   object
dtypes: object(6)
memory usage: 74.8+ KB


In [None]:
# Number of exercises per Type.
data.value_counts('Type')

Type
Strength                 1234
Plyometrics                53
Stretching                 44
Cardio                     17
Powerlifting               11
Olympic Weightlifting       5
Strongman                   4
dtype: int64

In [None]:
# Number of exercises per BodyPart.
data.value_counts('BodyPart')

BodyPart
Abdominals     298
Quadriceps     245
Shoulders      174
Chest          149
Biceps         101
Triceps         88
Lats            69
Hamstrings      50
Middle Back     49
Lower Back      42
Glutes          29
Calves          26
Traps           17
Forearms        16
Abductors        8
Adductors        7
dtype: int64

In [None]:
# Number of exercises per Equipment.
data.value_counts('Equipment')

Equipment
Body Only        404
Dumbbell         246
Barbell          161
Cable            149
Machine          121
Other            101
Kettlebells       53
Bands             49
Exercise Ball     28
Medicine Ball     25
E-Z Curl Bar      14
None               9
Foam Roll          8
dtype: int64

In [None]:
# Number of exercises per Level.
data.value_counts('Level')

Level
Intermediate    1250
Beginner         108
Expert            10
dtype: int64

Based on this data, the system will recommend the top 3 alternatives to a given workout.

In [None]:
# Inspect the rows whose Equipment value is the string 'None'.
data.loc[data['Equipment'] == 'None']

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level
637,Decline oblique crunch,The decline oblique crunch is a popular bodywe...,Strength,Abdominals,,Intermediate
638,Decline sit-up,The decline sit-up is a bodyweight core exerci...,Strength,Abdominals,,Intermediate
639,Hanging Windshield Wiper,The hanging windshield wiper is an advanced ab...,Strength,Abdominals,,Intermediate
1402,Glute ham raise-,The glute ham raise is an exercise targeting t...,Strength,Hamstrings,,Beginner
1403,Lying hamstring stretch with band,The lying hamstring stretch with band is a sim...,Stretching,Hamstrings,,Beginner
1406,Alternating lunge jump,The alternating lunge jump is an explosive bod...,Stretching,Hamstrings,,Beginner
2421,Dumbbell lateral hop to sprint,The dumbbell lateral hop to sprint is a multi-...,Plyometrics,Quadriceps,,Intermediate
2422,Smith machine lunge sprint,The Smith machine lunge sprint is a lower-body...,Strength,Quadriceps,,Intermediate
2423,Sissy squat,The sissy squat is a bodyweight squat variatio...,Strength,Quadriceps,,Intermediate


In [None]:
# Relabel 'None' equipment as 'Body Only'. The replacement is limited to the
# Equipment column so that a matching value in any other column is left untouched.
data = data.assign(Equipment=data['Equipment'].replace('None', 'Body Only'))

In [None]:
# Equipment counts after relabelling 'None' as 'Body Only'.
data.value_counts('Equipment')

Equipment
Body Only        413
Dumbbell         246
Barbell          161
Cable            149
Machine          121
Other            101
Kettlebells       53
Bands             49
Exercise Ball     28
Medicine Ball     25
E-Z Curl Bar      14
Foam Roll          8
dtype: int64

In [None]:
# data.to_csv(r'/content/data.csv', index=False, header=True)

In [None]:
# Missing values remaining in each column.
data.isna().sum()

Title        0
Desc         0
Type         0
BodyPart     0
Equipment    0
Level        0
dtype: int64

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

9

In [None]:
# Rows whose Title already appeared in an earlier row.
data.loc[data.duplicated(subset=['Title'])]

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level
97,Decline bar press sit-up,The decline bar press sit-up is a weighted cor...,Strength,Abdominals,Barbell,Intermediate
645,Exercise Ball Cable Crunch - Gethin Variation,The exercise ball crunch is a popular gym exer...,Strength,Abdominals,Cable,Intermediate
939,Band-suspended kettlebell bench press,The band-suspended kettlebell bench press is a...,Strength,Chest,Bands,Intermediate
958,Band-suspended kettlebell bench press,The band-suspended kettlebell bench press is a...,Strength,Chest,Bands,Intermediate
1709,Seated Cable Rows,The cable seated row is a popular exercise to ...,Strength,Middle Back,Cable,Intermediate
1730,Seated Cable Rows,The cable seated row is a popular exercise to ...,Strength,Middle Back,Cable,Intermediate
2004,Dumbbell step-up,The dumbbell step-up is a great exercise for b...,Strength,Quadriceps,Dumbbell,Intermediate
2655,Arnold press,Named after the iconic bodybuilder and movie s...,Strength,Shoulders,Dumbbell,Intermediate
2658,Seated rear delt fly,The seated rear delt fly is an upper-body exer...,Strength,Shoulders,Dumbbell,Intermediate


In [None]:
# Dropping duplicate workout
# Remove every exact duplicate row (the first occurrence is kept) instead of
# dropping two hand-picked index labels, which left the remaining duplicates in
# the table and therefore among the recommendation candidates.
data = data.drop_duplicates()

# Modeling

In [None]:
# Work on an independent copy so the text normalisation that follows does not
# overwrite the cleaned `data` frame (plain assignment would only alias it).
new_data = data.copy()

In [None]:
# Normalizing description feature: lowercase the text, then turn every run of
# non-alphanumeric characters into a single space
new_data['Desc'] = data['Desc'].map(lambda desc: re.sub('[^A-Za-z0-9]+', ' ', desc.lower()))

In [None]:
# Preview the normalised descriptions.
new_data.head()

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level
0,Partner plank band row,the partner plank band row is an abdominal exe...,Strength,Abdominals,Bands,Intermediate
1,Banded crunch isometric hold,the banded crunch isometric hold is an exercis...,Strength,Abdominals,Bands,Intermediate
2,FYR Banded Plank Jack,the banded plank jack is a variation on the pl...,Strength,Abdominals,Bands,Intermediate
3,Banded crunch,the banded crunch is an exercise targeting the...,Strength,Abdominals,Bands,Intermediate
4,Crunch,the crunch is a popular core exercise targetin...,Strength,Abdominals,Bands,Intermediate


In [None]:
# new_data.to_csv(r'/content/new_data.csv', index=False, header=True)

In [None]:
# NOTE: plain assignment, so encoded_data and new_data are the same DataFrame object.
# Columns added to encoded_data later are therefore also visible through new_data,
# and the sampling / evaluation cells rely on that (they read 'encodings' via new_data).
encoded_data = new_data

In [None]:
# Using BERT to encode description text
# Preprocessing layer: turns raw strings into the inputs the encoder consumes.
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# The encoder is only used to produce embeddings; no cell in this notebook
# fine-tunes it, so its weights are loaded frozen.
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=False)

# Embedding models that have already been built, keyed by the identity of the
# (preprocessor, encoder) pair. The layers are stored next to the model so the
# ids used as keys stay valid for as long as the entry exists.
_EMBEDDING_MODELS = {}

def _get_embedding_model(preprocessor, encoder):
  """Return the string -> pooled-output Keras model for this layer pair, building it only once."""
  key = (id(preprocessor), id(encoder))
  if key not in _EMBEDDING_MODELS:
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    encoder_inputs = preprocessor(text_input)
    outputs = encoder(encoder_inputs)
    model = tf.keras.Model(text_input, outputs['pooled_output'])
    _EMBEDDING_MODELS[key] = (preprocessor, encoder, model)
  return _EMBEDDING_MODELS[key][2]

def get_bert_embeddings(text, preprocessor, encoder):
  """Embed a single string with the given preprocessing and encoder layers.

  Args:
    text: the string to embed.
    preprocessor: callable layer applied to the string input before the encoder.
    encoder: callable layer whose output mapping contains 'pooled_output'.

  Returns:
    The model output for a batch holding only `text` (the encoder's
    'pooled_output'; presumably shape (1, 768) for the H-768 encoder - TODO confirm).
  """
  # Building a Keras model is costly and this function is called once per row,
  # so the model is built on first use and reused afterwards.
  embedding_model = _get_embedding_model(preprocessor, encoder)
  sentences = tf.constant([text])
  return embedding_model(sentences)

# Embed every normalised description and store the result alongside its row
encoded_data['encodings'] = encoded_data['Desc'].map(lambda desc: get_bert_embeddings(desc, preprocessor, encoder))

# Retrieving recommendations

In [None]:
# Take one workout sample
# NOTE: sample() is called without random_state, so a different row is drawn on every run.
sample_input = new_data.sample()
sample_input

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,encodings
2811,Incline dumbbell overhead triceps extension,the incline dumbbell overhead triceps extensio...,Strength,Triceps,Dumbbell,Intermediate,"((tf.Tensor(-0.5833119, shape=(), dtype=float3..."


In [None]:
# Description text of the sampled workout, selected by column name
query_text = sample_input['Desc'].iloc[0]
print(query_text)

the incline dumbbell overhead triceps extension is a popular exercise targeting the triceps muscles the incline angle helps to target the long head of the triceps in particular this exercise is usually performed for moderate to high reps as part of an upper body or arm focused workout 


In [None]:
query_encoding = get_bert_embeddings(query_text, preprocessor, encoder)

# Calculating cosine similarity
# Stack the per-row encodings into one matrix so scikit-learn is called once
# for the whole table instead of once per exercise.
encoding_matrix = np.vstack([np.asarray(encoding) for encoding in encoded_data['encodings']])
encoded_data['similarity_score'] = metrics.pairwise.cosine_similarity(encoding_matrix, np.asarray(query_encoding))[:, 0]
results = encoded_data.sort_values(by=['similarity_score'], ascending=False)

In [None]:
# Returning top 3 recommendation
# Exclude the query workout by its index label rather than by assuming it is
# ranked first: when another row ties with it on similarity, the query is not
# guaranteed to sit at position 0 and could be recommended to itself.
top3 = results.drop(index=sample_input.index).head(3)
top3 = top3.drop('encodings', axis=1)
top3

Unnamed: 0,Title,Desc,Type,BodyPart,Equipment,Level,similarity_score
2719,Smith machine behind-the-neck press,the smith machine behind the neck press is a m...,Strength,Shoulders,Machine,Intermediate,0.990528
2720,Smith Machine Behind-The-Neck Press - Gethin V...,the smith machine behind the neck press is a m...,Strength,Shoulders,Machine,Intermediate,0.990528
1222,Single-arm triceps kick-back,the single arm triceps kick back is a popular ...,Strength,Glutes,Dumbbell,Intermediate,0.988209


# Evaluation

In [None]:
def eval(index):
  """Return the attribute-level precision of the top-3 recommendations for one workout.

  NOTE: the name shadows Python's built-in ``eval``; it is kept because the
  evaluation loop calls it under this name.

  Args:
    index: positional row number (as used by ``.iloc``) of the query workout
      in ``new_data``.

  Returns:
    The fraction of the 12 checks (3 recommendations x Type, BodyPart,
    Equipment, Level) in which a recommendation has the same value as the
    query workout.

  Side effects:
    Overwrites ``encoded_data['similarity_score']`` with the scores for this query.
  """
  attributes = ['Type', 'BodyPart', 'Equipment', 'Level']
  top_k = 3

  # Taking a row from workout data (fields are selected by name, not by position)
  query_label = new_data.index[index]
  query_attributes = new_data[attributes].iloc[index]
  # The description of this row was already embedded when the 'encodings'
  # column was filled, so reuse that embedding instead of running BERT again.
  query_encoding = np.asarray(new_data['encodings'].iloc[index])

  # Calculating similarity with a single batched call
  encoding_matrix = np.vstack([np.asarray(encoding) for encoding in encoded_data['encodings']])
  encoded_data['similarity_score'] = metrics.pairwise.cosine_similarity(encoding_matrix, query_encoding)[:, 0]
  results = encoded_data.sort_values(by=['similarity_score'], ascending=False)

  # Returning top 3 recommendation. The query is removed by label: with tied
  # scores it is not guaranteed to be ranked first, so slicing off position 0
  # could drop a real candidate and keep the query itself.
  top3 = results.drop(index=query_label).head(top_k)

  # Calculating precision: share of (recommendation, attribute) pairs equal to the query
  matches = top3[attributes].eq(query_attributes, axis='columns')
  precision = matches.to_numpy().sum() / (top_k * len(attributes))
  return precision

In [None]:
# Score every workout in turn, then average the per-workout precision
all_precision = [eval(row_position) for row_position in range(len(new_data.index))]
mean_precision = sum(all_precision) / len(all_precision)
print('System precision:', "{:.0%}".format(mean_precision))

System precision: 64%
