# Finding Experience Level of Gym Members

In [None]:

# Importing Necessary Libraries and Packages.

import pandas as pd 
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, accuracy_score,precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from imblearn.over_sampling._smote.base import SMOTE
import missingno as msno
import os
import tensorflow as tf
import warnings



In [None]:

## Silencing warnings and configuring TensorFlow logging.

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): TensorFlow has already been imported at this point; this
# variable is usually set before `import tensorflow` -- confirm that it still
# takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Raise the level of TensorFlow's Python logger to ERROR.
tf.get_logger().setLevel('ERROR')
# Switch off JIT compilation in the TensorFlow optimizer.
tf.config.optimizer.set_jit(False)


In [None]:

# Plot defaults: figure size and seaborn grid style

plt.rcParams.update({"figure.figsize": (10, 6)})
sns.set_style(style="whitegrid")


In [None]:

#### Pandas display settings

# Show every column and row, print floats with three decimals and allow wide
# output lines. The options are applied in the order listed.
display_options = (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.float_format', lambda x: '%.3f' % x),
    ('display.width', 500),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)


In [None]:

### Reading Data

# The CSV location can be overridden through the GYM_DATA_PATH environment
# variable so that the notebook is not tied to a single machine. The original
# absolute path is kept as the fallback.
DATA_PATH = os.environ.get(
    'GYM_DATA_PATH',
    '/Users/erkan/Downloads/feature_engineering/feature_engineering/datasets/gym_members_exercise_tracking.csv',
)

# Seed NumPy's global random number generator.
np.random.seed(12345)

# Keep the raw frame untouched and do all further work on a copy.
data = pd.read_csv(DATA_PATH)
dataset = data.copy()


## First Look at the Data

In [None]:
# Preview the first rows of the data.
dataset.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
dataset.info()

In [None]:

# Count the missing values in every column.
dataset.isnull().sum()

### There are no null values in the dataset.

In [None]:

# Count the rows that are exact duplicates of an earlier row.
dataset.duplicated().sum()

### There are no duplicated rows in the dataset.

In [None]:
# (number of rows, number of columns)
dataset.shape

## Exploratory Data Analysis

In [None]:

#### Grabbing categorical and numerical columns

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of a dataframe into categorical, numerical and cardinal groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        Non-object columns with fewer than ``cat_th`` distinct values are
        treated as categorical.
    car_th : int, default 20
        Object columns with more than ``car_th`` distinct values are treated
        as cardinal ("categorical but cardinal").

    Returns
    -------
    cat_cols : list
        Object columns that are not cardinal, followed by the non-object
        columns that are treated as categorical.
    num_cols : list
        Non-object columns that are not treated as categorical.
    cat_but_car : list
        Object columns with more than ``car_th`` distinct values.

    Notes
    -----
    A short summary of the group sizes is printed as a side effect.
    """
    # First pass: separate object-typed columns from everything else.
    object_cols = []
    non_object_cols = []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    # Low-cardinality non-object columns behave like categories,
    # high-cardinality object columns behave like identifiers.
    num_but_cat = [name for name in non_object_cols if dataframe[name].nunique() < cat_th]
    cat_but_car = [name for name in object_cols if dataframe[name].nunique() > car_th]

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_vars = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_vars}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car


# Classify the columns of the dataset using the default thresholds.
cat_cols, num_cols, cat_but_car = grab_col_names(dataset)


In [None]:
#### Numerical columns analysis

# Summary statistics, transposed so that each row describes one column.
dataset.describe().T


In [None]:
### Distribution of Numerical Columns

# One histogram per numerical column, each shown as its own figure.
for col in num_cols:
    sns.histplot(data=dataset, x=col)
    plt.show()

In [None]:
### Outlier Detection of Numerical Columns with Boxplots

# One boxplot per numerical column, each shown as its own figure.
for col in num_cols:
    sns.boxplot(data=dataset, x=col)
    plt.show()

# The BMI, Calories_Burned and Weight columns have outliers.


In [None]:

### Relation Between Numerical Columns and the Target

# Scatter every numerical column against the experience level.
for col in num_cols:
    sns.scatterplot(data=dataset, x=col, y='Experience_Level')
    plt.show()



In [None]:

## Looking at the relationship between the target column and the numerical columns.

def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for every value of ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds both columns.
    target : str
        Column whose values define the groups.
    numerical_col : str
        Column whose mean is computed inside each group.

    Returns
    -------
    None
        The table is printed, followed by a blank line.
    """
    group_means = dataframe.groupby(target).agg({numerical_col: "mean"})
    print(group_means, end="\n\n")

# Print the per-class mean of every numerical column.
for col in num_cols:
    target_summary_with_num(dataframe=dataset, target="Experience_Level", numerical_col=col)

In [None]:

### Correlation Analysis

# Pairwise correlations between the numerical columns.
cor_matrix = dataset[num_cols].corr()

# Size only this figure. Calling `sns.set(rc=...)` here would reset the whole
# seaborn theme (dropping the "whitegrid" style) and resize every later plot.
fig, ax = plt.subplots(figsize=(12, 12))
# Pin the colour scale to [-1, 1] so that the diverging palette is centred on
# zero correlation and the colours can be read as negative / positive.
sns.heatmap(cor_matrix, cmap="RdBu", vmin=-1, vmax=1, ax=ax)
plt.show()

# There is a negative correlation between the fat percentage and both the
# session duration and the calories burned, and a positive correlation between
# BMI and weight, as expected.
# The absolute values of the other correlations are below 0.5, so they are not
# considered further.

In [None]:
####### Categorical Columns Analysis


def cat_summary(dataframe, col_name):
    """Print the frequency and percentage of every value of a column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    col_name : str
        Column to summarise.

    Returns
    -------
    None
        The table is printed, followed by a separator line.
    """
    counts = dataframe[col_name].value_counts()
    # Share of each value in percent of all rows.
    ratios = 100 * counts / len(dataframe)
    summary = pd.DataFrame({col_name: counts, "Ratio": ratios})
    print(summary)
    print("##########################################")

# Print the frequency table of every categorical column first ...
for col in cat_cols:
    cat_summary(dataframe=dataset, col_name=col)

# ... then draw one count plot per categorical column.
for col in cat_cols:
    sns.countplot(data=dataset, x=dataset[col])
    plt.show(block=True)


# The dataset is imbalanced; resampling techniques could be used
# in order to build a more robust model.


In [None]:

##### Relation Between Categorical Columns And Target

def target_summary_with_cat(dataframe, target, categorical_col):
    """Print the mean and the count of ``target`` for every category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds both columns.
    target : str
        Column that is aggregated.
    categorical_col : str
        Column whose values define the groups.

    Returns
    -------
    None
        The table is printed, followed by two blank lines.
    """
    # Output column name -> (source column, aggregation function).
    aggregations = {
        "TARGET_MEAN": (target, 'mean'),
        "COUNTS": (target, 'count'),
    }
    grouped = dataframe.groupby(categorical_col)
    summary_df = grouped.agg(**aggregations).reset_index()

    print(summary_df, end="\n\n\n")
    

## Feature Engineering with ColumnTransformer

In [None]:
## Outliers

# Since we will use models that are sensitive to outliers, the outliers have to be handled.
# From the EDA part we know that the BMI, Calories_Burned and Weight columns have outliers.

# Columns that showed outliers in the boxplots.
has_out = ['BMI','Calories_Burned',  'Weight (kg)']

# BMI

# Summary statistics of BMI, followed by the mean BMI per experience level.
dataset['BMI'].describe()
dataset.groupby('Experience_Level').agg({'BMI' : 'mean'})

#### Solutions for BMI

## I do not want to use the IQR to replace outliers with quartile values, because
# we also have height and weight, which should be taken into account as well.

## BMI can be categorized, which would solve our outlier problem, but
# we have to consider the curse of dimensionality.
# We can avoid the curse of dimensionality by using a categorical (ordinal) mapping instead of a one-hot encoder.

In [None]:
# Categorizing

# BMI class boundaries: <18.5, 18.5-<25, 25-<30, 30-<35, 35-<40, >=40.
# With right=False every bin is [lower, upper), so the edges have to be the
# round thresholds; an edge such as 24.9 would push a BMI of 24.95 into the
# next class.
bins = [0, 18.5, 25, 30, 35, 40, float('inf')]
labels = [1, 2, 3, 4, 5, 6]

# Use pd.cut() to map every BMI value to the ordinal label of its bin.
dataset['BMI_Label'] = pd.cut(dataset['BMI'], bins=bins, labels=labels, right=False)
# The raw BMI column is replaced by its label. `columns=` already names the
# axis, so no additional `axis=1` is passed.
dataset.drop(columns='BMI', inplace=True)

In [None]:
# Summary statistics of the Calories_Burned column.
dataset['Calories_Burned'].describe()

# Robust scaling could be used for this column.

In [None]:

# Summary statistics of the Weight (kg) column.
dataset['Weight (kg)'].describe()

# Robust scaling could be used for this column.


In [None]:
## Separating the features (X) from the target (y)

# The experience level is the target; every other column is a feature.
y = dataset['Experience_Level']
X = dataset.drop('Experience_Level', axis=1)

In [None]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# imbalanced class proportions the same in the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12345, stratify=y
)

In [None]:
### CREATING COLUMNTRANSFORMER FOR PREPROCESSING

# Columns that are standardised.
# 'Calories_Burned' is not listed here: it is robust-scaled below, and naming
# it in both lists would feed the models two copies of the same feature.
std = ['Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)',
       'Fat_Percentage', 'Water_Intake (liters)', 'Workout_Frequency (days/week)']
# Columns with outliers, scaled with RobustScaler instead.
rbst = ['Calories_Burned', 'Weight (kg)']
# Nominal columns that are one-hot encoded.
ohe = ['Workout_Type', 'Gender']


preprocessor = ColumnTransformer(
    [
        ('ohe', OneHotEncoder(drop='first'), ohe),
        ('std_scaler', StandardScaler(), std),
        ('rbst_scaler', RobustScaler(), rbst),
    ],
    # ColumnTransformer drops every unlisted column by default, which would
    # silently discard the engineered 'BMI_Label' feature (and any other
    # column not named above). Standardise the remaining columns instead.
    # NOTE(review): assumes every remaining column is numeric -- confirm
    # against the dataset's columns.
    remainder=StandardScaler(),
)


## Building KNN

In [None]:

#### CROSS VALIDATION FOR DETERMINING K FOR KNN

# Candidate numbers of neighbours.
k_values = [3, 5, 7, 9, 11, 13, 15]

sf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12345)

# The pipeline has to exist before the loop can call `set_params` on it.
# SMOTE is a pipeline step so that oversampling happens inside each
# cross-validation fit rather than on the whole data up front.
pipeline = imbpipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=12345)),
    ('classifier', KNeighborsClassifier())
])

# k -> mean cross-validated accuracy
mean_scores = {}

for k in k_values:
    pipeline.set_params(classifier__n_neighbors=k)
    # Tune on the training split only, so that the test split stays unseen
    # until the final evaluation.
    scores = cross_val_score(pipeline, X_train, y_train, cv=sf, scoring='accuracy')
    mean_scores[k] = scores.mean()

# The k with the highest mean accuracy wins.
best_k = max(mean_scores, key=mean_scores.get)

print(f"Best k value: {best_k}")
print(f"Best cross-validation accuracy: {mean_scores[best_k]}")

In [None]:

## EVALUATION FOR K=15

# Preprocess -> oversample with SMOTE -> KNN with 15 neighbours.
# SMOTE is seeded like in the other pipelines so that the synthetic samples,
# and therefore the reported metrics, are reproducible.
pipeline = imbpipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=12345)),
    ('classifier', KNeighborsClassifier(n_neighbors=15))
])

# Fit on the training split and predict the held-out test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


###  Evaluation metrics

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# ROC AUC (one-vs-rest, macro-averaged over the classes); needs class probabilities
y_proba = pipeline.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
print(f"ROC AUC Score (Macro Average): {roc_auc:.2f}")

# Precision
precision = precision_score(y_test, y_pred, average="macro")
print(f"Precision (Macro Average): {precision:.2f}")

# Recall
recall_macro = recall_score(y_test, y_pred, average='macro')
print(f"Recall (Macro): {recall_macro:.2f}")

# F1
f1 = f1_score(y_test, y_pred, average="macro")
print(f"F1 Score: {f1:.2f}")

## Building Logistic Regression

In [None]:
#### Logistic Regression

# Preprocess -> oversample with SMOTE -> logistic regression.
pipeline1 = imbpipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=12345)),
    ('classifier', LogisticRegression()),
])

# Fit on the training split, then collect hard predictions and class
# probabilities for the held-out test split.
y_pred = pipeline1.fit(X_train, y_train).predict(X_test)
y_proba = pipeline1.predict_proba(X_test)

###  Evaluation metrics

# All scores are computed first and reported afterwards.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
precision = precision_score(y_test, y_pred, average="macro")
recall_macro = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average="macro")

print(f"Accuracy: {accuracy}")
print(f"ROC AUC Score (Macro Average): {roc_auc:.2f}")
print(f"Precision (Macro Average): {precision:.2f}")
print(f"Recall (Macro): {recall_macro:.2f}")
print(f"F1 Score: {f1:.2f}")


## Building Perceptron

In [None]:
#### Perceptron

# Preprocess -> oversample with SMOTE -> perceptron.
# The perceptron is seeded as well, so that the reported metrics are
# reproducible when the cell is re-run.
pipeline1 = imbpipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=12345)),
    ('classifier', Perceptron(random_state=12345))
])

# Fit on the training split and predict the held-out test split.
pipeline1.fit(X_train, y_train)
y_pred = pipeline1.predict(X_test)

###  Evaluation metrics
# No ROC AUC is reported for this model.

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Precision
precision = precision_score(y_test, y_pred, average="macro")
print(f"Precision (Macro Average): {precision:.2f}")

# Recall
recall_macro = recall_score(y_test, y_pred, average='macro')
print(f"Recall (Macro): {recall_macro:.2f}")

# F1
f1 = f1_score(y_test, y_pred, average="macro")
print(f"F1 Score: {f1:.2f}")

## Discussion

Logistic Regression performed best of the three models after SMOTE was used to alleviate the class imbalance. Because it produces probability estimates for each class, the model could shift its decision boundaries and make better use of the synthetic samples produced by SMOTE, which improved performance on the minority classes.

Even with the advantage of the synthetic samples, KNN was still unable to overcome the imbalance: its predictions depend on the nearest neighbors, which can bias it in favor of the majority class, particularly in high-dimensional data.

Since the perceptron is a linear model, it only slightly improved with SMOTE, producing less-than-ideal results because it was unable to accurately represent the intricate interactions between the classes.

In conclusion, once SMOTE was used, the most appropriate model for this multi-class classification problem was logistic regression due to its adaptability and capacity to account for class imbalance.