In [18]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [19]:
# Read the exercise-tracking CSV (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='gym_members_exercise_tracking.csv')
# Show five randomly drawn rows as a quick look at the columns and values.
data.sample(n=5)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
574,44,Male,124.8,1.85,175,126,69,1.35,842.0,Strength,24.4,3.5,4,2,36.46
221,49,Male,82.1,1.86,162,160,64,1.87,1481.0,Strength,14.2,3.5,4,3,23.73
154,45,Male,113.4,1.71,190,129,63,0.83,530.0,Strength,24.0,2.3,2,1,38.78
941,21,Male,67.0,1.61,161,152,74,1.32,1104.0,Yoga,22.0,2.1,3,2,25.85
531,53,Female,53.3,1.74,173,156,54,1.37,962.0,Strength,32.2,1.8,3,2,17.6


In [20]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(973, 15)

In [21]:
# Encode Gender as a binary indicator: 1 where the value is 'Male', 0 otherwise.
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# comparing it to 'Male' again would silently turn every row into 0.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = (data['Gender'] == 'Male').astype(int)
data.sample(5)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
321,50,0,55.4,1.55,175,122,65,1.85,1016.0,Strength,18.5,2.7,4,3,23.06
953,50,0,67.9,1.77,171,164,73,1.03,760.0,Yoga,33.0,2.4,3,2,21.67
57,21,0,50.3,1.52,171,154,67,1.08,832.0,Cardio,32.2,2.6,3,1,21.77
207,33,0,65.3,1.71,172,135,74,1.18,796.0,HIIT,29.7,2.5,4,2,22.33
467,19,1,98.6,1.7,197,162,74,0.83,740.0,Cardio,21.1,2.4,2,1,34.12


In [22]:
# Seed NumPy's global RNG before splitting.
np.random.seed(113)

# Separate the predictors from the Workout_Type label.
features = data.drop(columns=['Workout_Type'])
target = data['Workout_Type']

# 70/30 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, stratify=target)

In [23]:
# Histogram of the training-set labels.
class_hist = go.Histogram(x=y_train,
                          xbins=go.histogram.XBins(size=1))
fig = go.Figure(data=[class_hist])

# Title and axis captions are user-facing strings and are kept as-is.
fig.update_layout(
    title='Распределение классов',
    xaxis_title='Класс',
    yaxis_title='Количество',
)
fig.show()

In [24]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fitted on the training split and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Replace the string labels with integer codes; the mapping is learned from
# the training labels and reused for the test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [25]:
from sklearn.metrics import accuracy_score, f1_score, classification_report


def train_eval_model(model, X_train, X_test, y_train, y_test, average='micro'):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        average: Averaging mode forwarded to ``f1_score``. Defaults to
            ``'micro'``, the value that used to be hard-coded. For
            single-label multiclass targets micro-averaged F1 coincides with
            accuracy, so pass ``'macro'`` or ``'weighted'`` to obtain a
            metric that differs from the first returned value.

    Returns:
        tuple: ``(accuracy, f1)`` computed from the test-split predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=average)
    return accuracy, f1

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings

# Hide ConvergenceWarning messages emitted while the models are being fitted.
warnings.simplefilter('ignore', category=ConvergenceWarning)

# Fit each candidate on the standardized training data and score it on the
# test data. Rows are collected as plain records and turned into a DataFrame
# once at the end, instead of enlarging the frame one row at a time.
models = []
result_rows = []
for model in [KNeighborsClassifier(n_neighbors=3), SVC(), GaussianNB(),
              DecisionTreeClassifier(), LogisticRegression(),
              CatBoostClassifier(silent=True)]:
    acc, f1 = train_eval_model(model, X_train_std, X_test_std, y_train, y_test)
    models.append(model)
    result_rows.append({'model_name': type(model).__name__,
                        'test_acc': acc,
                        'test_f1': f1})

results = pd.DataFrame(result_rows,
                       columns=['model_name', 'test_acc', 'test_f1'])
results

Unnamed: 0,model_name,test_acc,test_f1
0,KNeighborsClassifier,0.263699,0.263699
1,SVC,0.287671,0.287671
2,GaussianNB,0.226027,0.226027
3,DecisionTreeClassifier,0.284247,0.284247
4,LogisticRegression,0.291096,0.291096
5,CatBoostClassifier,0.270548,0.270548


In [39]:
from sklearn.model_selection import cross_validate


def train_eval_model_cv(model, X_train, X_test, y_train, y_test, cv=3,
                        average='micro'):
    """Cross-validate ``model`` and score the best fold's estimator on the test split.

    ``cross_validate`` is run on the training data with ``return_estimator=True``;
    the estimator belonging to the fold with the highest validation score
    (the first one on ties) is then used to predict the test split.

    Note that the returned estimator comes from a single cross-validation
    split, i.e. it was fitted on a subset of ``X_train`` rather than on all
    of it.

    Args:
        model: Estimator passed to ``cross_validate``.
        X_train: Training features used for cross-validation.
        X_test: Test features.
        y_train: Training labels used for cross-validation.
        y_test: Test labels.
        cv: Cross-validation setting forwarded to ``cross_validate``.
            Defaults to ``3``, the value that used to be hard-coded.
        average: Averaging mode forwarded to ``f1_score``. Defaults to
            ``'micro'``, the value that used to be hard-coded.

    Returns:
        tuple: ``(accuracy, f1, best_cv)`` where the two metrics are computed
        on the test split and ``best_cv`` is the selected fold estimator.
    """
    cv_results = cross_validate(model, X_train, y_train, cv=cv,
                                return_estimator=True)
    # argmax returns the first index of the maximum, matching the previous
    # "first fold whose score equals the max" lookup.
    best_fold = int(np.argmax(cv_results['test_score']))
    best_cv = cv_results['estimator'][best_fold]
    y_pred = best_cv.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average=average)
    return accuracy, f1, best_cv

In [40]:
# Cross-validate each candidate, keep the estimator selected by
# train_eval_model_cv, and record its test-split metrics. Rows are collected
# as plain records and turned into a DataFrame once at the end, instead of
# enlarging the frame one row at a time.
models_cv = []
result_rows_cv = []
for model in [KNeighborsClassifier(n_neighbors=3), SVC(), GaussianNB(),
              DecisionTreeClassifier(), LogisticRegression(),
              CatBoostClassifier(silent=True)]:
    acc, f1, cv_model = train_eval_model_cv(model, X_train_std, X_test_std,
                                            y_train, y_test)
    models_cv.append(cv_model)
    result_rows_cv.append({'model_name': type(model).__name__,
                           'test_acc': acc,
                           'test_f1': f1})

results_cv = pd.DataFrame(result_rows_cv,
                          columns=['model_name', 'test_acc', 'test_f1'])
results_cv

Unnamed: 0,model_name,test_acc,test_f1
0,KNeighborsClassifier,0.267123,0.267123
1,SVC,0.267123,0.267123
2,GaussianNB,0.260274,0.260274
3,DecisionTreeClassifier,0.280822,0.280822
4,LogisticRegression,0.280822,0.280822
5,CatBoostClassifier,0.229452,0.229452


In [41]:
# Pair every estimator in models_cv with its class name, giving a list of
# (name, estimator) tuples.
estimators = list(zip((type(m).__name__ for m in models_cv), models_cv))
estimators

[('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=3)),
 ('SVC', SVC()),
 ('GaussianNB', GaussianNB()),
 ('DecisionTreeClassifier', DecisionTreeClassifier()),
 ('LogisticRegression', LogisticRegression()),
 ('CatBoostClassifier', <catboost.core.CatBoostClassifier at 0x222d31b2f50>)]

In [42]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble built from every entry of models_cv except the last,
# each registered under its class name.
voting_members = [(type(model).__name__, model) for model in models_cv[:-1]]
eclf = VotingClassifier(estimators=voting_members, voting='hard')

# Fit the ensemble on the standardized training data and predict the test split.
eclf.fit(X_train_std, y_train)
res_eclf = eclf.predict(X_test_std)

In [43]:
# Test-split metrics for the voting ensemble's predictions.
f1 = f1_score(y_true=y_test, y_pred=res_eclf, average='micro')
accuracy = accuracy_score(y_true=y_test, y_pred=res_eclf)
(accuracy, f1)

(0.2671232876712329, 0.2671232876712329)