Код импортирует основные библиотеки: numpy и pandas для работы с данными, seaborn и matplotlib для построения графиков, а также plotly для создания интерактивных визуализаций.

In [3]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px


Загружается датасет gym_members_exercise_tracking.csv, который содержит информацию о клиентах спортзала. Данные представлены в табличном формате.

In [4]:
# Load the gym members exercise-tracking dataset from the working directory.
df = pd.read_csv("gym_members_exercise_tracking.csv")

Выполняется начальное изучение данных: типы столбцов, количество строк и столбцов, наличие пропусков, чтобы понять качество датасета и подготовить его для анализа.

In [5]:
# Preview five randomly sampled rows (no seed is set, so the rows differ per run).
df.sample(5)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
271,52,Male,105.2,1.62,189,141,61,1.36,949.0,HIIT,27.6,3.2,3,1,40.09
563,18,Female,65.3,1.55,184,161,59,1.42,1143.0,Yoga,28.5,2.2,3,2,27.18
457,29,Male,88.1,1.93,186,143,66,1.88,1479.0,Cardio,11.1,3.5,5,3,23.65
681,27,Male,108.0,1.8,174,122,70,1.38,926.0,Cardio,28.6,2.5,4,2,33.33
484,42,Male,85.2,1.81,189,151,65,1.54,1151.0,Yoga,13.7,3.5,5,3,26.01


In [6]:
# Number of rows (records) in the dataset.
num_records = df.shape[0]
num_records

973

In [7]:
# Number of columns (features) in the dataset.
num_columns = df.shape[1]
num_columns

15

In [8]:
# Drop every row that is missing a value in at least one key metric.
key_metrics = [
    'Session_Duration (hours)',
    'Calories_Burned',
    'Workout_Frequency (days/week)',
    'BMI',
]
df = df.dropna(subset=key_metrics, how='any')

In [9]:
# Fill gaps in the remaining numeric metrics with each column's median.
metrics_to_fill = [
    'Water_Intake (liters)',
    'Fat_Percentage',
    'Resting_BPM',
    'Avg_BPM',
]
for metric in metrics_to_fill:
    # Columns absent from the dataset are skipped rather than treated as errors.
    if metric not in df.columns:
        continue
    df[metric] = df[metric].fillna(df[metric].median())

In [10]:
# Fill gaps in categorical columns with the most frequent value (the mode).
categorical_columns = ['Gender', 'Workout_Type', 'Experience_Level']
for column in categorical_columns:
    # Columns absent from the dataset are skipped rather than treated as errors.
    if column not in df.columns:
        continue
    modes = df[column].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # there is nothing to impute with in that case, so leave the column as is.
    if modes.empty:
        continue
    # Ties yield several modes; take the first one positionally.
    df[column] = df[column].fillna(modes.iloc[0])

In [11]:
def summary(df):
    """Build a per-column overview table for a DataFrame.

    The result has one row per column of ``df`` and these columns:
    ``data type``, ``#missing`` (null count), ``Duplicate`` (the number of
    duplicated rows in the whole frame, repeated on every row), ``#unique``,
    and the ``describe(include='all')`` statistics ``min``, ``max``, ``avg``,
    ``std dev``, ``top value`` and ``Freq``.

    Statistics that ``describe`` does not produce for the given frame (for
    example ``top``/``freq`` on an all-numeric frame, or ``min``/``max`` on an
    all-categorical one) are reported as NaN instead of raising ``KeyError``.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A DataFrame indexed by the column names of ``df``.
    """
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['Duplicate'] = df.duplicated().sum()
    summ['#unique'] = df.nunique().values

    # Maps the describe() statistic name to the column name used in the summary.
    stat_columns = {
        'min': 'min',
        'max': 'max',
        'mean': 'avg',
        'std': 'std dev',
        'top': 'top value',
        'freq': 'Freq',
    }
    desc = df.describe(include='all').transpose()
    # describe() omits numeric statistics for non-numeric frames and vice
    # versa; reindexing adds the absent ones as all-NaN columns.
    desc = desc.reindex(columns=list(stat_columns))
    for stat_name, summary_name in stat_columns.items():
        summ[summary_name] = desc[stat_name].values

    return summ

# Display the per-column summary table for the loaded dataset.
summary(df)

Unnamed: 0,data type,#missing,Duplicate,#unique,min,max,avg,std dev,top value,Freq
Age,int64,0,0,42,18.0,59.0,38.683453,12.180928,,
Gender,object,0,0,2,,,,,Male,511.0
Weight (kg),float64,0,0,532,40.0,129.9,73.854676,21.2075,,
Height (m),float64,0,0,51,1.5,2.0,1.72258,0.12772,,
Max_BPM,int64,0,0,40,160.0,199.0,179.883864,11.525686,,
Avg_BPM,int64,0,0,50,120.0,169.0,143.766701,14.345101,,
Resting_BPM,int64,0,0,25,50.0,74.0,62.223022,7.32706,,
Session_Duration (hours),float64,0,0,147,0.5,2.0,1.256423,0.343033,,
Calories_Burned,float64,0,0,621,303.0,1783.0,905.422405,272.641516,,
Workout_Type,object,0,0,4,,,,,Strength,258.0


In [12]:
# Print each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            973 non-null    int64  
 1   Gender                         973 non-null    object 
 2   Weight (kg)                    973 non-null    float64
 3   Height (m)                     973 non-null    float64
 4   Max_BPM                        973 non-null    int64  
 5   Avg_BPM                        973 non-null    int64  
 6   Resting_BPM                    973 non-null    int64  
 7   Session_Duration (hours)       973 non-null    float64
 8   Calories_Burned                973 non-null    float64
 9   Workout_Type                   973 non-null    object 
 10  Fat_Percentage                 973 non-null    float64
 11  Water_Intake (liters)          973 non-null    float64
 12  Workout_Frequency (days/week)  973 non-null    int

In [13]:
# Count the distinct values in every column.
df.nunique()

Age                               42
Gender                             2
Weight (kg)                      532
Height (m)                        51
Max_BPM                           40
Avg_BPM                           50
Resting_BPM                       25
Session_Duration (hours)         147
Calories_Burned                  621
Workout_Type                       4
Fat_Percentage                   239
Water_Intake (liters)             23
Workout_Frequency (days/week)      4
Experience_Level                   3
BMI                              771
dtype: int64

In [14]:
import plotly.express as px

# Columns treated as categorical for the distribution charts.
cat_column_names = [
    'Gender',
    'Workout_Type',
    'Workout_Frequency (days/week)',
    'Experience_Level',
]
cat_columns = df[cat_column_names]

def analys(cols, data=None):
    """Show a bar chart and a donut chart of one column's value distribution.

    Two figures are displayed with ``fig.show()``: a bar chart of the raw
    value counts and a donut (pie with a hole) of the percentage shares,
    both on a black background.

    Args:
        cols: Name of the column to analyse.
        data: DataFrame that holds the column. When omitted, the
            module-level ``cat_columns`` frame is used, as before.

    Returns:
        None.
    """
    if data is None:
        data = cat_columns

    colors = [
        '#FFD700', '#FF6347', '#40E0D0', '#FF69B4', '#7FFFD4',
        '#FFA500', '#00FA9A', '#FF4500', '#4682B4', '#DA70D6',
        '#FFB6C1', '#FF1493', '#FF8C00', '#98FB98', '#9370DB',
        '#32CD32', '#00CED1', '#1E90FF', '#FFFF00', '#7CFC00'
    ]
    # Dark-theme settings shared by both figures.
    dark_layout = dict(
        plot_bgcolor='#000000',
        paper_bgcolor='#000000',
        font=dict(color='white', size=12),
        title_font=dict(size=30),
        legend_font=dict(color='white', size=12),
        width=500,
        height=400,
    )
    value_counts = data[cols].value_counts()

    fig = px.bar(
        value_counts,
        x=value_counts.index,
        y=value_counts.values,
        title=f'Distribution of {cols}',
        labels={'x': 'Categories', 'y': 'Count'},
        # NOTE(review): the palette is deliberately wrapped in a list;
        # presumably this hands the whole palette to the single bar trace so
        # that each bar gets its own color - confirm against plotly behavior.
        color_discrete_sequence=[colors]
    )
    fig.update_layout(**dark_layout)
    fig.show()

    percentage = (value_counts / value_counts.sum()) * 100

    fig = px.pie(
        values=percentage,
        names=value_counts.index,
        labels={'names': 'Categories', 'values': 'Percentage'},
        hole=0.5,
        color_discrete_sequence=colors
    )
    # The column name is written in the middle of the donut hole.
    fig.add_annotation(
        x=0.5, y=0.5,
        text=f'{cols}',
        font=dict(size=18, color='white'),
        showarrow=False
    )
    fig.update_layout(legend=dict(x=0.9, y=0.5), **dark_layout)
    fig.show()

# Draw the distribution charts for every categorical column in turn.
for column_name in cat_columns.columns:
    analys(column_name)

In [30]:
import plotly.express as px

# Bar colors for the histograms, selected by position via ``color_index``.
colors = ['#FFD700', '#FFB6C1', '#32CD32', '#1E90FF']

def create_histplot(df, x, title, nbins=50, color_index=0):
    """Display a dark-themed histogram of one column.

    Args:
        df: DataFrame holding the data.
        x: Name of the column to plot on the x axis.
        title: Title shown above the chart.
        nbins: Bin count passed to ``px.histogram``.
        color_index: Position in the module-level ``colors`` list of the
            bar color to use.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # The title used to be accepted but never passed on, so charts were untitled.
    fig = px.histogram(df, x=x, nbins=nbins, title=title)
    fig.update_traces(marker_color=colors[color_index])
    fig.update_layout(
        plot_bgcolor='black',
        paper_bgcolor='black',
        font_color='white'
    )
    fig.show()

# One histogram per metric; each column name doubles as the chart title and
# the position in the list selects the bar color.
histogram_columns = [
    'Weight (kg)',
    'Session_Duration (hours)',
    'Calories_Burned',
    'BMI',
]
for palette_position, histogram_column in enumerate(histogram_columns):
    create_histplot(
        df,
        histogram_column,
        histogram_column,
        nbins=50,
        color_index=palette_position,
    )

In [29]:
import plotly.express as px

def groupby(data, x):
    """Count the rows of ``data`` for each distinct value of ``x``.

    Args:
        data: DataFrame to aggregate.
        x: Column name (or names) to group by.

    Returns:
        A DataFrame with the grouping column(s) followed by a ``count``
        column holding the group sizes.
    """
    group_sizes = data.groupby(x).size()
    return group_sizes.reset_index(name='count')

def create_scatter_plot(data, x, y, title, xaxis_title, yaxis_title, color, width=600, height=400):
    """Display a dark-themed scatter plot whose marker size follows ``y``.

    Args:
        data: DataFrame holding the points.
        x: Column name for the x axis.
        y: Column name for the y axis; also used as the marker size.
        title: Title shown above the chart.
        xaxis_title: Label of the x axis.
        yaxis_title: Label of the y axis.
        color: Marker color, used as a one-element discrete color sequence.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = px.scatter(data, x=x, y=y, size=y, color_discrete_sequence=[color])
    fig.update_traces(marker=dict(opacity=1))
    fig.update_layout(
        # The title used to be accepted but never applied to the figure.
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        plot_bgcolor='black',
        width=width,
        paper_bgcolor='black',
        font=dict(color='white'),
        xaxis=dict(showgrid=False),
        yaxis=dict(showgrid=False),
        height=height
    )
    fig.show()

# Marker colors, picked by the feature's position in ``features``.
color = [
    '#FFD700', '#FFA500', '#00FA9A', '#FFB6C1', '#FF1493',
    '#32CD32', '#00CED1', '#1E90FF', '#FFFF00', '#7CFC00',
]

# Numeric features whose value frequencies are plotted.
features = [
    'Age',
    'Height (m)',
    'Max_BPM',
    'Avg_BPM',
    'Session_Duration (hours)',
    'Resting_BPM',
    'Fat_Percentage',
    'Water_Intake (liters)',
]

for i, feature in enumerate(features):
    # Count how often each distinct value occurs, then plot value vs. count.
    grouped_data = groupby(df, feature)
    create_scatter_plot(
        data=grouped_data,
        x=feature,
        y='count',
        title=f'{feature} Distribution',
        xaxis_title=feature,
        yaxis_title='Count',
        color=color[i],
    )

In [24]:
# Correlation heatmap over the numeric columns only.
numeric_df = df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

heatmap_trace = go.Heatmap(
    z=correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap_trace)

# Both axes share the same white-on-black styling with grey grid lines.
axis_style = dict(color='white', showgrid=True, gridcolor='grey')
fig.update_layout(
    title='Correlation Heatmap',
    title_font=dict(color='white'),
    xaxis=axis_style,
    yaxis=axis_style,
    plot_bgcolor='black',
    paper_bgcolor='black',
    font=dict(color='white'),
)
fig.show()