### Importing Libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot, init_notebook_mode, plot, iplot 
import plotly.express as px 
init_notebook_mode(connected=True)

In [2]:
df = pd.read_csv('data/gym_membership.csv')
df.head() 

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False


In [3]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     1000 non-null   int64 
 1   gender                 1000 non-null   object
 2   birthday               1000 non-null   object
 3   Age                    1000 non-null   int64 
 4   abonoment_type         1000 non-null   object
 5   visit_per_week         1000 non-null   int64 
 6   days_per_week          1000 non-null   object
 7   attend_group_lesson    1000 non-null   bool  
 8   fav_group_lesson       503 non-null    object
 9   avg_time_check_in      1000 non-null   object
 10  avg_time_check_out     1000 non-null   object
 11  avg_time_in_gym        1000 non-null   int64 
 12  drink_abo              1000 non-null   bool  
 13  fav_drink              496 non-null    object
 14  personal_training      1000 non-null   bool  
 15  name_personal_trainer 

In [4]:
df.shape

(1000, 17)

In [5]:
df.columns

Index(['id', 'gender', 'birthday', 'Age', 'abonoment_type', 'visit_per_week',
       'days_per_week', 'attend_group_lesson', 'fav_group_lesson',
       'avg_time_check_in', 'avg_time_check_out', 'avg_time_in_gym',
       'drink_abo', 'fav_drink', 'personal_training', 'name_personal_trainer',
       'uses_sauna'],
      dtype='object')

In [None]:

df.describe().T.drop('id', axis=0).drop("count", axis=1)

Unnamed: 0,mean,std,min,25%,50%,75%,max
Age,30.604,10.817958,12.0,21.0,30.0,40.0,49.0
visit_per_week,2.682,1.241941,1.0,2.0,3.0,3.0,5.0
avg_time_in_gym,105.26,43.557177,30.0,67.0,104.0,143.0,180.0


### Gender Distribution

In [7]:
init_notebook_mode(connected=True)
fig=go.Figure()
fig.add_trace(go.Pie(
    labels=df['gender'].value_counts().index,
    values=df['gender'].value_counts().values,
    title='Gender Distribution'
))
iplot(fig)
fig.write_html("vis/Gender_Distribution.html") 


### Distribution of Age 

In [8]:
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Create histogram trace
fig = go.Figure(data=[go.Histogram(
    x=df['Age'], 
    nbinsx=20,  # Adjust number of bins as needed
    marker_color='skyblue',
    opacity=0.7
)])

# Customize the layout
fig.update_layout(
    title='Distribution of Age',
    xaxis_title='Age',
    yaxis_title='Frequency',
    template='plotly_white'
)

# Show the plot
fig.show()

fig.write_html("vis/Age_Distribution_with_Density_Curve.html")

In [9]:
import numpy as np
hist_data = [df['Age']]
group_labels = ['Age Distribution']
fig_kde = ff.create_distplot(
    hist_data, 
    group_labels, 
    show_hist=False, 
    show_curve=True,
    colors=['skyblue']
)
fig_kde.update_layout(
    title='Age Distribution with Density Curve',
    xaxis_title='Age',
    yaxis_title='Density',
    template='plotly_white'
)
fig_kde.show()
fig_kde.write_html("vis/Age_Distribution.html")

### Abonoment Distribution

In [10]:
fig=go.Figure()
fig.add_trace(go.Pie(
    labels=df['abonoment_type'].value_counts().index,
    values=df['abonoment_type'].value_counts().values,
    title='Abonoment Distribution'
))
iplot(fig)
fig.write_html("vis/abonoment_type.html") 

### Distribution of Visit Per Week 

In [11]:
fig = go.Figure()
fig.add_trace(go.Pie(
    labels=df['visit_per_week'].value_counts().index,
    values=df['visit_per_week'].value_counts().values,
    title='Visit Per Week Distribution',
    textinfo='label+percent+value',  # This will show label, percentage, and actual value
    texttemplate='%{label}<br> (%{percent})',  # Custom formatting
    textposition='auto'  # Automatically positions the text
))
iplot(fig)
fig.write_html("vis/visit_per_week_count.html")

### Count of Days Attendance for people attending single day

In [12]:
week_days=['Sun','Mon','Tue','Wed','Thu','Fri','Sat']
for day in week_days: 
    if day in df['days_per_week'].value_counts().index:
        print(day,df['days_per_week'].value_counts()[day])
# df['days_per_week'].value_counts()

Sun 37
Mon 24
Tue 25
Wed 29
Thu 19
Fri 34
Sat 26


### Count of Days Attendance

In [13]:
week_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
d = {day: 0 for day in week_days}  # Initialize dictionary with all days set to 0

def count_days(txt):
    for day in week_days:
        if day in txt:
            d[day] += 1

for t in df['days_per_week'].values:
    count_days(t)

sorted_day_count = dict(sorted(d.items(), key=lambda item: item[1],reverse=True))

# Optional: Create a bar plot to visualize the results
import plotly.graph_objs as go

fig = go.Figure(data=[go.Bar(
    x=list(sorted_day_count.keys()),
    y=list(sorted_day_count.values()),
    marker_color='skyblue'
)])

fig.update_layout(
    title='Gym Visits by Day of Week',
    xaxis_title='Day of Week',
    yaxis_title='Number of Visits',
    template='plotly_white'
)

fig.show()
fig.write_html("vis/Gym_Visits_by_Day_of_Week.html")

In [14]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False


### Attend Group Lesson Distribution

In [15]:
fig=go.Figure()
fig.add_trace(go.Pie(
    labels=df['attend_group_lesson'].value_counts().index,
    values=df['attend_group_lesson'].value_counts().values,
    title='Attend Group Lesson Distribution'
))
iplot(fig)
fig.write_html("vis/attend_group_lesson_count.html") 

### Fav Group Lesson Distribution

In [16]:
from collections import Counter 
lessons=[]
for i,lesson_row in enumerate(df['fav_group_lesson'].values): 
    # print(i,lesson_row)
    if type(lesson_row)!=str:
        continue
    lessons.extend(lesson_row.split(', '))
print(lessons)
dict(Counter(lessons))

['Kickboxen', 'BodyPump', 'Zumba', 'XCore', 'Running', 'Yoga', 'Zumba', 'LesMiles', 'BodyPump', 'Yoga', 'XCore', 'BodyPump', 'Pilates', 'Zumba', 'XCore', 'XCore', 'HIT', 'Running', 'Spinning', 'Zumba', 'XCore', 'Pilates', 'BodyPump', 'Zumba', 'Pilates', 'Yoga', 'BodyBalance', 'LesMiles', 'HIT', 'BodyBalance', 'Zumba', 'Yoga', 'Zumba', 'BodyBalance', 'Pilates', 'Running', 'Spinning', 'BodyPump', 'Zumba', 'LesMiles', 'HIT', 'XCore', 'Running', 'BodyBalance', 'XCore', 'LesMiles', 'BodyBalance', 'Spinning', 'Yoga', 'Yoga', 'BodyBalance', 'BodyPump', 'Zumba', 'BodyPump', 'Yoga', 'BodyBalance', 'Running', 'Pilates', 'Spinning', 'Kickboxen', 'LesMiles', 'BodyBalance', 'BodyPump', 'Kickboxen', 'XCore', 'Zumba', 'Running', 'Spinning', 'Zumba', 'BodyPump', 'Zumba', 'Spinning', 'BodyBalance', 'XCore', 'Yoga', 'LesMiles', 'Spinning', 'Running', 'Kickboxen', 'Yoga', 'HIT', 'HIT', 'BodyPump', 'BodyBalance', 'XCore', 'Kickboxen', 'HIT', 'BodyPump', 'Yoga', 'Running', 'BodyPump', 'Spinning', 'Pilates'

{'Kickboxen': 85,
 'BodyPump': 112,
 'Zumba': 81,
 'XCore': 90,
 'Running': 82,
 'Yoga': 90,
 'LesMiles': 99,
 'Pilates': 95,
 'HIT': 97,
 'Spinning': 85,
 'BodyBalance': 87}

In [17]:
lessons_count=dict(Counter(lessons))
sorted_lessons_count = dict(sorted(lessons_count.items(), key=lambda item: item[1],reverse=True))

# Optional: Create a bar plot to visualize the results
import plotly.graph_objs as go

fig = go.Figure(data=[go.Bar(
    x=list(sorted_lessons_count.keys()),
    y=list(sorted_lessons_count.values()),
    marker_color='skyblue'
)])

fig.update_layout(
    title='Lessons Count',
    xaxis_title='Lesson',
    yaxis_title='Count of people',
    template='plotly_white'
)

fig.show()
fig.write_html("vis/Lessons_Count.html")

In [18]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False


In [19]:
df.tail()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna
995,996,Female,1984-09-22,40,Standard,3,"Thu, Tue, Wed",False,,20:56:00,22:42:00,106,False,,False,,False
996,997,Female,2008-11-19,15,Standard,3,"Fri, Mon, Sun",True,"XCore, Zumba",09:14:00,11:28:00,134,True,"orange, lemon",False,,True
997,998,Male,1984-10-05,40,Standard,2,"Fri, Tue",False,,17:21:00,19:53:00,152,True,"coconut_pineapple, black_currant",True,Jeffrey,True
998,999,Male,2001-02-22,23,Standard,4,"Mon, Sun, Thu, Tue",True,"HIT, XCore",10:23:00,12:29:00,126,True,berry_boost,True,Jeffrey,False
999,1000,Female,2006-05-07,18,Premium,2,"Thu, Tue",False,,16:41:00,19:00:00,139,True,"passion_fruit, coconut_pineapple",False,,False


### Check-in Hours

In [20]:
check_in_hours=[]
for t in df['avg_time_check_in'].str.split(':'): 
    check_in_hours.append(int(t[0]))
check_in_hours_count=dict(Counter(check_in_hours))
check_in_hours_count    

{19: 78,
 8: 74,
 9: 86,
 17: 87,
 13: 77,
 10: 92,
 20: 78,
 16: 73,
 15: 64,
 14: 81,
 11: 81,
 18: 65,
 12: 64}

In [21]:
def plot_check_hours_scatter(column_name):
    # First, prepare the data
    check_out_hours = []
    for t in df[column_name].str.split(':'): 
        check_out_hours.append(int(t[0]))
    check_out_hours_count = dict(Counter(check_out_hours))

    # Sort the dictionary by keys (hours) to ensure correct line connection
    sorted_hours = sorted(check_out_hours_count.items())

    fig = go.Figure(data=go.Scatter(
        x=[hour for hour, _ in sorted_hours],
        y=[count for _, count in sorted_hours],
        mode='markers+lines',  # Combines scatter points with connecting lines
        marker=dict(
            size=12,
            color=[hour for hour, _ in sorted_hours],  # Color based on hour
            colorscale='Cividis',  # You can change the colorscale
            showscale=False
        ),
        line=dict(color='lightgray', width=2),  # Customize line appearance
        text=[f'Hour: {hour}, Visits: {count}' for hour, count in sorted_hours],
        hoverinfo='text'
    ))

    fig.update_layout(
        title='Check-out Hours Distribution',
        xaxis_title=f'Hour of Check-{str(column_name.split("_")[-1])}',
        yaxis_title='Number of Visits',
        template='plotly_white',
        xaxis=dict(
            tickmode='linear',
            tick0=min(check_out_hours_count.keys()),
            dtick=1
        )
    )

    fig.show()

    # Optional: If you want to save the plot
    fig.write_html(f"vis/{column_name}.html")

plot_check_hours_scatter('avg_time_check_in')

### Check-out Hours

In [22]:
plot_check_hours_scatter('avg_time_check_out')

#### Avg time in gym distribution

In [23]:
df.columns

Index(['id', 'gender', 'birthday', 'Age', 'abonoment_type', 'visit_per_week',
       'days_per_week', 'attend_group_lesson', 'fav_group_lesson',
       'avg_time_check_in', 'avg_time_check_out', 'avg_time_in_gym',
       'drink_abo', 'fav_drink', 'personal_training', 'name_personal_trainer',
       'uses_sauna'],
      dtype='object')

In [24]:
hist_data = [df['avg_time_in_gym']]
group_labels = ['Time in Gym Distribution']
fig_kde = ff.create_distplot(
    hist_data, 
    group_labels, 
    show_hist=False, 
    show_curve=True,
    colors=['skyblue']
)
fig_kde.update_layout(
    title=f'Distribution of Average Time Spent in Gym ',
    xaxis_title='Average Time in Gym (Hours)',
    yaxis_title='Number of Gym Members',
    template='plotly_white'
)
fig_kde.show()
fig_kde.write_html("vis/avg_time_in_gym_Distribution.html")

In [25]:
nbins=10
fig = go.Figure(data=[go.Histogram(
    x=df['avg_time_in_gym'],
    nbinsx=nbins,  # Specify 10 bins
    marker_color='skyblue',
    opacity=0.7
)])

fig.update_layout(
    title=f'Distribution of Average Time Spent in Gym ',
    xaxis_title='Average Time in Gym (Hours)',
    yaxis_title='Number of Gym Members',
    template='plotly_white'
)
fig.show()
fig.write_html("vis/avg_time_in_gym_Histogram.html")

### Drink Abonoment

In [26]:
# Drink Abonoment
fig_drink_abo = go.Figure(data=go.Pie(
    labels=df['drink_abo'].value_counts().index,
    values=df['drink_abo'].value_counts().values,
    title='Distribution of Drink Abonoment',
    textinfo='label+percent+value',
    textposition='auto',
    marker_colors=px.colors.qualitative.Pastel
))
fig_drink_abo.show()
fig_drink_abo.write_html("vis/drink_abo_distribution.html")

### Favorite Drink


In [27]:
# Favorite Drink
included=10
fig_fav_drink = go.Figure(data=go.Bar(
    x=df['fav_drink'].value_counts().sort_values(ascending=False)[:included].index,
    y=df['fav_drink'].value_counts().sort_values(ascending=False)[:included].values,
    marker_color='skyblue',
    text=df['fav_drink'].value_counts().values,
    textposition='outside'
))
fig_fav_drink.update_layout(
    title='Distribution of Favorite Drinks',
    xaxis_title='Drink',
    yaxis_title='Number of Members',
    template='plotly_white'
)
fig_fav_drink.show()
fig_fav_drink.write_html("vis/fav_drink_distribution.html")

### Personal Training


In [28]:
# Personal Training
fig_personal_training = go.Figure(data=go.Pie(
    labels=df['personal_training'].value_counts().index.astype(str),
    values=df['personal_training'].value_counts().values,
    title='Personal Training Participation',
    textinfo='label+percent+value',
    textposition='auto',
    marker_colors=['lightgreen', 'lightcoral']
))
fig_personal_training.show()
fig_personal_training.write_html("vis/personal_training_distribution.html")

### Personal Trainer Names

In [29]:
# Personal Trainer Names
trainer_counts = df['name_personal_trainer'].value_counts()
fig_trainer = go.Figure(data=go.Bar(
    x=trainer_counts.index,
    y=trainer_counts.values,
    marker_color=px.colors.qualitative.Pastel,
    text=trainer_counts.values,
    textposition='outside'
))
fig_trainer.update_layout(
    title='Distribution of Personal Trainers',
    xaxis_title='Trainer Name',
    yaxis_title='Number of Clients',
    template='plotly_white',
    xaxis_tickangle=-45
)
fig_trainer.show()
fig_trainer.write_html("vis/personal_trainer_distribution.html")

### Sauna Usage


In [30]:
# Sauna Usage
fig_sauna = go.Figure(data=go.Pie(
    labels=df['uses_sauna'].value_counts().index.astype(str),
    values=df['uses_sauna'].value_counts().values,
    title='Sauna Usage Distribution',
    textinfo='label+percent+value',
    textposition='auto',
    marker_colors=['lightsalmon', 'lightblue']
))
fig_sauna.show()
fig_sauna.write_html("vis/sauna_usage_distribution.html")

### Average Time in Gym by Abonoment Type and Gender

In [31]:
pivot_time_abo_gender = df.pivot_table(
    values='avg_time_in_gym', 
    index='abonoment_type', 
    columns='gender', 
    aggfunc='mean'
)
print("Average Time in Gym by Abonoment Type and Gender:")
print(pivot_time_abo_gender)

Average Time in Gym by Abonoment Type and Gender:
gender              Female        Male
abonoment_type                        
Premium         104.871901  104.944223
Standard        107.597701  103.483740


In [32]:
# Create pivot table
pivot_time_abo_gender = df.pivot_table(
    values='avg_time_in_gym', 
    index='abonoment_type', 
    columns='gender', 
    aggfunc='mean'
)

print("Average Time in Gym by Abonoment Type and Gender:")
print(pivot_time_abo_gender)

# Visualization 1: Grouped Bar Chart
fig_bar = go.Figure(data=[
    go.Bar(
        name='Female',
        x=pivot_time_abo_gender.index,
        y=pivot_time_abo_gender['Female'],
        text=[f'{val:.2f}' for val in pivot_time_abo_gender['Female']],
        textposition='auto'
    ),
    go.Bar(
        name='Male',
        x=pivot_time_abo_gender.index,
        y=pivot_time_abo_gender['Male'],
        text=[f'{val:.2f}' for val in pivot_time_abo_gender['Male']],
        textposition='auto'
    )
])

fig_bar.update_layout(
    title='Average Time in Gym by Abonoment Type and Gender',
    xaxis_title='Abonoment Type',
    yaxis_title='Average Time in Gym (Hours)',
    barmode='group',
    template='plotly_white'
)

# Visualization 2: Heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=pivot_time_abo_gender.values,
    x=pivot_time_abo_gender.columns,
    y=pivot_time_abo_gender.index,
    colorscale='Viridis',
    text=pivot_time_abo_gender.values.round(2),
    texttemplate='%{text}',
    textfont={"size":10}
))

fig_heatmap.update_layout(
    title='Heatmap: Average Time in Gym by Abonoment Type and Gender',
    xaxis_title='Gender',
    yaxis_title='Abonoment Type',
    template='plotly_white'
)

# Visualization 3: Pie Charts for each Abonoment Type
fig_pie = make_subplots(
    rows=1, cols=2, 
    subplot_titles=['Premium Membership', 'Standard Membership'],
    specs=[[{'type':'domain'}, {'type':'domain'}]]
)

for idx, abo_type in enumerate(pivot_time_abo_gender.index):
    fig_pie.add_trace(
        go.Pie(
            labels=['Female', 'Male'],
            values=pivot_time_abo_gender.loc[abo_type].values,
            textinfo='label+percent+value',
            marker_colors=['lightpink', 'lightblue'],
            
        ),
        row=1, 
        col=idx+1,
    )

fig_pie.update_layout(
    title='Distribution of Average Gym Time by Gender for Each Membership Type',
    height=500,
    width=1000,
    template='plotly_white',
)

# Display and save visualizations
fig_bar.show()
fig_bar.write_html("vis/avg_gym_time_by_abo_gender_bar.html")

# fig_heatmap.show()
fig_heatmap.write_html("vis/avg_gym_time_by_abo_gender_heatmap.html")

fig_pie.show()
fig_pie.write_html("vis/avg_gym_time_by_abo_gender_pie.html")

# Additional statistical summary
print("\nDescriptive Statistics:")
print(pivot_time_abo_gender.describe())

Average Time in Gym by Abonoment Type and Gender:
gender              Female        Male
abonoment_type                        
Premium         104.871901  104.944223
Standard        107.597701  103.483740



Descriptive Statistics:
gender      Female        Male
count     2.000000    2.000000
mean    106.234801  104.213981
std       1.927432    1.032718
min     104.871901  103.483740
25%     105.553351  103.848861
50%     106.234801  104.213981
75%     106.916251  104.579102
max     107.597701  104.944223


### Visit Frequency by Age Group and Abonoment Type


In [33]:
# Create age groups
df['age_group'] = pd.cut(df['Age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])

pivot_visits_age_abo = df.pivot_table(
    values='visit_per_week', 
    index='age_group', 
    columns='abonoment_type', 
    aggfunc='count'
)
print("\nAverage Visits per Week by Age Group and Abonoment Type:")
print(pivot_visits_age_abo)


Average Visits per Week by Age Group and Abonoment Type:
abonoment_type  Premium  Standard
age_group                        
18-25               178       182
26-35               126       142
36-45               129       141
46-55                60        42
55+                   0         0


In [34]:
# Create age groups
df['age_group'] = pd.cut(df['Age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])

# Multiple pivot tables for deeper insights
# 1. Count of members
pivot_visits_count = df.pivot_table(
    values='id', 
    index='age_group', 
    columns='abonoment_type', 
    aggfunc='count'
)

# 2. Average visits per week
pivot_visits_avg = df.pivot_table(
    values='visit_per_week', 
    index='age_group', 
    columns='abonoment_type', 
    aggfunc='mean'
)

print("\nNumber of Members by Age Group and Abonoment Type:")
print(pivot_visits_count)
print("\nAverage Visits per Week by Age Group and Abonoment Type:")
print(pivot_visits_avg)

# Visualization 1: Stacked Bar Chart of Member Count
fig_members = go.Figure(data=[
    go.Bar(
        name='Premium',
        x=pivot_visits_count.index,
        y=pivot_visits_count['Premium'],
        text=pivot_visits_count['Premium'],
        textposition='auto'
    ),
    go.Bar(
        name='Standard',
        x=pivot_visits_count.index,
        y=pivot_visits_count['Standard'],
        text=pivot_visits_count['Standard'],
        textposition='auto'
    )
])

fig_members.update_layout(
    title='Number of Members by Age Group and Abonoment Type',
    xaxis_title='Age Group',
    yaxis_title='Number of Members',
    barmode='stack',
    template='plotly_white'
)

# Visualization 2: Grouped Bar Chart of Average Visits
fig_visits = go.Figure(data=[
    go.Bar(
        name='Premium',
        x=pivot_visits_avg.index,
        y=pivot_visits_avg['Premium'],
        text=[f'{val:.2f}' for val in pivot_visits_avg['Premium']],
        textposition='auto'
    ),
    go.Bar(
        name='Standard',
        x=pivot_visits_avg.index,
        y=pivot_visits_avg['Standard'],
        text=[f'{val:.2f}' for val in pivot_visits_avg['Standard']],
        textposition='auto'
    )
])

fig_visits.update_layout(
    title='Average Visits per Week by Age Group and Abonoment Type',
    xaxis_title='Age Group',
    yaxis_title='Average Visits per Week',
    barmode='group',
    template='plotly_white'
)

# Heatmap for comprehensive view
fig_heatmap = go.Figure(data=go.Heatmap(
    z=pivot_visits_avg.values,
    x=pivot_visits_avg.columns,
    y=pivot_visits_avg.index,
    colorscale='Viridis',
    text=pivot_visits_avg.values.round(2),
    texttemplate='%{text}',
    textfont={"size":10}
))

fig_heatmap.update_layout(
    title='Heatmap: Average Visits per Week',
    xaxis_title='Abonoment Type',
    yaxis_title='Age Group',
    template='plotly_white'
)

# Display and save visualizations
fig_members.show()
fig_members.write_html("vis/members_by_age_abonoment.html")

# fig_visits.show()
fig_visits.write_html("vis/avg_visits_by_age_abonoment.html")

# fig_heatmap.show()
fig_heatmap.write_html("vis/heatmap_visits_by_age_abonoment.html")


Number of Members by Age Group and Abonoment Type:
abonoment_type  Premium  Standard
age_group                        
18-25               178       182
26-35               126       142
36-45               129       141
46-55                60        42
55+                   0         0

Average Visits per Week by Age Group and Abonoment Type:
abonoment_type   Premium  Standard
age_group                         
18-25           2.634831  2.824176
26-35           2.698413  2.683099
36-45           2.798450  2.531915
46-55           2.550000  2.547619


### Personal Training and Sauna Usage by Gender


In [35]:
pivot_training_sauna = df.pivot_table(
    values='personal_training', 
    index='gender', 
    columns='uses_sauna', 
    aggfunc='count'
)
print("\nPersonal Training and Sauna Usage by Gender:")
print(pivot_training_sauna)


Personal Training and Sauna Usage by Gender:
uses_sauna  False  True 
gender                  
Female        262    241
Male          245    252


In [36]:
# Create pivot table
pivot_training_sauna = df.pivot_table(
    values='personal_training', 
    index='gender', 
    columns='uses_sauna', 
    aggfunc='count'
)

# Stacked Bar Chart
# fig_stacked = go.Figure(data=[
#     go.Bar(
#         name='Uses Sauna',
#         x=pivot_training_sauna.index,
#         y=pivot_training_sauna[True],
#         text=pivot_training_sauna[True],
#         textposition='auto'
#     ),
#     go.Bar(
#         name='Does Not Use Sauna',
#         x=pivot_training_sauna.index,
#         y=pivot_training_sauna[False],
#         text=pivot_training_sauna[False],
#         textposition='auto'
#     )
# ])

# fig_stacked.update_layout(
#     title='Sauna Usage by Gender',
#     xaxis_title='Gender',
#     yaxis_title='Number of Members',
#     barmode='stack',
#     template='plotly_white'
# )

# # fig_stacked.show()
# fig_stacked.write_html("vis/sauna_usage_by_gender_stacked.html")

# Percentage Stacked Bar Chart
# Calculate percentages
pivot_training_sauna_pct = pivot_training_sauna.div(pivot_training_sauna.sum(axis=1), axis=0) * 100

# fig_stacked_pct = go.Figure(data=[
#     go.Bar(
#         name='Uses Sauna',
#         x=pivot_training_sauna_pct.index,
#         y=pivot_training_sauna_pct[True],
#         text=[f'{val:.1f}%' for val in pivot_training_sauna_pct[True]],
#         textposition='auto'
#     ),
#     go.Bar(
#         name='Does Not Use Sauna',
#         x=pivot_training_sauna_pct.index,
#         y=pivot_training_sauna_pct[False],
#         text=[f'{val:.1f}%' for val in pivot_training_sauna_pct[False]],
#         textposition='auto'
#     )
# ])

# fig_stacked_pct.update_layout(
#     title='Sauna Usage Percentage by Gender',
#     xaxis_title='Gender',
#     yaxis_title='Percentage of Members',
#     barmode='stack',
#     template='plotly_white'
# )

# fig_stacked_pct.show()
# fig_stacked_pct.write_html("vis/sauna_usage_by_gender_percentage.html")

# Pie Charts for each gender
fig_pie = make_subplots(
    rows=1, cols=2, 
    subplot_titles=['Female Sauna Usage', 'Male Sauna Usage'],
    specs=[[{'type':'domain'}, {'type':'domain'}]]
)

for idx, gender in enumerate(['Female', 'Male']):
    gender_data = pivot_training_sauna.loc[gender]
    
    fig_pie.add_trace(
        go.Pie(
            labels=['Uses Sauna', 'Does Not Use Sauna'],
            values=gender_data.values,
            textinfo='percent+value',
            marker_colors=['lightgreen', 'lightcoral']
        ),
        row=1, 
        col=idx+1
    )

fig_pie.update_layout(
    title='Sauna Usage Distribution by Gender',
    height=500,
    width=1000,
    template='plotly_white'
)

fig_pie.show()
fig_pie.write_html("vis/sauna_usage_by_gender_pie.html")

### Group Lesson Attendance by Age and Gender



In [37]:
# First, ensure age_group is created
df['age_group'] = pd.cut(df['Age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])

# Create pivot table
pivot_lesson_age_gender = df.pivot_table(
    values='attend_group_lesson', 
    index='age_group', 
    columns='gender', 
    aggfunc='mean'
)

# Melt the pivot table for easier plotting
pivot_melted = pivot_lesson_age_gender.reset_index().melt(
    id_vars='age_group', 
    var_name='Gender', 
    value_name='Attendance Rate'
)

# Create Plotly bar plot
fig = go.Figure()

# Add bars for each gender
for gender in pivot_melted['Gender'].unique():
    gender_data = pivot_melted[pivot_melted['Gender'] == gender]
    fig.add_trace(go.Bar(
        x=gender_data['age_group'],
        y=gender_data['Attendance Rate'],
        name=gender,
        text=[f'{val:.2%}' for val in gender_data['Attendance Rate']],
        textposition='outside'
    ))

# Customize layout
fig.update_layout(
    title='Group Lesson Attendance Rate by Age Group and Gender',
    xaxis_title='Age Group',
    yaxis_title='Attendance Rate',
    yaxis_tickformat='.0%',  # Format y-axis as percentage
    barmode='group',  # Side-by-side bars
    template='plotly_white',
    colorway=['#F63366', '#FFB300', '#00C49A', '#FF8C00', '#1DE9B6']
)

# Show the plot
fig.show()

# Optional: Save to HTML
fig.write_html("vis/group_lesson_attendance.html")

### Average Visits per Week by Age Group and Abonoment Type

In [38]:
# Visualization for Visits per Week by Age Group and Abonoment Type
import plotly.express as px

# Melt the pivot table for easier plotting
pivot_visits_melted = pivot_visits_age_abo.reset_index().melt(
    id_vars='age_group', 
    var_name='Abonoment Type', 
    value_name='Average Visits per Week'
)

fig = px.bar(
    pivot_visits_melted, 
    x='age_group', 
    y='Average Visits per Week', 
    color='Abonoment Type',
    title='Average Visits per Week by Age Group and Abonoment Type',
    labels={'Average Visits per Week': 'Avg Visits per Week'},
    barmode='group'
)
fig.show()

### Analysis of Age for gym visitors

In [39]:
df.sort_values(by='Age', ascending=False).head(10)

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
215,216,Male,1975-04-11,49,Premium,3,"Fri, Mon, Wed",False,,12:37:00,13:15:00,38,True,lemon,True,Mike,False,46-55
821,822,Male,1975-10-11,49,Premium,3,"Mon, Sat, Sun",False,,19:44:00,20:26:00,42,True,"orange, black_currant",True,Chantal,True,46-55
601,602,Male,1975-04-28,49,Standard,2,"Sat, Wed",False,,15:41:00,17:45:00,124,True,"passion_fruit, orange",False,,False,46-55
417,418,Female,1975-09-25,49,Premium,3,"Mon, Sun, Wed",False,,19:37:00,21:31:00,114,True,"berry_boost, coconut_pineapple",False,,False,46-55
645,646,Male,1975-08-16,49,Premium,3,"Fri, Sun, Thu",False,,17:16:00,19:52:00,156,True,passion_fruit,False,,False,46-55
116,117,Male,1975-04-04,49,Standard,1,Tue,True,Pilates,18:02:00,18:44:00,42,True,"berry_boost, lemon",False,,False,46-55
271,272,Male,1975-01-12,49,Premium,2,"Fri, Tue",True,Yoga,13:20:00,14:26:00,66,False,,True,Chantal,True,46-55
201,202,Male,1975-05-25,49,Premium,4,"Mon, Sat, Tue, Wed",False,,08:21:00,08:52:00,31,True,black_currant,False,,False,46-55
570,571,Female,1975-04-20,49,Premium,2,"Tue, Wed",False,,10:11:00,11:28:00,77,True,berry_boost,True,Hanna,False,46-55
936,937,Male,1975-08-05,49,Premium,3,"Sat, Sun, Tue",True,Zumba,10:55:00,12:18:00,83,True,orange,False,,True,46-55


In [40]:
df.sort_values(by='Age', ascending=False).head(20)['visit_per_week'].mean()

2.45

In [41]:
df.sort_values(by='Age', ascending=False).head(20)['avg_time_in_gym'].mean()

80.45

In [42]:
top_age=df[df['Age']>=45]
top_age.to_csv('data/over_45_age.csv', index=False)
top_age

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
8,9,Male,1978-07-28,46,Premium,3,"Sat, Sun, Thu",True,BodyPump,09:45:00,11:17:00,92,True,"orange, lemon",True,Mike,False,46-55
33,34,Female,1976-12-07,47,Premium,2,"Mon, Sat",True,"XCore, Running, BodyBalance",14:42:00,16:08:00,86,False,,True,Hanna,True,46-55
44,45,Male,1976-01-19,48,Premium,2,"Tue, Wed",False,,09:57:00,12:35:00,158,False,,True,Hanna,True,46-55
48,49,Female,1975-10-23,48,Premium,3,"Fri, Thu, Tue",True,"Yoga, BodyBalance",11:56:00,14:13:00,137,False,,True,Chantal,True,46-55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,970,Male,1978-05-19,46,Premium,2,"Thu, Tue",False,,08:22:00,10:52:00,150,False,,True,Mike,True,46-55
974,975,Male,1976-10-08,48,Standard,3,"Fri, Sun, Wed",False,,11:07:00,12:43:00,96,True,"berry_boost, coconut_pineapple",True,Hanna,True,46-55
984,985,Female,1975-10-12,49,Premium,1,Fri,True,"Pilates, Kickboxen, BodyBalance",15:17:00,17:00:00,103,True,coconut_pineapple,False,,False,46-55
986,987,Female,1979-03-14,45,Premium,5,"Fri, Mon, Sun, Thu, Wed",False,,19:04:00,20:44:00,100,True,orange,True,Chantal,True,36-45


In [43]:
bottom_age=df[df['Age']<=14]
bottom_age.to_csv('data/lower_than_15_age.csv', index=False)
bottom_age

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
17,18,Female,2011-04-30,13,Standard,3,"Sat, Thu, Tue",False,,16:54:00,18:53:00,119,False,,True,Mike,True,18-25
22,23,Female,2010-10-22,13,Standard,2,"Mon, Tue",True,HIT,20:56:00,23:36:00,160,True,"passion_fruit, orange",True,Mike,False,18-25
23,24,Male,2010-01-24,14,Premium,3,"Sat, Tue, Wed",False,,14:34:00,16:33:00,119,True,"lemon, coconut_pineapple",True,Jeffrey,True,18-25
94,95,Female,2012-05-10,12,Standard,3,"Fri, Thu, Wed",True,BodyBalance,10:10:00,12:50:00,160,True,berry_boost,False,,True,18-25
105,106,Male,2012-08-28,12,Premium,3,"Fri, Sun, Thu",True,"Pilates, LesMiles",19:57:00,21:28:00,91,True,"orange, black_currant",False,,False,18-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,956,Female,2010-08-27,14,Premium,2,"Sat, Sun",True,Kickboxen,14:18:00,15:13:00,55,True,"lemon, coconut_pineapple",False,,True,18-25
957,958,Female,2010-03-25,14,Standard,3,"Fri, Sun, Thu",True,Yoga,13:59:00,16:41:00,162,True,lemon,False,,True,18-25
962,963,Male,2011-03-17,13,Premium,2,"Fri, Tue",True,XCore,09:23:00,12:02:00,159,False,,False,,True,18-25
964,965,Male,2009-11-01,14,Standard,1,Mon,True,"Yoga, BodyBalance, XCore",11:56:00,13:37:00,101,False,,False,,False,18-25


### Summary Statistics by Gender

In [44]:
with open('outputs/gender_summary_stats_age.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["Age"]].describe().round(1)))
df.groupby('gender')[['Age']].describe().round(1)

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,503.0,29.8,10.6,12.0,21.0,29.0,39.0,49.0
Male,497.0,31.4,11.0,12.0,22.0,32.0,41.0,49.0


In [45]:
with open('outputs/gender_summary_stats_visit.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["visit_per_week"]].describe().round(1)))
df.groupby('gender')[['visit_per_week']].describe().round(1)

Unnamed: 0_level_0,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,503.0,2.7,1.3,1.0,2.0,3.0,3.0,5.0
Male,497.0,2.6,1.2,1.0,2.0,3.0,3.0,5.0


In [46]:
with open('outputs/gender_summary_stats_time.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["avg_time_in_gym"]].describe().round(1)))
df.groupby('gender')[['avg_time_in_gym']].describe().round(1)

Unnamed: 0_level_0,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,503.0,106.3,43.4,30.0,69.0,106.0,144.0,180.0
Male,497.0,104.2,43.7,30.0,67.0,102.0,141.0,180.0


In [47]:
with open('outputs/gender_summary_stats_abo.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["abonoment_type"]].describe().round(1)))
df.groupby('gender')[['abonoment_type']].describe().round(1)

Unnamed: 0_level_0,abonoment_type,abonoment_type,abonoment_type,abonoment_type
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,503,2,Standard,261
Male,497,2,Premium,251


In [48]:
with open('outputs/gender_summary_stats_attend.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["attend_group_lesson"]].describe().round(1)))
df.groupby('gender')[['attend_group_lesson']].describe().round(1)

Unnamed: 0_level_0,attend_group_lesson,attend_group_lesson,attend_group_lesson,attend_group_lesson
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,503,2,False,255
Male,497,2,True,255


In [49]:
with open('outputs/gender_summary_stats_drink.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["drink_abo"]].describe().round(1)))
df.groupby('gender')[['drink_abo']].describe().round(1)

Unnamed: 0_level_0,drink_abo,drink_abo,drink_abo,drink_abo
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,503,2,False,258
Male,497,2,True,251


In [50]:
with open('outputs/gender_summary_stats_personal.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["personal_training"]].describe().round(1)))
df.groupby('gender')[['personal_training']].describe().round(1)

Unnamed: 0_level_0,personal_training,personal_training,personal_training,personal_training
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,503,2,False,255
Male,497,2,True,270


In [51]:

# Get the value counts for male personal trainers
male_trainers_counts = df[df['gender']=="Male"]['name_personal_trainer'].value_counts()

# Create a Plotly bar plot
fig = px.bar(
    x=male_trainers_counts.index, 
    y=male_trainers_counts.values, 
    title='Number of Male Clients per Personal Trainer',
    labels={'x': 'Personal Trainer', 'y': 'Number of Male Clients'}
)

# Optional: Improve readability
fig.update_layout(
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis/male_trainers_distribution.html')

In [52]:

# Get the value counts for male personal trainers
female_trainers_counts = df[df['gender']=="Female"]['name_personal_trainer'].value_counts()

# Create a Plotly bar plot
fig = px.bar(
    x=female_trainers_counts.index, 
    y=female_trainers_counts.values, 
    title='Number of Female Clients per Personal Trainer',
    labels={'x': 'Personal Trainer', 'y': 'Number of Female Clients'}
)

# Optional: Improve readability
fig.update_layout(
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis/female_trainers_distribution.html')

In [53]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True,26-35
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False,36-45
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True,36-45
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False,36-45


In [54]:
# Get the value counts for female sauna usage
female_sauna_counts = df[df['gender']=="Female"]['uses_sauna'].value_counts()

# Create a Plotly pie plot
fig = px.pie(
    values=female_sauna_counts.values, 
    names=female_sauna_counts.index, 
    title='Female Clients Sauna Usage',
    hole=0.3,
    # color_discrete_sequence=['blue', 'red']  # Specify exact colors
  # Optional: creates a donut chart effect
)

# Optional: Customize layout
fig.update_layout(
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis/females_using_sauna.html')

In [55]:
# Get the value counts for female sauna usage
male_sauna_counts = df[df['gender']=="Male"]['uses_sauna'].value_counts()

# Create a Plotly pie plot
fig = px.pie(
    values=male_sauna_counts.values, 
    names=male_sauna_counts.index, 
    title='Male Clients Sauna Usage',
    hole=0.3,
    color_discrete_sequence=px.colors.qualitative.Pastel  # Pastel color palette

)

# Optional: Customize layout
fig.update_layout(
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis/males_using_sauna.html')

In [56]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True,26-35
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False,36-45
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True,36-45
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False,36-45


In [57]:
def plot_lessons_count(gender):
    lessons=[]
    for i,lesson_row in enumerate(df[df['gender']==gender]['fav_group_lesson'].values): 
        # print(i,lesson_row)
        if type(lesson_row)!=str:
            continue
        lessons.extend(lesson_row.split(', '))
    # print(lessons)
    dict(Counter(lessons))
    lessons_count=dict(Counter(lessons))
    sorted_lessons_count = dict(sorted(lessons_count.items(), key=lambda item: item[1],reverse=True))

    # Optional: Create a bar plot to visualize the results
    import plotly.graph_objs as go

    fig = go.Figure(data=[go.Bar(
        x=list(sorted_lessons_count.keys()),
        y=list(sorted_lessons_count.values()),
        marker_color='skyblue'
    )])

    fig.update_layout(
        title=f'Lessons Count by {gender}',
        xaxis_title='Lesson',
        yaxis_title=f'Count of {gender}',
        template='plotly_white'
    )

    fig.show()
    fig.write_html(f"vis/{gender}_Lessons_Count.html")

plot_lessons_count("Female")

In [58]:
plot_lessons_count("Male")

In [59]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True,26-35
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False,36-45
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True,36-45
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False,36-45


### Age Group Summary Statistics by Gender

In [60]:
import plotly.graph_objs as go
import plotly.express as px

# Prepare the data
gender_age_counts = df.groupby(['age_group', 'gender']).size().reset_index(name='count')

# Create a grouped bar chart using Plotly Express
fig = px.bar(
    gender_age_counts, 
    x='age_group', 
    y='count', 
    color='gender', 
    title='Gender Distribution Across Age Groups',
    labels={'count': 'Number of Members', 'age_group': 'Age Group'},
    barmode='group'
)

# Customize the layout
fig.update_layout(
    xaxis_title='Age Group',
    yaxis_title='Number of Members',
    legend_title='Gender',
    template='plotly_white'
)

# Show the plot
fig.show()

fig.write_html('vis/Gender_Age_Distribution_by_Group.html')


In [61]:
import plotly.express as px
import plotly.graph_objs as go

# Prepare the data
trainer_age_counts = df.groupby(['age_group', 'name_personal_trainer']).size().reset_index(name="count")

# Create a heatmap visualization
# fig = px.density_heatmap(
#     trainer_age_counts, 
#     x='name_personal_trainer', 
#     y='age_group', 
#     z='count',
#     title='Distribution of Personal Trainers Across Age Groups',
#     labels={'count': 'Number of Members', 'name_personal_trainer': 'Personal Trainer'},
#     color_continuous_scale='Viridis'
# )

# # Customize the layout
# fig.update_layout(
#     xaxis_title='Personal Trainer',
#     yaxis_title='Age Group',
#     template='plotly_white'
# )

# # Show the plot
# fig.show()

# Alternative: Grouped Bar Chart
fig_bar = px.bar(
    trainer_age_counts, 
    x='name_personal_trainer', 
    y='count', 
    color='age_group',
    title='Personal Trainers and Age Group Distribution',
    labels={'count': 'Number of Members', 'name_personal_trainer': 'Personal Trainer'},
    barmode='group'
)

# Customize the layout
fig_bar.update_layout(
    xaxis_title='Personal Trainer',
    yaxis_title='Number of Members',
    legend_title='Age Group',
    template='plotly_white',
    height=600,
    width=1000
)

# Show the plot
fig_bar.show()
fig_bar.write_html('vis/Trainer_Age_Distribution_by_Group.html')

In [62]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True,26-35
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False,36-45
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True,36-45
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False,36-45


### Age Group Summary Statistics by Personal Training

In [63]:
import plotly.express as px
import plotly.graph_objs as go

# Prepare the data
trainer_age_counts = df.groupby(['age_group', 'personal_training']).size().reset_index(name="count")

# Alternative: Grouped Bar Chart
fig_bar = px.bar(
    trainer_age_counts, 
    x='personal_training', 
    y='count', 
    color='age_group',
    title='Personal Training and Age Group Distribution',
    labels={'count': 'Number of Members', 'personal_training': 'Personal Training'},
    barmode='group'
)

# Customize the layout
fig_bar.update_layout(
    xaxis_title='Personal Training',
    yaxis_title='Number of Members',
    legend_title='Age Group',
    template='plotly_white',
    height=600,
    width=1000
)

# Show the plot
fig_bar.show()
fig_bar.write_html('vis/Personal_Training_Age_Distribution_by_Group.html')

In [64]:
df.groupby(['age_group', 'fav_group_lesson']).size().reset_index(name="count")
# for i, row in df.groupby(['age_group', 'fav_group_lesson']).size().reset_index(name="count").iterrows():
    

Unnamed: 0,age_group,fav_group_lesson,count
0,18-25,BodyBalance,4
1,18-25,"BodyBalance, BodyPump",0
2,18-25,"BodyBalance, BodyPump, Running",0
3,18-25,"BodyBalance, HIT",1
4,18-25,"BodyBalance, Kickboxen",1
...,...,...,...
1260,55+,"Zumba, Running, Spinning",0
1261,55+,"Zumba, Spinning, BodyBalance",0
1262,55+,"Zumba, XCore",0
1263,55+,"Zumba, Yoga, Pilates",0


In [65]:
# First, let's split the combined group lessons
def split_group_lessons(lessons):
    return [lesson.strip() for lesson in lessons.split(',')]

# Explode the group lessons to get individual lessons
df_exploded = df.assign(
    fav_group_lesson=df['fav_group_lesson'].str.split(', ')
).explode('fav_group_lesson')

# Count of each group lesson for every age group
group_lesson_counts = df_exploded.groupby(['age_group', 'fav_group_lesson']).size().reset_index(name='count')

# Plotly Visualization
import plotly.express as px
import plotly.graph_objs as go

# Heatmap
fig_heatmap = px.density_heatmap(
    group_lesson_counts, 
    x='fav_group_lesson', 
    y='age_group', 
    z='count',
    title='Distribution of Favorite Group Lessons Across Age Groups',
    labels={'count': 'Number of Members', 'fav_group_lesson': 'Group Lesson'},
    color_continuous_scale='Viridis'
)

fig_heatmap.update_layout(
    xaxis_title='Group Lesson',
    yaxis_title='Age Group',
    template='plotly_white',
    height=600,
    width=1000
)

fig_heatmap.show()

# Grouped Bar Chart
fig_bar = px.bar(
    group_lesson_counts, 
    x='fav_group_lesson', 
    y='count', 
    color='age_group',
    title='Favorite Group Lessons Distribution Across Age Groups',
    labels={'count': 'Number of Members', 'fav_group_lesson': 'Group Lesson'},
    barmode='group'
)

fig_bar.update_layout(
    xaxis_title='Group Lesson',
    yaxis_title='Number of Members',
    legend_title='Age Group',
    template='plotly_white',
    height=600,
    width=1000,
    xaxis_tickangle=-45
)

fig_bar.show()

# Percentage Distribution
group_lesson_percentages = group_lesson_counts.groupby('fav_group_lesson').apply(
    lambda x: x.assign(percentage=x['count'] / x['count'].sum() * 100)
).reset_index(drop=True)

fig_percentage = px.bar(
    group_lesson_percentages, 
    x='fav_group_lesson', 
    y='percentage', 
    color='age_group',
    title='Percentage of Favorite Group Lessons Across Age Groups',
    labels={'percentage': 'Percentage of Members', 'fav_group_lesson': 'Group Lesson'},
    barmode='group'
)

fig_percentage.update_layout(
    xaxis_title='Group Lesson',
    yaxis_title='Percentage of Members',
    legend_title='Age Group',
    template='plotly_white',
    height=600,
    width=1000,
    xaxis_tickangle=-45
)

fig_percentage.show()

# Print top 10 group lessons by total count
print("Top 10 Group Lessons by Total Count:")
print(group_lesson_counts.groupby('fav_group_lesson')['count'].sum().sort_values(ascending=False).head(10))

Top 10 Group Lessons by Total Count:
fav_group_lesson
BodyPump       112
LesMiles        99
HIT             97
Pilates         95
XCore           90
Yoga            90
BodyBalance     87
Kickboxen       85
Spinning        85
Running         82
Name: count, dtype: int64


### Group Lessons Distribution Across Age Groups

In [66]:
# Explode the group lessons to get individual lessons
df_exploded = df.assign(
    fav_group_lesson=df['fav_group_lesson'].str.split(', ')
).explode('fav_group_lesson')

# Count of each age group for every group lesson
group_lesson_counts = df_exploded.groupby(['age_group', 'fav_group_lesson']).size().reset_index(name='count')

# Plotly Visualization
import plotly.express as px
import plotly.graph_objs as go

# Heatmap
fig_heatmap = px.density_heatmap(
    group_lesson_counts, 
    x='age_group', 
    y='fav_group_lesson', 
    z='count',
    title='Distribution of Age Groups Across Favorite Group Lessons',
    labels={'count': 'Number of Members', 'fav_group_lesson': 'Group Lesson'},
    color_continuous_scale='Viridis'
)

fig_heatmap.update_layout(
    xaxis_title='Age Group',
    yaxis_title='Group Lesson',
    template='plotly_white',
    height=600,
    width=1000
)

fig_heatmap.show()
fig_heatmap.write_html('vis/Heatmap_Distribution_of_Group_Lessons_by_Age_Group.html')

# Grouped Bar Chart
fig_bar = px.bar(
    group_lesson_counts, 
    x='age_group', 
    y='count', 
    color='fav_group_lesson',
    title='Age Groups Distribution Across Favorite Group Lessons',
    labels={'count': 'Number of Members', 'fav_group_lesson': 'Group Lesson'},
    barmode='group'
)

fig_bar.update_layout(
    xaxis_title='Age Group',
    yaxis_title='Number of Members',
    legend_title='Group Lesson',
    template='plotly_white',
    height=600,
    width=1000,
    xaxis_tickangle=-45
)

fig_bar.show()
fig_bar.write_html('vis/Distribution_of_Group_Lessons_by_Age_Group.html')

# Percentage Distribution
group_lesson_percentages = group_lesson_counts.groupby('age_group').apply(
    lambda x: x.assign(percentage=x['count'] / x['count'].sum() * 100)
).reset_index(drop=True)

fig_percentage = px.bar(
    group_lesson_percentages, 
    x='age_group', 
    y='percentage', 
    color='fav_group_lesson',
    title='Percentage of Group Lessons Within Each Age Group',
    labels={'percentage': 'Percentage of Members', 'fav_group_lesson': 'Group Lesson'},
    barmode='group'
)

fig_percentage.update_layout(
    xaxis_title='Age Group',
    yaxis_title='Percentage of Members',
    legend_title='Group Lesson',
    template='plotly_white',
    height=600,
    width=1000,
    xaxis_tickangle=-45
)

fig_percentage.show()
fig_percentage.write_html('vis/Percentage_Distribution_of_Group_Lessons_by_Age_Group.html')
# Print top 10 group lessons for each age group
print("Top 10 Group Lessons by Age Group:")
for age_group in group_lesson_counts['age_group'].unique():
    print(f"\n{age_group}:")
    top_lessons = group_lesson_counts[group_lesson_counts['age_group'] == age_group].sort_values('count', ascending=False).head(10)
    print(top_lessons)

Top 10 Group Lessons by Age Group:

18-25:
   age_group fav_group_lesson  count
4      18-25         LesMiles     43
2      18-25              HIT     39
5      18-25          Pilates     39
3      18-25        Kickboxen     36
0      18-25      BodyBalance     35
8      18-25            XCore     35
7      18-25         Spinning     33
10     18-25            Zumba     33
1      18-25         BodyPump     32
6      18-25          Running     30

26-35:
   age_group fav_group_lesson  count
12     26-35         BodyPump     32
18     26-35         Spinning     27
15     26-35         LesMiles     26
20     26-35             Yoga     26
16     26-35          Pilates     24
11     26-35      BodyBalance     23
19     26-35            XCore     23
14     26-35        Kickboxen     22
17     26-35          Running     20
21     26-35            Zumba     19

36-45:
   age_group fav_group_lesson  count
24     36-45              HIT     34
23     36-45         BodyPump     31
31     36-45    

In [67]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,1,Female,1997-04-18,27,Premium,4,"Mon, Sat, Tue, Wed",True,"Kickboxen, BodyPump, Zumba",19:31:00,21:27:00,116,False,,False,,True,26-35
1,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
2,3,Male,1983-03-30,41,Premium,1,Sat,True,XCore,08:29:00,10:32:00,123,True,"berry_boost, lemon",True,Mike,False,36-45
3,4,Male,1980-04-12,44,Premium,3,"Sat, Tue, Wed",False,,09:54:00,11:33:00,99,True,passion_fruit,True,Mike,True,36-45
4,5,Male,1980-09-10,44,Standard,2,"Thu, Wed",True,"Running, Yoga, Zumba",08:29:00,09:19:00,50,False,,True,Mike,False,36-45


### Time Range people count

In [68]:
def get_people_hour_count(start_hour):
    # Ensure start_hour is within 0-23 range
    start_hour = start_hour % 24
    
    # Convert the time column to datetime time format
    df['avg_time_check_in'] = pd.to_datetime(df['avg_time_check_in'], format='%H:%M:%S').dt.time
    
    # Handle the special case for 23:00 to 00:00
    if start_hour == 23:
        df_filtered = df[(df['avg_time_check_in'] >= pd.to_datetime('23:00:00').time()) | 
                         (df['avg_time_check_in'] < pd.to_datetime('00:00:00').time())]
    else:
        # Filter the DataFrame for times between start_hour:00:00 and (start_hour+1):00:00
        df_filtered = df[(df['avg_time_check_in'] >= pd.to_datetime(f'{start_hour:02d}:00:00').time()) & 
                         (df['avg_time_check_in'] < pd.to_datetime(f'{(start_hour+1):02d}:00:00').time())]

    # Return the count
    return len(df_filtered)

# Collect results in a dictionary
hourly_attendance = {f"{hour:02d}:00": get_people_hour_count(hour) for hour in range(8, 24)}

# Visualization
import plotly.express as px
import plotly.graph_objs as go

# Create a bar chart of hourly attendance
fig = px.bar(
    x=list(hourly_attendance.keys()), 
    y=list(hourly_attendance.values()),
    title='Hourly Gym Attendance (8:00 AM - 11:59 PM)',
    labels={'x': 'Time', 'y': 'Number of People'}
)

fig.update_layout(
    xaxis_title='Time',
    yaxis_title='Number of People',
    template='plotly_white',
    xaxis_tickangle=-45,
    height=600,
    width=1000
)

fig.show()

# Print the results
# print("\nHourly Attendance:")
# for time, count in hourly_attendance.items():
#     print(f"{time}: {count} people")

# Line plot for trend visualization
fig_line = px.line(
    x=list(hourly_attendance.keys()), 
    y=list(hourly_attendance.values()),
    title='Hourly Gym Attendance Trend (8:00 AM - 11:59 PM)',
    labels={'x': 'Time', 'y': 'Number of People'}
)

fig_line.update_layout(
    xaxis_title='Time',
    yaxis_title='Number of People',
    template='plotly_white',
    xaxis_tickangle=-45,
    height=600,
    width=1000,
    # markers=True
)

fig_line.show()
fig_line.write_html('vis/Hourly_Gym_Attendance_Trend.html') 

### People staying at gym for most time 

In [69]:
df.sort_values(by='avg_time_in_gym',ascending=False).head(10) 

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
487,488,Male,1981-04-25,43,Premium,5,"Fri, Mon, Sat, Thu, Wed",False,,17:52:00,20:52:00,180,False,,True,Mike,False,36-45
78,79,Female,1981-04-24,43,Standard,5,"Fri, Sat, Thu, Tue, Wed",True,XCore,12:44:00,15:44:00,180,False,,False,,False,36-45
5,6,Female,2009-06-29,15,Standard,1,Mon,False,,17:19:00,20:19:00,180,False,,False,,True,18-25
381,382,Male,1978-12-06,45,Premium,2,"Mon, Wed",True,"HIT, Spinning",14:20:00,17:20:00,180,True,black_currant,True,Hanna,True,36-45
567,568,Female,1989-08-26,35,Premium,5,"Sat, Sun, Thu, Tue, Wed",False,,10:41:00,13:41:00,180,False,,False,,True,26-35
734,735,Female,1991-02-05,33,Standard,2,"Fri, Tue",True,HIT,09:24:00,12:24:00,180,False,,True,Jeffrey,False,26-35
426,427,Male,2010-05-02,14,Premium,3,"Fri, Mon, Sun",False,,16:05:00,19:05:00,180,True,black_currant,True,Mike,False,18-25
441,442,Female,1994-05-17,30,Standard,1,Sat,False,,14:08:00,17:08:00,180,False,,True,Hanna,False,26-35
637,638,Female,2005-11-02,18,Premium,2,"Mon, Sat",False,,19:45:00,22:44:00,179,False,,True,Chantal,False,18-25
237,238,Female,2003-06-25,21,Premium,5,"Fri, Mon, Sat, Sun, Thu",True,"Pilates, BodyPump, BodyBalance",20:15:00,23:14:00,179,True,"coconut_pineapple, passion_fruit",False,,True,18-25


### People staying at gym for least time

In [70]:
df.sort_values(by='avg_time_in_gym',ascending=True).head(10) 

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
501,502,Male,2010-11-29,13,Premium,1,Fri,False,,16:10:00,16:40:00,30,True,lemon,False,,False,18-25
660,661,Male,2009-12-05,14,Standard,3,"Sat, Sun, Thu",True,"XCore, Pilates, LesMiles",17:02:00,17:32:00,30,False,,False,,False,18-25
521,522,Female,2000-03-03,24,Standard,2,"Mon, Thu",True,Zumba,20:25:00,20:55:00,30,False,,False,,True,18-25
254,255,Male,2003-04-14,21,Premium,1,Sun,True,"Kickboxen, BodyBalance, LesMiles",14:34:00,15:04:00,30,True,coconut_pineapple,True,Mike,True,18-25
513,514,Female,2001-05-13,23,Standard,3,"Fri, Mon, Wed",True,"Zumba, XCore",12:43:00,13:13:00,30,True,passion_fruit,False,,False,18-25
57,58,Male,1977-12-30,46,Premium,5,"Fri, Mon, Sat, Thu, Tue",True,"BodyBalance, BodyPump",12:19:00,12:49:00,30,False,,True,Chantal,True,46-55
773,774,Male,1988-04-18,36,Premium,4,"Mon, Sun, Thu, Wed",False,,12:32:00,13:02:00,30,False,,False,,False,36-45
402,403,Female,2007-12-04,16,Premium,3,"Sat, Sun, Tue",False,,15:15:00,15:46:00,31,True,passion_fruit,True,Chantal,False,18-25
201,202,Male,1975-05-25,49,Premium,4,"Mon, Sat, Tue, Wed",False,,08:21:00,08:52:00,31,True,black_currant,False,,False,46-55
525,526,Male,2008-01-07,16,Standard,4,"Mon, Sat, Sun, Thu",True,"Zumba, Running, Spinning",13:43:00,14:14:00,31,True,passion_fruit,True,Mike,False,18-25


## Correlations

In [71]:
numerical_cols = df.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns

# Calculate correlation matrix for numerical columns
correlation_matrix = df[numerical_cols].corr()
correlation_matrix

Unnamed: 0,Age,visit_per_week,avg_time_in_gym
Age,1.0,-0.040228,-0.041017
visit_per_week,-0.040228,1.0,0.009931
avg_time_in_gym,-0.041017,0.009931,1.0


In [72]:
# Select only numerical columns
numerical_cols = df.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns

# Calculate correlation matrix for numerical columns
correlation_matrix = df[numerical_cols].corr()

# Visualization using Plotly
import plotly.express as px

# Heatmap of correlation matrix
fig = px.imshow(
    correlation_matrix, 
    title='Correlation Heatmap of Numerical Columns',
    color_continuous_scale='RdBu_r',  # Red-Blue diverging color scale
    text_auto=True  # Show correlation values
)

# Customize layout
fig.update_layout(
    width=800,
    height=800
)

fig.show()

# Print the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Optional: Identify highly correlated features
def get_high_correlations(corr_matrix, threshold=0.5):
    high_corr = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr.append((
                    corr_matrix.columns[i], 
                    corr_matrix.columns[j], 
                    corr_matrix.iloc[i, j]
                ))
    return high_corr

high_correlations = get_high_correlations(correlation_matrix)
print("\nHighly Correlated Features (|correlation| > 0.5):")
for feat1, feat2, corr_value in high_correlations:
    print(f"{feat1} - {feat2}: {corr_value:.2f}")


Correlation Matrix:
                      Age  visit_per_week  avg_time_in_gym
Age              1.000000       -0.040228        -0.041017
visit_per_week  -0.040228        1.000000         0.009931
avg_time_in_gym -0.041017        0.009931         1.000000

Highly Correlated Features (|correlation| > 0.5):
