# EDA for Lower than 15 yeas members

### Importing Libraries

In [85]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot, init_notebook_mode, plot, iplot 
import plotly.express as px 
import os 
init_notebook_mode(connected=True)

In [86]:
os.makedirs('vis_15', exist_ok=True) 

df = pd.read_csv('data/lower_than_15_age.csv')
df.head() 

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,18,Female,2011-04-30,13,Standard,3,"Sat, Thu, Tue",False,,16:54:00,18:53:00,119,False,,True,Mike,True,18-25
1,23,Female,2010-10-22,13,Standard,2,"Mon, Tue",True,HIT,20:56:00,23:36:00,160,True,"passion_fruit, orange",True,Mike,False,18-25
2,24,Male,2010-01-24,14,Premium,3,"Sat, Tue, Wed",False,,14:34:00,16:33:00,119,True,"lemon, coconut_pineapple",True,Jeffrey,True,18-25
3,95,Female,2012-05-10,12,Standard,3,"Fri, Thu, Wed",True,BodyBalance,10:10:00,12:50:00,160,True,berry_boost,False,,True,18-25
4,106,Male,2012-08-28,12,Premium,3,"Fri, Sun, Thu",True,"Pilates, LesMiles",19:57:00,21:28:00,91,True,"orange, black_currant",False,,False,18-25


In [87]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     77 non-null     int64 
 1   gender                 77 non-null     object
 2   birthday               77 non-null     object
 3   Age                    77 non-null     int64 
 4   abonoment_type         77 non-null     object
 5   visit_per_week         77 non-null     int64 
 6   days_per_week          77 non-null     object
 7   attend_group_lesson    77 non-null     bool  
 8   fav_group_lesson       43 non-null     object
 9   avg_time_check_in      77 non-null     object
 10  avg_time_check_out     77 non-null     object
 11  avg_time_in_gym        77 non-null     int64 
 12  drink_abo              77 non-null     bool  
 13  fav_drink              37 non-null     object
 14  personal_training      77 non-null     bool  
 15  name_personal_trainer  28

In [88]:
df.shape

(77, 18)

In [89]:
df.columns

Index(['id', 'gender', 'birthday', 'Age', 'abonoment_type', 'visit_per_week',
       'days_per_week', 'attend_group_lesson', 'fav_group_lesson',
       'avg_time_check_in', 'avg_time_check_out', 'avg_time_in_gym',
       'drink_abo', 'fav_drink', 'personal_training', 'name_personal_trainer',
       'uses_sauna', 'age_group'],
      dtype='object')

In [90]:

df.describe().T.drop('id', axis=0).drop("count", axis=1)

Unnamed: 0,mean,std,min,25%,50%,75%,max
Age,13.246753,0.797295,12.0,13.0,13.0,14.0,14.0
visit_per_week,2.844156,1.203776,1.0,2.0,3.0,3.0,5.0
avg_time_in_gym,103.467532,44.415379,30.0,67.0,105.0,138.0,180.0


### Gender Distribution

In [91]:
init_notebook_mode(connected=True)
fig=go.Figure()
fig.add_trace(go.Pie(
    labels=df['gender'].value_counts().index,
    values=df['gender'].value_counts().values,
    title='Gender Distribution'
))
iplot(fig)
fig.write_html("vis_15/Gender_Distribution.html") 


### Distribution of Age 

In [92]:
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Create histogram trace
fig = go.Figure(data=[go.Histogram(
    x=df['Age'], 
    nbinsx=20,  # Adjust number of bins as needed
    marker_color='skyblue',
    opacity=0.7
)])

# Customize the layout
fig.update_layout(
    title='Distribution of Age',
    xaxis_title='Age',
    yaxis_title='Frequency',
    template='plotly_white'
)

# Show the plot
fig.show()

fig.write_html("vis_15/Age_Distribution_with_Density_Curve.html")

In [93]:
import numpy as np
hist_data = [df['Age']]
group_labels = ['Age Distribution']
fig_kde = ff.create_distplot(
    hist_data, 
    group_labels, 
    show_hist=False, 
    show_curve=True,
    colors=['skyblue']
)
fig_kde.update_layout(
    title='Age Distribution with Density Curve',
    xaxis_title='Age',
    yaxis_title='Density',
    template='plotly_white'
)
fig_kde.show()
fig_kde.write_html("vis_15/Age_Distribution.html")

### Abonoment Distribution

In [94]:
fig=go.Figure()
fig.add_trace(go.Pie(
    labels=df['abonoment_type'].value_counts().index,
    values=df['abonoment_type'].value_counts().values,
    title='Abonoment Distribution'
))
iplot(fig)
fig.write_html("vis_15/abonoment_type.html") 

### Distribution of Visit Per Week 

In [95]:
fig = go.Figure()
fig.add_trace(go.Pie(
    labels=df['visit_per_week'].value_counts().index,
    values=df['visit_per_week'].value_counts().values,
    title='Visit Per Week Distribution',
    textinfo='label+percent+value',  # This will show label, percentage, and actual value
    texttemplate='%{label}<br> (%{percent})',  # Custom formatting
    textposition='auto'  # Automatically positions the text
))
iplot(fig)
fig.write_html("vis_15/visit_per_week_count.html")

### Count of Days Attendance

In [96]:
week_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
d = {day: 0 for day in week_days}  # Initialize dictionary with all days set to 0

def count_days(txt):
    for day in week_days:
        if day in txt:
            d[day] += 1

for t in df['days_per_week'].values:
    count_days(t)

sorted_day_count = dict(sorted(d.items(), key=lambda item: item[1],reverse=True))

# Optional: Create a bar plot to visualize the results
import plotly.graph_objs as go

fig = go.Figure(data=[go.Bar(
    x=list(sorted_day_count.keys()),
    y=list(sorted_day_count.values()),
    marker_color='skyblue'
)])

fig.update_layout(
    title='Gym Visits by Day of Week',
    xaxis_title='Day of Week',
    yaxis_title='Number of Visits',
    template='plotly_white'
)

fig.show()
fig.write_html("vis_15/Gym_Visits_by_Day_of_Week.html")

### Attend Group Lesson Distribution

In [97]:
fig=go.Figure()
fig.add_trace(go.Pie(
    labels=df['attend_group_lesson'].value_counts().index,
    values=df['attend_group_lesson'].value_counts().values,
    title='Attend Group Lesson Distribution'
))
iplot(fig)
fig.write_html("vis_15/attend_group_lesson_count.html") 

### Fav Group Lesson Distribution

In [98]:
from collections import Counter 
lessons=[]
for i,lesson_row in enumerate(df['fav_group_lesson'].values): 
    # print(i,lesson_row)
    if type(lesson_row)!=str:
        continue
    lessons.extend(lesson_row.split(', '))
print(lessons)
dict(Counter(lessons))

['HIT', 'BodyBalance', 'Pilates', 'LesMiles', 'Spinning', 'Kickboxen', 'Zumba', 'XCore', 'Yoga', 'LesMiles', 'Running', 'BodyBalance', 'Kickboxen', 'XCore', 'Pilates', 'Kickboxen', 'BodyPump', 'Zumba', 'BodyPump', 'XCore', 'LesMiles', 'Spinning', 'XCore', 'LesMiles', 'HIT', 'Zumba', 'Kickboxen', 'BodyBalance', 'BodyBalance', 'Zumba', 'BodyPump', 'XCore', 'XCore', 'Spinning', 'XCore', 'BodyPump', 'BodyBalance', 'HIT', 'Pilates', 'BodyBalance', 'LesMiles', 'Pilates', 'Yoga', 'HIT', 'Running', 'LesMiles', 'BodyBalance', 'Kickboxen', 'HIT', 'Yoga', 'BodyBalance', 'HIT', 'XCore', 'Pilates', 'LesMiles', 'Yoga', 'BodyBalance', 'Zumba', 'HIT', 'BodyPump', 'BodyBalance', 'BodyBalance', 'LesMiles', 'Pilates', 'HIT', 'Kickboxen', 'BodyBalance', 'XCore', 'Kickboxen', 'Yoga', 'LesMiles', 'BodyPump', 'Zumba', 'Running', 'Running', 'Spinning', 'BodyPump', 'Pilates', 'BodyPump', 'Spinning', 'XCore', 'BodyPump', 'LesMiles', 'Pilates', 'Kickboxen', 'Yoga', 'XCore', 'Yoga', 'BodyBalance', 'XCore']


{'HIT': 8,
 'BodyBalance': 13,
 'Pilates': 8,
 'LesMiles': 10,
 'Spinning': 5,
 'Kickboxen': 8,
 'Zumba': 6,
 'XCore': 12,
 'Yoga': 7,
 'Running': 4,
 'BodyPump': 9}

In [99]:
lessons_count=dict(Counter(lessons))
sorted_lessons_count = dict(sorted(lessons_count.items(), key=lambda item: item[1],reverse=True))

# Optional: Create a bar plot to visualize the results
import plotly.graph_objs as go

fig = go.Figure(data=[go.Bar(
    x=list(sorted_lessons_count.keys()),
    y=list(sorted_lessons_count.values()),
    marker_color='skyblue'
)])

fig.update_layout(
    title='Lessons Count',
    xaxis_title='Lesson',
    yaxis_title='Count of people',
    template='plotly_white'
)

fig.show()
fig.write_html("vis_15/Lessons_Count.html")

### Check-in Hours

In [100]:
check_in_hours=[]
for t in df['avg_time_check_in'].str.split(':'): 
    check_in_hours.append(int(t[0]))
check_in_hours_count=dict(Counter(check_in_hours))
check_in_hours_count    

{16: 4,
 20: 6,
 14: 9,
 10: 9,
 19: 4,
 8: 8,
 12: 6,
 17: 5,
 9: 9,
 13: 9,
 15: 3,
 11: 4,
 18: 1}

In [101]:
df['avg_time_check_in'] = df['avg_time_check_in'].astype(str)
df['avg_time_check_in'].str.split(':')

0     [16, 54, 00]
1     [20, 56, 00]
2     [14, 34, 00]
3     [10, 10, 00]
4     [19, 57, 00]
          ...     
72    [14, 18, 00]
73    [13, 59, 00]
74    [09, 23, 00]
75    [11, 56, 00]
76    [13, 36, 00]
Name: avg_time_check_in, Length: 77, dtype: object

In [102]:
def plot_check_hours_scatter(column_name):
    # First, prepare the data
    check_out_hours = []
    df[column_name] = df[column_name].astype(str)

    for t in df[column_name].str.split(':'): 
        check_out_hours.append(int(t[0]))
    check_out_hours_count = dict(Counter(check_out_hours))

    # Sort the dictionary by keys (hours) to ensure correct line connection
    sorted_hours = sorted(check_out_hours_count.items())

    fig = go.Figure(data=go.Scatter(
        x=[hour for hour, _ in sorted_hours],
        y=[count for _, count in sorted_hours],
        mode='markers+lines',  # Combines scatter points with connecting lines
        marker=dict(
            size=12,
            color=[hour for hour, _ in sorted_hours],  # Color based on hour
            colorscale='Cividis',  # You can change the colorscale
            showscale=False
        ),
        line=dict(color='lightgray', width=2),  # Customize line appearance
        text=[f'Hour: {hour}, Visits: {count}' for hour, count in sorted_hours],
        hoverinfo='text'
    ))
    in_out=str(column_name.split("_")[-1])
    fig.update_layout(
        title=f'Check-{in_out} Hours Distribution',
        xaxis_title=f'Hour of Check-{in_out}',
        yaxis_title='Number of Visits',
        template='plotly_white',
        xaxis=dict(
            tickmode='linear',
            tick0=min(check_out_hours_count.keys()),
            dtick=1
        )
    )

    fig.show()

    # Optional: If you want to save the plot
    fig.write_html(f"vis_15/{column_name}.html")

plot_check_hours_scatter('avg_time_check_in')

### Check-out Hours

In [103]:
plot_check_hours_scatter('avg_time_check_out')

#### Avg time in gym distribution

In [104]:
df.columns

Index(['id', 'gender', 'birthday', 'Age', 'abonoment_type', 'visit_per_week',
       'days_per_week', 'attend_group_lesson', 'fav_group_lesson',
       'avg_time_check_in', 'avg_time_check_out', 'avg_time_in_gym',
       'drink_abo', 'fav_drink', 'personal_training', 'name_personal_trainer',
       'uses_sauna', 'age_group'],
      dtype='object')

In [105]:
hist_data = [df['avg_time_in_gym']]
group_labels = ['Time in Gym Distribution']
fig_kde = ff.create_distplot(
    hist_data, 
    group_labels, 
    show_hist=False, 
    show_curve=True,
    colors=['skyblue']
)
fig_kde.update_layout(
    title=f'Distribution of Average Time Spent in Gym ',
    xaxis_title='Average Time in Gym (Hours)',
    yaxis_title='Number of Gym Members',
    template='plotly_white'
)
fig_kde.show()
fig_kde.write_html("vis_15/avg_time_in_gym_Distribution.html")

In [106]:
nbins=10
fig = go.Figure(data=[go.Histogram(
    x=df['avg_time_in_gym'],
    nbinsx=nbins,  # Specify 10 bins
    marker_color='skyblue',
    opacity=0.7
)])

fig.update_layout(
    title=f'Distribution of Average Time Spent in Gym ',
    xaxis_title='Average Time in Gym (Hours)',
    yaxis_title='Number of Gym Members',
    template='plotly_white'
)
fig.show()
fig.write_html("vis_15/avg_time_in_gym_Histogram.html")

### Drink Abonoment

In [107]:
# Drink Abonoment
fig_drink_abo = go.Figure(data=go.Pie(
    labels=df['drink_abo'].value_counts().index,
    values=df['drink_abo'].value_counts().values,
    title='Distribution of Drink Abonoment',
    textinfo='label+percent+value',
    textposition='auto',
    marker_colors=px.colors.qualitative.Pastel
))
fig_drink_abo.show()
fig_drink_abo.write_html("vis_15/drink_abo_distribution.html")

### Favorite Drink


In [108]:
# Favorite Drink
included=10
fig_fav_drink = go.Figure(data=go.Bar(
    x=df['fav_drink'].value_counts().sort_values(ascending=False)[:included].index,
    y=df['fav_drink'].value_counts().sort_values(ascending=False)[:included].values,
    marker_color='skyblue',
    text=df['fav_drink'].value_counts().values,
    textposition='outside'
))
fig_fav_drink.update_layout(
    title='Distribution of Favorite Drinks',
    xaxis_title='Drink',
    yaxis_title='Number of Members',
    template='plotly_white'
)
fig_fav_drink.show()
fig_fav_drink.write_html("vis_15/fav_drink_distribution.html")

### Personal Training


In [109]:
# Personal Training
fig_personal_training = go.Figure(data=go.Pie(
    labels=df['personal_training'].value_counts().index.astype(str),
    values=df['personal_training'].value_counts().values,
    title='Personal Training Participation',
    textinfo='label+percent+value',
    textposition='auto',
    marker_colors=['lightgreen', 'lightcoral']
))
fig_personal_training.show()
fig_personal_training.write_html("vis_15/personal_training_distribution.html")

### Personal Trainer Names

In [110]:
# Personal Trainer Names
trainer_counts = df['name_personal_trainer'].value_counts()
fig_trainer = go.Figure(data=go.Bar(
    x=trainer_counts.index,
    y=trainer_counts.values,
    marker_color=px.colors.qualitative.Pastel,
    text=trainer_counts.values,
    textposition='outside'
))
fig_trainer.update_layout(
    title='Distribution of Personal Trainers',
    xaxis_title='Trainer Name',
    yaxis_title='Number of Clients',
    template='plotly_white',
    xaxis_tickangle=-45
)
fig_trainer.show()
fig_trainer.write_html("vis_15/personal_trainer_distribution.html")

### Sauna Usage


In [111]:
# Sauna Usage
fig_sauna = go.Figure(data=go.Pie(
    labels=df['uses_sauna'].value_counts().index.astype(str),
    values=df['uses_sauna'].value_counts().values,
    title='Sauna Usage Distribution',
    textinfo='label+percent+value',
    textposition='auto',
    marker_colors=['lightsalmon', 'lightblue']
))
fig_sauna.show()
fig_sauna.write_html("vis_15/sauna_usage_distribution.html")

### Average Time in Gym by Abonoment Type and Gender

In [112]:
pivot_time_abo_gender = df.pivot_table(
    values='avg_time_in_gym', 
    index='abonoment_type', 
    columns='gender', 
    aggfunc='mean'
)
print("Average Time in Gym by Abonoment Type and Gender:")
print(pivot_time_abo_gender)

Average Time in Gym by Abonoment Type and Gender:
gender              Female        Male
abonoment_type                        
Premium          90.166667  108.500000
Standard        110.909091  102.684211


In [113]:
# Create pivot table
pivot_time_abo_gender = df.pivot_table(
    values='avg_time_in_gym', 
    index='abonoment_type', 
    columns='gender', 
    aggfunc='mean'
)

print("Average Time in Gym by Abonoment Type and Gender:")
print(pivot_time_abo_gender)

# Visualization 1: Grouped Bar Chart
fig_bar = go.Figure(data=[
    go.Bar(
        name='Female',
        x=pivot_time_abo_gender.index,
        y=pivot_time_abo_gender['Female'],
        text=[f'{val:.2f}' for val in pivot_time_abo_gender['Female']],
        textposition='auto'
    ),
    go.Bar(
        name='Male',
        x=pivot_time_abo_gender.index,
        y=pivot_time_abo_gender['Male'],
        text=[f'{val:.2f}' for val in pivot_time_abo_gender['Male']],
        textposition='auto'
    )
])

fig_bar.update_layout(
    title='Average Time in Gym by Abonoment Type and Gender',
    xaxis_title='Abonoment Type',
    yaxis_title='Average Time in Gym (Hours)',
    barmode='group',
    template='plotly_white'
)

# Visualization 2: Heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=pivot_time_abo_gender.values,
    x=pivot_time_abo_gender.columns,
    y=pivot_time_abo_gender.index,
    colorscale='Viridis',
    text=pivot_time_abo_gender.values.round(2),
    texttemplate='%{text}',
    textfont={"size":10}
))

fig_heatmap.update_layout(
    title='Heatmap: Average Time in Gym by Abonoment Type and Gender',
    xaxis_title='Gender',
    yaxis_title='Abonoment Type',
    template='plotly_white'
)

# Visualization 3: Pie Charts for each Abonoment Type
fig_pie = make_subplots(
    rows=1, cols=2, 
    subplot_titles=['Premium Membership', 'Standard Membership'],
    specs=[[{'type':'domain'}, {'type':'domain'}]]
)

for idx, abo_type in enumerate(pivot_time_abo_gender.index):
    fig_pie.add_trace(
        go.Pie(
            labels=['Female', 'Male'],
            values=pivot_time_abo_gender.loc[abo_type].values,
            textinfo='label+percent+value',
            marker_colors=['lightpink', 'lightblue'],
            
        ),
        row=1, 
        col=idx+1,
    )

fig_pie.update_layout(
    title='Distribution of Average Gym Time by Gender for Each Membership Type',
    height=500,
    width=1000,
    template='plotly_white',
)

# Display and save visualizations
fig_bar.show()
fig_bar.write_html("vis_15/avg_gym_time_by_abo_gender_bar.html")

# fig_heatmap.show()
fig_heatmap.write_html("vis_15/avg_gym_time_by_abo_gender_heatmap.html")

fig_pie.show()
fig_pie.write_html("vis_15/avg_gym_time_by_abo_gender_pie.html")

# Additional statistical summary
print("\nDescriptive Statistics:")
print(pivot_time_abo_gender.describe())

Average Time in Gym by Abonoment Type and Gender:
gender              Female        Male
abonoment_type                        
Premium          90.166667  108.500000
Standard        110.909091  102.684211



Descriptive Statistics:
gender      Female        Male
count     2.000000    2.000000
mean    100.537879  105.592105
std      14.667109    4.112384
min      90.166667  102.684211
25%      95.352273  104.138158
50%     100.537879  105.592105
75%     105.723485  107.046053
max     110.909091  108.500000


### Visit Frequency by Age Group and Abonoment Type


In [114]:
# Create age groups
df['age_group'] = pd.cut(df['Age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])

pivot_visits_age_abo = df.pivot_table(
    values='visit_per_week', 
    index='age_group', 
    columns='abonoment_type', 
    aggfunc='count'
)
print("\nAverage Visits per Week by Age Group and Abonoment Type:")
print(pivot_visits_age_abo)


Average Visits per Week by Age Group and Abonoment Type:
abonoment_type  Premium  Standard
age_group                        
18-25                36        41
26-35                 0         0
36-45                 0         0
46-55                 0         0
55+                   0         0


### Personal Training and Sauna Usage by Gender


In [115]:
pivot_training_sauna = df.pivot_table(
    values='personal_training', 
    index='gender', 
    columns='uses_sauna', 
    aggfunc='count'
)
print("\nPersonal Training and Sauna Usage by Gender:")
print(pivot_training_sauna)


Personal Training and Sauna Usage by Gender:
uses_sauna  False  True 
gender                  
Female         20     20
Male           19     18


In [116]:
# Create pivot table
pivot_training_sauna = df.pivot_table(
    values='personal_training', 
    index='gender', 
    columns='uses_sauna', 
    aggfunc='count'
)

# Stacked Bar Chart
# fig_stacked = go.Figure(data=[
#     go.Bar(
#         name='Uses Sauna',
#         x=pivot_training_sauna.index,
#         y=pivot_training_sauna[True],
#         text=pivot_training_sauna[True],
#         textposition='auto'
#     ),
#     go.Bar(
#         name='Does Not Use Sauna',
#         x=pivot_training_sauna.index,
#         y=pivot_training_sauna[False],
#         text=pivot_training_sauna[False],
#         textposition='auto'
#     )
# ])

# fig_stacked.update_layout(
#     title='Sauna Usage by Gender',
#     xaxis_title='Gender',
#     yaxis_title='Number of Members',
#     barmode='stack',
#     template='plotly_white'
# )

# # fig_stacked.show()
# fig_stacked.write_html("vis_15/sauna_usage_by_gender_stacked.html")

# Percentage Stacked Bar Chart
# Calculate percentages
pivot_training_sauna_pct = pivot_training_sauna.div(pivot_training_sauna.sum(axis=1), axis=0) * 100

# fig_stacked_pct = go.Figure(data=[
#     go.Bar(
#         name='Uses Sauna',
#         x=pivot_training_sauna_pct.index,
#         y=pivot_training_sauna_pct[True],
#         text=[f'{val:.1f}%' for val in pivot_training_sauna_pct[True]],
#         textposition='auto'
#     ),
#     go.Bar(
#         name='Does Not Use Sauna',
#         x=pivot_training_sauna_pct.index,
#         y=pivot_training_sauna_pct[False],
#         text=[f'{val:.1f}%' for val in pivot_training_sauna_pct[False]],
#         textposition='auto'
#     )
# ])

# fig_stacked_pct.update_layout(
#     title='Sauna Usage Percentage by Gender',
#     xaxis_title='Gender',
#     yaxis_title='Percentage of Members',
#     barmode='stack',
#     template='plotly_white'
# )

# fig_stacked_pct.show()
# fig_stacked_pct.write_html("vis_15/sauna_usage_by_gender_percentage.html")

# Pie Charts for each gender
fig_pie = make_subplots(
    rows=1, cols=2, 
    subplot_titles=['Female Sauna Usage', 'Male Sauna Usage'],
    specs=[[{'type':'domain'}, {'type':'domain'}]]
)

for idx, gender in enumerate(['Female', 'Male']):
    gender_data = pivot_training_sauna.loc[gender]
    
    fig_pie.add_trace(
        go.Pie(
            labels=['Uses Sauna', 'Does Not Use Sauna'],
            values=gender_data.values,
            textinfo='percent+value',
            marker_colors=['lightgreen', 'lightcoral']
        ),
        row=1, 
        col=idx+1
    )

fig_pie.update_layout(
    title='Sauna Usage Distribution by Gender',
    height=500,
    width=1000,
    template='plotly_white'
)

fig_pie.show()
fig_pie.write_html("vis_15/sauna_usage_by_gender_pie.html")

### 



In [117]:
average_time = df.sort_values(by='Age', ascending=False).head(20)['avg_time_in_gym'].mean()
print(f"average time spent in gym: {average_time} minute")

average time spent in gym: 124.8 minute


### Summary Statistics by Gender

In [118]:
with open('outputs/gender_summary_stats_visit.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["visit_per_week"]].describe().round(1)))
df.groupby('gender')[['visit_per_week']].describe().round(1)

Unnamed: 0_level_0,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,40.0,3.0,1.2,1.0,2.0,3.0,3.0,5.0
Male,37.0,2.7,1.2,1.0,2.0,3.0,3.0,5.0


In [119]:
with open('outputs/gender_summary_stats_time.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["avg_time_in_gym"]].describe().round(1)))
df.groupby('gender')[['avg_time_in_gym']].describe().round(1)

Unnamed: 0_level_0,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,40.0,101.6,44.9,32.0,64.5,105.0,137.5,176.0
Male,37.0,105.5,44.4,30.0,70.0,107.0,138.0,180.0


In [120]:
with open('outputs/gender_summary_stats_abo.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["abonoment_type"]].describe().round(1)))
df.groupby('gender')[['abonoment_type']].describe().round(1)

Unnamed: 0_level_0,abonoment_type,abonoment_type,abonoment_type,abonoment_type
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,40,2,Standard,22
Male,37,2,Standard,19


In [121]:
with open('outputs/gender_summary_stats_attend.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["attend_group_lesson"]].describe().round(1)))
df.groupby('gender')[['attend_group_lesson']].describe().round(1)

Unnamed: 0_level_0,attend_group_lesson,attend_group_lesson,attend_group_lesson,attend_group_lesson
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,40,2,True,24
Male,37,2,True,19


In [122]:
with open('outputs/gender_summary_stats_drink.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["drink_abo"]].describe().round(1)))
df.groupby('gender')[['drink_abo']].describe().round(1)

Unnamed: 0_level_0,drink_abo,drink_abo,drink_abo,drink_abo
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,40,2,False,21
Male,37,2,False,19


In [123]:
with open('outputs/gender_summary_stats_personal.txt', 'w') as f:
    f.write(str(df.groupby('gender')[["personal_training"]].describe().round(1)))
df.groupby('gender')[['personal_training']].describe().round(1)

Unnamed: 0_level_0,personal_training,personal_training,personal_training,personal_training
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,40,2,False,25
Male,37,2,False,24


In [124]:

# Get the value counts for male personal trainers
male_trainers_counts = df[df['gender']=="Male"]['name_personal_trainer'].value_counts()

# Create a Plotly bar plot
fig = px.bar(
    x=male_trainers_counts.index, 
    y=male_trainers_counts.values, 
    title='Number of Male Clients per Personal Trainer',
    labels={'x': 'Personal Trainer', 'y': 'Number of Male Clients'}
)

# Optional: Improve readability
fig.update_layout(
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis_15/male_trainers_distribution.html')

In [125]:

# Get the value counts for male personal trainers
female_trainers_counts = df[df['gender']=="Female"]['name_personal_trainer'].value_counts()

# Create a Plotly bar plot
fig = px.bar(
    x=female_trainers_counts.index, 
    y=female_trainers_counts.values, 
    title='Number of Female Clients per Personal Trainer',
    labels={'x': 'Personal Trainer', 'y': 'Number of Female Clients'}
)

# Optional: Improve readability
fig.update_layout(
    xaxis_tickangle=-45,  # Rotate x-axis labels for better readability
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis_15/female_trainers_distribution.html')

In [126]:
# Get the value counts for female sauna usage
female_sauna_counts = df[df['gender']=="Female"]['uses_sauna'].value_counts()

# Create a Plotly pie plot
fig = px.pie(
    values=female_sauna_counts.values, 
    names=female_sauna_counts.index, 
    title='Female Clients Sauna Usage',
    hole=0.3,
    # color_discrete_sequence=['blue', 'red']  # Specify exact colors
  # Optional: creates a donut chart effect
)

# Optional: Customize layout
fig.update_layout(
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis_15/females_using_sauna.html')

In [127]:
# Get the value counts for female sauna usage
male_sauna_counts = df[df['gender']=="Male"]['uses_sauna'].value_counts()

# Create a Plotly pie plot
fig = px.pie(
    values=male_sauna_counts.values, 
    names=male_sauna_counts.index, 
    title='Male Clients Sauna Usage',
    hole=0.3,
    color_discrete_sequence=px.colors.qualitative.Pastel  # Pastel color palette

)

# Optional: Customize layout
fig.update_layout(
    height=600,  # Adjust height as needed
    width=800   # Adjust width as needed
)

# Display the plot
fig.show()

# Optional: Save the plot as an interactive HTML file
fig.write_html('vis_15/males_using_sauna.html')

In [128]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,18,Female,2011-04-30,13,Standard,3,"Sat, Thu, Tue",False,,16:54:00,18:53:00,119,False,,True,Mike,True,18-25
1,23,Female,2010-10-22,13,Standard,2,"Mon, Tue",True,HIT,20:56:00,23:36:00,160,True,"passion_fruit, orange",True,Mike,False,18-25
2,24,Male,2010-01-24,14,Premium,3,"Sat, Tue, Wed",False,,14:34:00,16:33:00,119,True,"lemon, coconut_pineapple",True,Jeffrey,True,18-25
3,95,Female,2012-05-10,12,Standard,3,"Fri, Thu, Wed",True,BodyBalance,10:10:00,12:50:00,160,True,berry_boost,False,,True,18-25
4,106,Male,2012-08-28,12,Premium,3,"Fri, Sun, Thu",True,"Pilates, LesMiles",19:57:00,21:28:00,91,True,"orange, black_currant",False,,False,18-25


In [129]:
def plot_lessons_count(gender):
    lessons=[]
    for i,lesson_row in enumerate(df[df['gender']==gender]['fav_group_lesson'].values): 
        # print(i,lesson_row)
        if type(lesson_row)!=str:
            continue
        lessons.extend(lesson_row.split(', '))
    # print(lessons)
    dict(Counter(lessons))
    lessons_count=dict(Counter(lessons))
    sorted_lessons_count = dict(sorted(lessons_count.items(), key=lambda item: item[1],reverse=True))

    # Optional: Create a bar plot to visualize the results
    import plotly.graph_objs as go

    fig = go.Figure(data=[go.Bar(
        x=list(sorted_lessons_count.keys()),
        y=list(sorted_lessons_count.values()),
        marker_color='skyblue'
    )])

    fig.update_layout(
        title=f'Lessons Count by {gender}',
        xaxis_title='Lesson',
        yaxis_title=f'Count of {gender}',
        template='plotly_white'
    )

    fig.show()
    fig.write_html(f"vis_15/{gender}_Lessons_Count.html")

plot_lessons_count("Female")

In [130]:
plot_lessons_count("Male")

### Age Group Summary Statistics by Gender

In [131]:
df.head()

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,18,Female,2011-04-30,13,Standard,3,"Sat, Thu, Tue",False,,16:54:00,18:53:00,119,False,,True,Mike,True,18-25
1,23,Female,2010-10-22,13,Standard,2,"Mon, Tue",True,HIT,20:56:00,23:36:00,160,True,"passion_fruit, orange",True,Mike,False,18-25
2,24,Male,2010-01-24,14,Premium,3,"Sat, Tue, Wed",False,,14:34:00,16:33:00,119,True,"lemon, coconut_pineapple",True,Jeffrey,True,18-25
3,95,Female,2012-05-10,12,Standard,3,"Fri, Thu, Wed",True,BodyBalance,10:10:00,12:50:00,160,True,berry_boost,False,,True,18-25
4,106,Male,2012-08-28,12,Premium,3,"Fri, Sun, Thu",True,"Pilates, LesMiles",19:57:00,21:28:00,91,True,"orange, black_currant",False,,False,18-25


### Time Range people count

In [132]:
def get_people_hour_count(start_hour):
    # Ensure start_hour is within 0-23 range
    start_hour = start_hour % 24
    
    # Convert the time column to datetime time format
    df['avg_time_check_in'] = pd.to_datetime(df['avg_time_check_in'], format='%H:%M:%S').dt.time
    
    # Handle the special case for 23:00 to 00:00
    if start_hour == 23:
        df_filtered = df[(df['avg_time_check_in'] >= pd.to_datetime('23:00:00').time()) | 
                         (df['avg_time_check_in'] < pd.to_datetime('00:00:00').time())]
    else:
        # Filter the DataFrame for times between start_hour:00:00 and (start_hour+1):00:00
        df_filtered = df[(df['avg_time_check_in'] >= pd.to_datetime(f'{start_hour:02d}:00:00').time()) & 
                         (df['avg_time_check_in'] < pd.to_datetime(f'{(start_hour+1):02d}:00:00').time())]

    # Return the count
    return len(df_filtered)

# Collect results in a dictionary
hourly_attendance = {f"{hour:02d}:00": get_people_hour_count(hour) for hour in range(8, 24)}

# Visualization
import plotly.express as px
import plotly.graph_objs as go

# Create a bar chart of hourly attendance
fig = px.bar(
    x=list(hourly_attendance.keys()), 
    y=list(hourly_attendance.values()),
    title='Hourly Gym Attendance (8:00 AM - 11:59 PM)',
    labels={'x': 'Time', 'y': 'Number of People'}
)

fig.update_layout(
    xaxis_title='Time',
    yaxis_title='Number of People',
    template='plotly_white',
    xaxis_tickangle=-45,
    height=600,
    width=1000
)

fig.show()

# Print the results
# print("\nHourly Attendance:")
# for time, count in hourly_attendance.items():
#     print(f"{time}: {count} people")

# Line plot for trend visualization
fig_line = px.line(
    x=list(hourly_attendance.keys()), 
    y=list(hourly_attendance.values()),
    title='Hourly Gym Attendance Trend (8:00 AM - 11:59 PM)',
    labels={'x': 'Time', 'y': 'Number of People'}
)

fig_line.update_layout(
    xaxis_title='Time',
    yaxis_title='Number of People',
    template='plotly_white',
    xaxis_tickangle=-45,
    height=600,
    width=1000,
    # markers=True
)

fig_line.show()
fig_line.write_html('vis_15/Hourly_Gym_Attendance_Trend.html') 

### People staying at gym for most time 

In [133]:
df.sort_values(by='avg_time_in_gym',ascending=False).head(10) 

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
31,427,Male,2010-05-02,14,Premium,3,"Fri, Mon, Sun",False,,16:05:00,19:05:00,180,True,black_currant,True,Mike,False,18-25
60,834,Male,2010-03-28,14,Standard,3,"Sat, Sun, Wed",False,,19:16:00,22:13:00,177,True,passion_fruit,True,Hanna,False,18-25
23,276,Male,2010-04-02,14,Standard,2,"Thu, Tue",True,LesMiles,11:21:00,14:18:00,177,False,,False,,True,18-25
14,182,Female,2010-02-16,14,Standard,3,"Mon, Sun, Wed",True,XCore,09:01:00,11:57:00,176,False,,True,Chantal,True,18-25
65,883,Female,2010-10-04,14,Premium,3,"Fri, Mon, Tue",True,Spinning,15:31:00,18:22:00,171,False,,False,,True,18-25
43,598,Female,2009-11-05,14,Standard,2,"Fri, Thu",True,"BodyBalance, LesMiles, Pilates",09:21:00,12:12:00,171,False,,True,Mike,False,18-25
73,958,Female,2010-03-25,14,Standard,3,"Fri, Sun, Thu",True,Yoga,13:59:00,16:41:00,162,True,lemon,False,,True,18-25
63,875,Male,2010-01-16,14,Premium,2,"Fri, Tue",True,"BodyPump, Zumba, Running",13:45:00,16:26:00,161,False,,False,,False,18-25
3,95,Female,2012-05-10,12,Standard,3,"Fri, Thu, Wed",True,BodyBalance,10:10:00,12:50:00,160,True,berry_boost,False,,True,18-25
1,23,Female,2010-10-22,13,Standard,2,"Mon, Tue",True,HIT,20:56:00,23:36:00,160,True,"passion_fruit, orange",True,Mike,False,18-25


### People staying at gym for least time

In [134]:
df.sort_values(by='avg_time_in_gym',ascending=True).head(10) 

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
37,502,Male,2010-11-29,13,Premium,1,Fri,False,,16:10:00,16:40:00,30,True,lemon,False,,False,18-25
50,661,Male,2009-12-05,14,Standard,3,"Sat, Sun, Thu",True,"XCore, Pilates, LesMiles",17:02:00,17:32:00,30,False,,False,,False,18-25
76,968,Female,2011-05-25,13,Standard,3,"Mon, Tue, Wed",False,,13:36:00,14:08:00,32,False,,True,Mike,False,18-25
56,745,Female,2011-05-01,13,Standard,3,"Fri, Sat, Wed",True,"BodyBalance, LesMiles, Pilates",13:37:00,14:09:00,32,True,passion_fruit,False,,False,18-25
22,275,Male,2011-06-09,13,Standard,2,"Fri, Tue",False,,09:38:00,10:11:00,33,False,,False,,True,18-25
69,921,Female,2011-02-04,13,Premium,1,Tue,True,"BodyPump, Pilates",12:39:00,13:13:00,34,False,,False,,False,18-25
17,190,Male,2011-09-08,13,Standard,4,"Sat, Sun, Thu, Tue",False,,14:47:00,15:21:00,34,False,,False,,False,18-25
5,115,Female,2010-09-16,14,Premium,3,"Sat, Sun, Tue",False,,14:36:00,15:11:00,35,False,,True,Jeffrey,False,18-25
71,936,Male,2010-12-06,13,Premium,3,"Mon, Thu, Wed",True,"BodyPump, LesMiles, Pilates",15:31:00,16:07:00,36,True,orange,False,,True,18-25
51,664,Female,2011-03-27,13,Standard,1,Thu,False,,10:20:00,10:57:00,37,True,coconut_pineapple,False,,True,18-25


## Correlations

In [135]:
numerical_cols = df.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns

# Calculate correlation matrix for numerical columns
correlation_matrix = df[numerical_cols].corr()
correlation_matrix

Unnamed: 0,Age,visit_per_week,avg_time_in_gym
Age,1.0,-0.014244,0.124146
visit_per_week,-0.014244,1.0,0.20195
avg_time_in_gym,0.124146,0.20195,1.0


In [136]:
# Select only numerical columns
numerical_cols = df.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns

# Calculate correlation matrix for numerical columns
correlation_matrix = df[numerical_cols].corr()

# Visualization using Plotly
import plotly.express as px

# Heatmap of correlation matrix
fig = px.imshow(
    correlation_matrix, 
    title='Correlation Heatmap of Numerical Columns',
    color_continuous_scale='RdBu_r',  # Red-Blue diverging color scale
    text_auto=True  # Show correlation values
)

# Customize layout
fig.update_layout(
    width=800,
    height=800
)

fig.show()

# Print the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Optional: Identify highly correlated features
def get_high_correlations(corr_matrix, threshold=0.5):
    high_corr = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                high_corr.append((
                    corr_matrix.columns[i], 
                    corr_matrix.columns[j], 
                    corr_matrix.iloc[i, j]
                ))
    return high_corr

high_correlations = get_high_correlations(correlation_matrix)
print("\nHighly Correlated Features (|correlation| > 0.5):")
for feat1, feat2, corr_value in high_correlations:
    print(f"{feat1} - {feat2}: {corr_value:.2f}")


Correlation Matrix:
                      Age  visit_per_week  avg_time_in_gym
Age              1.000000       -0.014244         0.124146
visit_per_week  -0.014244        1.000000         0.201950
avg_time_in_gym  0.124146        0.201950         1.000000

Highly Correlated Features (|correlation| > 0.5):
