# Exploratory Data Analysis (EDA) of Gym Membership Dataset

## Project Overview
This notebook performs an in-depth exploratory data analysis of a gym membership dataset. We'll investigate member demographics, attendance patterns, and other key insights.

**Dataset Source**: Local CSV file at `../data/over_45_age.csv`
**Total Entries**: 132 gym members (ages 45–49 in this extract)

## Importing Libraries

In [105]:
# Import standard-library helpers
import os
from collections import Counter
from pprint import pprint

# Import data manipulation and analysis libraries
import numpy as np
import pandas as pd

# Import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import interactive plotting libraries
import plotly.express as px
import plotly.figure_factory as ff
from plotly import graph_objects as go
import plotly.graph_objs as go  # second binding of `go`, kept last as before
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots


# Initialize Plotly's offline notebook mode so the iplot(...) calls in the cells
# below can render their figures inline in the cell output.
init_notebook_mode(connected=True)

## Loading Dataset 

In [106]:
# Read the member extract into the working DataFrame used by every cell below
df = pd.read_csv('../data/over_45_age.csv')

# Preview the leading rows as a quick sanity check on columns and values
print("First 5 rows of the dataset:")
df.head(5)

First 5 rows of the dataset:


Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
0,2,Female,1977-09-18,47,Standard,3,"Mon, Sat, Wed",False,,19:31:00,20:19:00,48,False,,True,Chantal,False,46-55
1,9,Male,1978-07-28,46,Premium,3,"Sat, Sun, Thu",True,BodyPump,09:45:00,11:17:00,92,True,"orange, lemon",True,Mike,False,46-55
2,34,Female,1976-12-07,47,Premium,2,"Mon, Sat",True,"XCore, Running, BodyBalance",14:42:00,16:08:00,86,False,,True,Hanna,True,46-55
3,45,Male,1976-01-19,48,Premium,2,"Tue, Wed",False,,09:57:00,12:35:00,158,False,,True,Hanna,True,46-55
4,49,Female,1975-10-23,48,Premium,3,"Fri, Thu, Tue",True,"Yoga, BodyBalance",11:56:00,14:13:00,137,False,,True,Chantal,True,46-55


In [107]:
# Display comprehensive information about the dataset: one line per column with its
# non-null count and dtype. info() writes its report to stdout itself, so it is
# called bare rather than wrapped in print().
print("Dataset Information:")
df.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     132 non-null    int64 
 1   gender                 132 non-null    object
 2   birthday               132 non-null    object
 3   Age                    132 non-null    int64 
 4   abonoment_type         132 non-null    object
 5   visit_per_week         132 non-null    int64 
 6   days_per_week          132 non-null    object
 7   attend_group_lesson    132 non-null    bool  
 8   fav_group_lesson       64 non-null     object
 9   avg_time_check_in      132 non-null    object
 10  avg_time_check_out     132 non-null    object
 11  avg_time_in_gym        132 non-null    int64 
 12  drink_abo              132 non-null    bool  
 13  fav_drink              76 non-null     object
 14  personal_training      132 non-null    bool  
 15  na

In [108]:
# Report the size of the dataset as a (rows, columns) tuple
print("\nDataset Dimensions:")
df.shape


Dataset Dimensions:


(132, 18)

In [109]:
# List the column names available for the analysis below
print("Dataset Columns:")
df.columns

Dataset Columns:


Index(['id', 'gender', 'birthday', 'Age', 'abonoment_type', 'visit_per_week',
       'days_per_week', 'attend_group_lesson', 'fav_group_lesson',
       'avg_time_check_in', 'avg_time_check_out', 'avg_time_in_gym',
       'drink_abo', 'fav_drink', 'personal_training', 'name_personal_trainer',
       'uses_sauna', 'age_group'],
      dtype='object')

In [110]:
# Summarize the numeric columns, leaving out the 'id' identifier column and the
# 'count' statistic, then transpose so each remaining column becomes one row
print("Descriptive Statistics for Numerical Columns:")
df.describe().drop(columns='id').drop(index='count').T

Descriptive Statistics for Numerical Columns:


Unnamed: 0,mean,std,min,25%,50%,75%,max
Age,47.060606,1.481476,45.0,46.0,47.0,48.0,49.0
visit_per_week,2.590909,1.178397,1.0,2.0,3.0,3.0,5.0
avg_time_in_gym,100.022727,44.867484,30.0,56.75,92.5,138.0,180.0


## Exploratory Data Analysis workflow for the Gym Membership Dataset

### Gender Distribution

In [111]:
# Make sure the output folder for the saved charts exists
os.makedirs('../over_45/vis_45', exist_ok=True)

# Tally members per gender once and reuse the result for labels and values
gender_counts = df['gender'].value_counts()

# Interactive pie chart; the hover box shows label, count and percentage
gender_pie = go.Pie(
    labels=gender_counts.index,
    values=gender_counts.values,
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}'
)
fig = go.Figure(data=[gender_pie])

# Centered, slightly larger title for readability
fig.update_layout(
    title_text='Gender Distribution of Gym Members',
    title_font_size=16,
    title_x=0.5
)

# Keep a standalone interactive copy on disk
fig.write_html("../over_45/vis_45/Gender_Distribution.html")

# Render the chart inline in the notebook
iplot(fig)


### Distribution of Age 

In [112]:

# Histogram trace of member ages
age_histogram = go.Histogram(
    x=df['Age'],
    nbinsx=20,  # Adjust number of bins as needed
    marker_color='skyblue',
    opacity=0.7
)
fig = go.Figure(data=[age_histogram])

# Titles, axis labels and theme
histogram_layout = dict(
    title='Distribution of Age',
    xaxis_title='Age',
    yaxis_title='Frequency',
    template='plotly_white'
)
fig.update_layout(**histogram_layout)

# Render inline
iplot(fig)

# Save an interactive copy.
# NOTE(review): the file name mentions a density curve, but this figure contains
# only the histogram; the name is kept unchanged so existing links keep working.
fig.write_html("../over_45/vis_45/Age_Distribution_with_Density_Curve.html")

In [113]:
# Kernel Density Estimation (KDE) view of the age distribution.
# create_distplot takes a list of datasets plus a matching list of labels,
# hence the single-element lists.
hist_data = [df['Age']]
group_labels = ['Age Distribution']

# Curve only: histogram bars are switched off
fig_kde = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    colors=['skyblue'],
    show_hist=False,
    show_curve=True
)

# Titles, axis labels and theme
kde_layout = dict(
    title='Age Distribution of Gym Members',
    xaxis_title='Age',
    yaxis_title='Density',
    template='plotly_white'
)
fig_kde.update_layout(**kde_layout)

# Render inline
iplot(fig_kde)

# Save an interactive copy
fig_kde.write_html("../over_45/vis_45/Age_Distribution.html")

### Abonoment Distribution

In [114]:

# Count members per membership ('abonoment') type once
abo_counts = df['abonoment_type'].value_counts()

# Pie chart of the membership types; the title is attached to the pie trace itself
fig = go.Figure(data=[go.Pie(
    labels=abo_counts.index,
    values=abo_counts.values,
    title='Abonoment Distribution'
)])

# Render inline
iplot(fig)

# Save an interactive copy
fig.write_html("../over_45/vis_45/abonoment_type.html")


### Distribution of Visit Per Week 

In [115]:
# Count how many members fall into each visits-per-week value (computed once and
# reused for both labels and values)
visit_counts = df['visit_per_week'].value_counts()

# Create a new Plotly figure
fig = go.Figure()

# Add a pie chart trace for 'visit_per_week' distribution
fig.add_trace(go.Pie(
    labels=visit_counts.index,  # Distinct visits-per-week values
    values=visit_counts.values,  # Number of members for each value
    # texttemplate defines the slice text (label plus percentage). Per the Plotly
    # docs it overrides textinfo, so textinfo is not set here.
    texttemplate='%{label}<br> (%{percent})',
    textposition='auto'  # Automatically position text labels
))
fig.update_layout(
    title='Visit Per Week Distribution',
)
# Display the plot
iplot(fig)

# Save the plot as an interactive HTML file
fig.write_html("../over_45/vis_45/visit_per_week_count.html")


### Attendance Count by Day for Members Who Attend on a Single Day

In [116]:
week_days = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']

# Frequency of every distinct 'days_per_week' entry, computed once instead of once
# per weekday. An entry equals a bare day name only when that member attends on
# that single day, so the lookup below counts single-day attendees.
day_pattern_counts = df['days_per_week'].value_counts()

# Print the count for each weekday; .get() returns 0 when no member attends only on that day
for day in week_days:
    print(day, day_pattern_counts.get(day, 0))



Sun 6
Mon 4
Tue 5
Wed 2
Thu 1
Fri 6
Sat 2


### Attendance Count by Day of the Week

In [117]:

# Define the order of weekdays
week_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Tally, for each weekday, how many members list it in 'days_per_week'.
# A day counts when its abbreviation occurs anywhere in the entry (e.g. 'Mon' in
# "Mon, Sat, Wed"). Missing entries are skipped by dropna() instead of raising.
day_tally = Counter(
    day
    for entry in df['days_per_week'].dropna()
    for day in week_days
    if day in entry
)

# Keep every weekday in the result, including days nobody attends (count 0)
d = {day: day_tally[day] for day in week_days}

# Sort days by visit count in descending order (ties keep the Mon..Sun order)
sorted_day_count = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))

# Create a bar chart using Plotly
fig = go.Figure(data=[go.Bar(
    x=list(sorted_day_count.keys()),  # Days of the week
    y=list(sorted_day_count.values()),  # Corresponding visit counts
    marker_color='skyblue'  # Set bar color
)])

# Customize layout
fig.update_layout(
    title='Gym Visits by Day of Week',  # Chart title
    xaxis_title='Day of Week',  # X-axis label
    yaxis_title='Number of Visits',  # Y-axis label
    template='plotly_white'  # Use a clean theme
)

# Display the plot
iplot(fig)

# Save the chart as an interactive HTML file
fig.write_html("../over_45/vis_45/Gym_Visits_by_Day_of_Week.html")


### Attend Group Lesson Distribution

In [118]:

# Count members who do / do not attend group lessons (computed once)
group_lesson_counts = df['attend_group_lesson'].value_counts()

# Create a new Plotly figure
fig = go.Figure()

# Add a pie chart trace for 'attend_group_lesson' distribution
fig.add_trace(go.Pie(
    # The column is boolean; cast the labels to str ('True' / 'False') so they are
    # handled the same way as in the personal-training and sauna pie charts
    labels=group_lesson_counts.index.astype(str),
    values=group_lesson_counts.values,  # Number of members per category
    title='Attend Group Lesson Distribution'  # Title attached to the pie trace
))

# Display the plot
iplot(fig)

# Save the plot as an interactive HTML file
fig.write_html("../over_45/vis_45/attend_group_lesson_count.html")


### Fav Group Lesson Distribution

In [119]:
# Flatten the comma-separated 'fav_group_lesson' entries into one list of lesson
# names. Non-string values (missing entries) are skipped; every remaining entry,
# e.g. "Yoga, BodyBalance", is split on ', ' into its individual lessons.
lessons = [
    lesson
    for lesson_row in df['fav_group_lesson'].values
    if isinstance(lesson_row, str)
    for lesson in lesson_row.split(', ')
]

# Print the list of all lessons
print(lessons)

# Count the frequency of each lesson and keep the result as a plain dictionary
lesson_counts = dict(Counter(lessons))

# Print the count of each lesson
pprint(lesson_counts)


['BodyPump', 'XCore', 'Running', 'BodyBalance', 'Yoga', 'BodyBalance', 'BodyBalance', 'BodyPump', 'Kickboxen', 'HIT', 'BodyPump', 'Pilates', 'HIT', 'BodyPump', 'Running', 'BodyBalance', 'XCore', 'Zumba', 'LesMiles', 'Running', 'BodyPump', 'Pilates', 'Kickboxen', 'Running', 'LesMiles', 'HIT', 'XCore', 'Kickboxen', 'Zumba', 'BodyPump', 'Spinning', 'BodyBalance', 'BodyPump', 'Pilates', 'XCore', 'Spinning', 'XCore', 'BodyPump', 'XCore', 'HIT', 'Kickboxen', 'Yoga', 'Pilates', 'Yoga', 'BodyPump', 'BodyPump', 'HIT', 'Running', 'Zumba', 'LesMiles', 'Running', 'BodyPump', 'HIT', 'Spinning', 'Kickboxen', 'BodyBalance', 'LesMiles', 'Yoga', 'Zumba', 'Running', 'LesMiles', 'Yoga', 'Pilates', 'Running', 'LesMiles', 'BodyBalance', 'Yoga', 'Kickboxen', 'BodyPump', 'BodyPump', 'BodyPump', 'LesMiles', 'HIT', 'Kickboxen', 'BodyBalance', 'Yoga', 'Spinning', 'XCore', 'Spinning', 'Pilates', 'LesMiles', 'HIT', 'Running', 'Kickboxen', 'HIT', 'Pilates', 'BodyPump', 'Running', 'Yoga', 'BodyBalance', 'BodyPump',

In [120]:
from collections import Counter

# Frequency of each lesson in the flattened `lessons` list
lesson_tally = Counter(lessons)
lessons_count = dict(lesson_tally)

# Most popular lessons first; most_common() orders by count descending and keeps
# first-seen order among ties
sorted_lessons_count = dict(lesson_tally.most_common())

# Bar chart of the lesson frequencies
lesson_names = list(sorted_lessons_count.keys())
lesson_frequencies = list(sorted_lessons_count.values())
fig = go.Figure(data=[go.Bar(
    x=lesson_names,
    y=lesson_frequencies,
    marker_color='skyblue'
)])

# Titles, axis labels and theme
fig.update_layout(
    title='Lessons Count',
    xaxis_title='Lesson',
    yaxis_title='Count of People',
    template='plotly_white'
)

# Render inline
iplot(fig)

# Save an interactive copy
fig.write_html("../over_45/vis_45/Lessons_Count.html")


### Check-in Hours

In [121]:
# Extract the hour component of every average check-in time: values look like
# "19:31:00", so the text before the first ':' is the hour. The DataFrame loaded
# at the top of the notebook is reused; the CSV is not read a second time.
check_in_hours = (
    df['avg_time_check_in']
    .str.split(':')
    .str[0]
    .astype(int)
    .tolist()
)

# Count the occurrences of each check-in hour
check_in_hours_count = dict(Counter(check_in_hours))

# Display the count of check-in hours (pprint shows the dict sorted by hour)
pprint(check_in_hours_count)


{8: 10,
 9: 7,
 10: 11,
 11: 12,
 12: 9,
 13: 10,
 14: 11,
 15: 11,
 16: 9,
 17: 6,
 18: 13,
 19: 14,
 20: 9}


In [122]:
def plot_check_hours_scatter(column_name):
    """Plot how many members check in (or out) during each hour of the day.

    Reads ``df[column_name]`` from the notebook namespace, takes the hour part of
    each time string (the text before the first ':'), counts members per hour,
    displays a line+marker chart and saves it to
    ``../over_45/vis_45/<column_name>.html``.

    Parameters
    ----------
    column_name : str
        Name of a time column in ``df``, e.g. 'avg_time_check_in'. Its last
        underscore-separated token ('in' / 'out') is used in the chart title and
        the x-axis label.
    """
    # 'avg_time_check_in' -> 'in', 'avg_time_check_out' -> 'out'
    direction = column_name.split("_")[-1]

    # Hour of each entry, then the number of members per hour
    hours = [int(parts[0]) for parts in df[column_name].str.split(':')]
    hour_counts = dict(Counter(hours))

    # Sort by hour so the connecting line runs left to right
    sorted_hours = sorted(hour_counts.items())
    hour_values = [hour for hour, _ in sorted_hours]
    visit_values = [count for _, count in sorted_hours]

    fig = go.Figure(data=go.Scatter(
        x=hour_values,
        y=visit_values,
        mode='markers+lines',  # Combines scatter points with connecting lines
        marker=dict(
            size=12,
            color=hour_values,  # Color based on hour
            colorscale='Cividis',
            showscale=False
        ),
        line=dict(color='lightgray', width=2),
        text=[f'Hour: {hour}, Visits: {count}' for hour, count in sorted_hours],
        hoverinfo='text'
    ))

    fig.update_layout(
        # The title follows the plotted column instead of always saying "Check-out"
        title=f'Check-{direction} Hours Distribution',
        xaxis_title=f'Hour of Check-{direction}',
        yaxis_title='Number of Visits',
        template='plotly_white',
        xaxis=dict(
            tickmode='linear',
            tick0=min(hour_counts.keys()),  # first tick at the earliest observed hour
            dtick=1
        )
    )

    iplot(fig)

    # Save an interactive copy named after the plotted column
    fig.write_html(f"../over_45/vis_45/{column_name}.html")

# plot check hours scatter for check in time
plot_check_hours_scatter('avg_time_check_in')

### Check-out Hours

In [123]:
# plot check hours scatter for check out time
# (reuses plot_check_hours_scatter; the chart is also written to
# ../over_45/vis_45/avg_time_check_out.html by that function)

plot_check_hours_scatter('avg_time_check_out')

### Average Time in Gym Distribution

In [124]:
# Prepare the data for the density plot. create_distplot takes a list of datasets
# and a matching list of labels, hence the single-element lists.
hist_data = [df['avg_time_in_gym']]  # Average session length per member

# Set the label for the data group
group_labels = ['Time in Gym Distribution']

# Create the distribution plot (KDE) using Plotly
fig_kde = ff.create_distplot(
    hist_data,  # Data to plot
    group_labels,  # Labels for the data
    show_hist=False,  # Hide the histogram bars
    show_curve=True,  # Show the kernel density estimation (KDE) curve
    colors=['skyblue']  # Color of the curve
)

# Customize the layout of the plot.
# 'avg_time_in_gym' is recorded in minutes (values run from 30 to 180 and equal the
# check-out minus check-in difference), and a KDE curve shows density, not a head
# count, so the axes are labelled accordingly.
fig_kde.update_layout(
    title='Distribution of Average Time Spent in Gym',  # Plot title
    xaxis_title='Average Time in Gym (Minutes)',  # X-axis title
    yaxis_title='Density',  # Y-axis title
    template='plotly_white'  # Plot theme
)

# Display the plot
iplot(fig_kde)

# Save the plot as an interactive HTML file
fig_kde.write_html("../over_45/vis_45/avg_time_in_gym_Distribution.html")


In [125]:
# Set the number of bins for the histogram
nbins = 10

# Create the histogram with the requested number of bins
fig = go.Figure(data=[go.Histogram(
    x=df['avg_time_in_gym'],  # Average session length per member
    nbinsx=nbins,  # Requested number of bins
    marker_color='skyblue',  # Color of the bars
    opacity=0.7  # Semi-transparent bars
)])

# Update the layout of the plot.
# 'avg_time_in_gym' is recorded in minutes (values run from 30 to 180), so the
# x-axis is labelled in minutes.
fig.update_layout(
    title='Distribution of Average Time Spent in Gym',  # Title of the plot
    xaxis_title='Average Time in Gym (Minutes)',  # X-axis title
    yaxis_title='Number of Gym Members',  # Y-axis title
    template='plotly_white'  # Use a clean white theme for the plot
)

# Display the plot
iplot(fig)

# Save the plot as an interactive HTML file
fig.write_html("../over_45/vis_45/avg_time_in_gym_Histogram.html")


### Drink Abonoment

In [126]:
# Count members with / without a drink subscription (computed once)
drink_abo_counts = df['drink_abo'].value_counts()

# Create a Pie chart to visualize the distribution of 'drink_abo' column
fig_drink_abo = go.Figure(data=go.Pie(
    # The column is boolean; cast the labels to str ('True' / 'False') so they are
    # handled the same way as in the personal-training and sauna pie charts
    labels=drink_abo_counts.index.astype(str),
    values=drink_abo_counts.values,  # Number of members per category
    title='Distribution of Drink Abonoment',  # Title attached to the pie trace
    textinfo='label+percent+value',  # Display label, percentage, and actual value on the chart
    textposition='auto',  # Automatically position the text labels
    marker_colors=px.colors.qualitative.Pastel  # Pastel colors from Plotly's predefined color set
))

# Display the pie chart
iplot(fig_drink_abo)

# Save the pie chart as an interactive HTML file
fig_drink_abo.write_html("../over_45/vis_45/drink_abo_distribution.html")


### Favorite Drink


In [127]:
# Set the number of top favorite drinks to include
included = 10

# Most frequent 'fav_drink' entries, computed once so that the bars and their text
# labels always come from the same rows. An entry may list several drinks
# (e.g. "orange, lemon"); each distinct entry is counted as-is.
top_drinks = (
    df['fav_drink']
    .value_counts()
    .sort_values(ascending=False)
    .iloc[:included]
)

# Create a Bar chart to visualize the distribution of favorite drinks
fig_fav_drink = go.Figure(data=go.Bar(
    x=top_drinks.index,  # Top 'included' favorite drink entries
    y=top_drinks.values,  # Number of members for each entry
    marker_color='skyblue',  # Color of the bars in the chart
    text=top_drinks.values,  # Count shown on each bar (same slice as x / y)
    textposition='outside'  # Position the count labels outside the bars for clarity
))

# Update the layout of the plot (title, axis labels, and template)
fig_fav_drink.update_layout(
    title='Distribution of Favorite Drinks',  # Title of the plot
    xaxis_title='Drink',  # X-axis label
    yaxis_title='Number of Members',  # Y-axis label
    template='plotly_white'  # Use a clean white theme for the plot
)

# Display the bar chart
iplot(fig_fav_drink)

# Save the plot as an interactive HTML file
fig_fav_drink.write_html("../over_45/vis_45/fav_drink_distribution.html")

### Personal Training


In [128]:
# Count members with / without personal training (computed once)
training_counts = df['personal_training'].value_counts()

# Fixed color per category. value_counts() orders categories by frequency, so the
# colors are looked up by value rather than assigned by position; True is always
# light green and False always light coral.
training_colors = {True: 'lightgreen', False: 'lightcoral'}

# Create a Pie chart for the distribution of personal training participation
fig_personal_training = go.Figure(data=go.Pie(
    labels=training_counts.index.astype(str),  # 'True' / 'False'
    values=training_counts.values,  # Number of members per category
    textinfo='label+percent+value',  # Show label, percentage, and actual value in the chart
    textposition='auto',  # Automatically position the text on the chart
    marker_colors=[training_colors[bool(flag)] for flag in training_counts.index]
))
fig_personal_training.update_layout(title='Personal Training Participation', )
# Display the pie chart
iplot(fig_personal_training)

# Save the pie chart as an interactive HTML file
fig_personal_training.write_html("../over_45/vis_45/personal_training_distribution.html")

### Personal Trainer Names

In [129]:
# Number of clients per personal trainer (members without a trainer are not counted)
trainer_counts = df['name_personal_trainer'].value_counts()

# One bar per trainer, with the client count printed above each bar
trainer_bars = go.Bar(
    x=trainer_counts.index,
    y=trainer_counts.values,
    text=trainer_counts.values,
    textposition='outside',
    marker_color=px.colors.qualitative.Pastel
)
fig_trainer = go.Figure(data=trainer_bars)

# Titles, axis labels and theme; tick labels are rotated to avoid overlap
trainer_layout = dict(
    title='Distribution of Personal Trainers',
    xaxis_title='Trainer Name',
    yaxis_title='Number of Clients',
    template='plotly_white',
    xaxis_tickangle=-45
)
fig_trainer.update_layout(**trainer_layout)

# Render inline
iplot(fig_trainer)

# Save an interactive copy
fig_trainer.write_html("../over_45/vis_45/personal_trainer_distribution.html")

### Sauna Usage


In [130]:
# Create a pie chart for the distribution of sauna usage
fig_sauna = go.Figure(data=go.Pie(
    labels=df['uses_sauna'].value_counts().index.astype(str),  # Extract unique values (True/False) from the 'uses_sauna' column
    values=df['uses_sauna'].value_counts().values,  # Get the count of each unique value (number of users for each category)
    title='Sauna Usage Distribution',  # Set the title of the chart
    textinfo='label+percent+value',  # Show label, percentage, and actual value on the pie slices
    textposition='auto',  # Automatically position the text on the pie slices
    marker_colors=['lightsalmon', 'lightblue']  # Set the colors for the pie slices (salmon for users, blue for non-users)
))

# Display the pie chart in an interactive format
iplot(fig_sauna)

# Save the pie chart as an interactive HTML file
fig_sauna.write_html("../over_45/vis_45/sauna_usage_distribution.html")

### Average Time in Gym by Abonoment Type and Gender

In [131]:
# Pivot table to calculate average time spent in the gym based on 'abonoment_type' and 'gender'
pivot_time_abo_gender = df.pivot_table(
    values='avg_time_in_gym',  # Calculate the mean of the 'avg_time_in_gym' column
    index='abonoment_type',  # Group by 'abonoment_type' (e.g., different membership types)
    columns='gender',  # Split the data by 'gender' (male, female)
    aggfunc='mean'  # Aggregate function to calculate the mean time for each group
)

# Display the pivot table
print("Average Time in Gym by Abonoment Type and Gender:")
print(pivot_time_abo_gender)


Average Time in Gym by Abonoment Type and Gender:
gender              Female        Male
abonoment_type                        
Premium          95.433333  101.090909
Standard        104.047619  100.189189


In [132]:
# Create pivot table
pivot_time_abo_gender = df.pivot_table(
    values='avg_time_in_gym', 
    index='abonoment_type', 
    columns='gender', 
    aggfunc='mean'
).round(1)

print("Average Time in Gym by Abonoment Type and Gender:")
print(pivot_time_abo_gender)

# Visualization 1: Grouped Bar Chart
fig_bar = go.Figure(data=[
    go.Bar(
        name='Female',
        x=pivot_time_abo_gender.index,
        y=pivot_time_abo_gender['Female'],
        text=[f'{val:.2f}' for val in pivot_time_abo_gender['Female']],
        textposition='auto'
    ),
    go.Bar(
        name='Male',
        x=pivot_time_abo_gender.index,
        y=pivot_time_abo_gender['Male'],
        text=[f'{val:.2f}' for val in pivot_time_abo_gender['Male']],
        textposition='auto'
    )
])

fig_bar.update_layout(
    title='Average Time in Gym by Abonoment Type and Gender',
    xaxis_title='Abonoment Type',
    yaxis_title='Average Time in Gym (Hours)',
    barmode='group',
    template='plotly_white'
)

# Visualization 2: Heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
    z=pivot_time_abo_gender.values,
    x=pivot_time_abo_gender.columns,
    y=pivot_time_abo_gender.index,
    colorscale='Viridis',
    text=pivot_time_abo_gender.values.round(2),
    texttemplate='%{text}',
    textfont={"size":10}
))

fig_heatmap.update_layout(
    title='Heatmap: Average Time in Gym by Abonoment Type and Gender',
    xaxis_title='Gender',
    yaxis_title='Abonoment Type',
    template='plotly_white'
)

# Visualization 3: Pie Charts for each Abonoment Type
fig_pie = make_subplots(
    rows=1, cols=2, 
    subplot_titles=['Premium Membership', 'Standard Membership'],
    specs=[[{'type':'domain'}, {'type':'domain'}]]
)

for idx, abo_type in enumerate(pivot_time_abo_gender.index):
    fig_pie.add_trace(
        go.Pie(
            labels=['Female', 'Male'],
            values=pivot_time_abo_gender.loc[abo_type].values,
            textinfo='label+percent+value',
            marker_colors=['lightpink', 'lightblue'],
            
        ),
        row=1, 
        col=idx+1,
    )

fig_pie.update_layout(
    title='Distribution of Average Gym Time by Gender for Each Membership Type',
    height=500,
    width=1000,
    template='plotly_white',
)

# Display and save visualizations
iplot(fig_bar)
fig_bar.write_html("../over_45/vis_45/avg_gym_time_by_abo_gender_bar.html")

# fig_heatmap.show()
fig_heatmap.write_html("../over_45/vis_45/avg_gym_time_by_abo_gender_heatmap.html")

iplot(fig_pie)
fig_pie.write_html("../over_45/vis_45/avg_gym_time_by_abo_gender_pie.html")

# Additional statistical summary
print("\nDescriptive Statistics:")
print(pivot_time_abo_gender.describe().round(1))

Average Time in Gym by Abonoment Type and Gender:
gender          Female   Male
abonoment_type               
Premium           95.4  101.1
Standard         104.0  100.2



Descriptive Statistics:
gender  Female   Male
count      2.0    2.0
mean      99.7  100.6
std        6.1    0.6
min       95.4  100.2
25%       97.6  100.4
50%       99.7  100.6
75%      101.8  100.9
max      104.0  101.1


### Personal Training and Sauna Usage by Gender


In [133]:
# Pivot table to calculate count of personal training usage and sauna usage by gender
pivot_training_sauna = df.pivot_table(
    values='personal_training',  # We're counting the occurrences of 'personal_training'
    index='gender',  # Group by 'gender' (e.g., male, female)
    columns='uses_sauna',  # Split the data by whether the person uses the sauna
    aggfunc='count'  # Aggregate function is 'count', which counts the number of occurrences
)

# Display the pivot table
print("\nPersonal Training and Sauna Usage by Gender:")
print(pivot_training_sauna)



Personal Training and Sauna Usage by Gender:
uses_sauna  False  True 
gender                  
Female         26     25
Male           31     50


In [134]:
# Create pivot table
pivot_training_sauna = df.pivot_table(
    values='personal_training', 
    index='gender', 
    columns='uses_sauna', 
    aggfunc='count'
)


# Percentage Stacked Bar Chart
# Calculate percentages
pivot_training_sauna_pct = pivot_training_sauna.div(pivot_training_sauna.sum(axis=1), axis=0) * 100


# Pie Charts for each gender
fig_pie = make_subplots(
    rows=1, cols=2, 
    subplot_titles=['Female Sauna Usage', 'Male Sauna Usage'],
    specs=[[{'type':'domain'}, {'type':'domain'}]]
)

for idx, gender in enumerate(['Female', 'Male']):
    gender_data = pivot_training_sauna.loc[gender]
    
    fig_pie.add_trace(
        go.Pie(
            labels=['Uses Sauna', 'Does Not Use Sauna'],
            values=gender_data.values,
            textinfo='percent+value',
            marker_colors=['lightgreen', 'lightcoral']
        ),
        row=1, 
        col=idx+1
    )

fig_pie.update_layout(
    title='Sauna Usage Distribution by Gender',
    height=500,
    width=1000,
    template='plotly_white'
)

# display plot
iplot(fig_pie)
# saving plot 
fig_pie.write_html("../over_45/vis_45/sauna_usage_by_gender_pie.html")

### Analysis of Age for gym visitors

In [135]:
# Ten oldest members: order rows by 'Age' from highest to lowest and keep the first ten
df.sort_values('Age', ascending=False).iloc[:10]

Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
34,222,Male,1975-04-08,49,Premium,3,"Fri, Sun, Thu",True,"Pilates, XCore, Spinning",20:01:00,22:34:00,153,False,,True,Mike,False,46-55
109,825,Male,1975-07-30,49,Premium,3,"Sat, Sun, Tue",True,"Zumba, BodyPump",10:53:00,13:29:00,156,False,,True,Mike,False,46-55
41,272,Male,1975-01-12,49,Premium,2,"Fri, Tue",True,Yoga,13:20:00,14:26:00,66,False,,True,Chantal,True,46-55
22,174,Male,1975-03-17,49,Premium,1,Sat,False,,11:21:00,12:46:00,85,False,,False,,True,46-55
55,444,Male,1975-02-23,49,Standard,2,"Mon, Sun",False,,18:40:00,20:46:00,126,False,,True,Hanna,True,46-55
25,186,Female,1975-03-24,49,Standard,3,"Fri, Sun, Wed",False,,17:42:00,19:00:00,78,True,passion_fruit,True,Mike,True,46-55
106,813,Male,1975-06-22,49,Standard,5,"Mon, Sat, Sun, Thu, Tue",False,,14:07:00,15:03:00,56,True,"berry_boost, orange",False,,False,46-55
52,418,Female,1975-09-25,49,Premium,3,"Mon, Sun, Wed",False,,19:37:00,21:31:00,114,True,"berry_boost, coconut_pineapple",False,,False,46-55
28,202,Male,1975-05-25,49,Premium,4,"Mon, Sat, Tue, Wed",False,,08:21:00,08:52:00,31,True,black_currant,False,,False,46-55
84,646,Male,1975-08-16,49,Premium,3,"Fri, Sun, Thu",False,,17:16:00,19:52:00,156,True,passion_fruit,False,,False,46-55


In [136]:
# Mean weekly visits among the `num_top` oldest members
num_top = 20
oldest_members = df.sort_values(by='Age', ascending=False).iloc[:num_top]
mean_visit_top = oldest_members['visit_per_week'].mean()
print(f"Average number of visits per week for top {num_top} members by age: {mean_visit_top:.2f} visit per week") 

Average number of visits per week for top 20 members by age: 2.75 visit per week


In [137]:
# Mean session length (minutes) among the `num_top` oldest members.
# Stored under its own name so the weekly-visits mean held in `mean_visit_top`
# is not overwritten by a time value.
num_top = 20
mean_time_top = df.sort_values(by='Age', ascending=False).head(num_top)['avg_time_in_gym'].mean()
print(f"Average time in gym for top {num_top} members by age: {mean_time_top:.2f} minute") 

Average time in gym for top 20 members by age: 74.15 minute


### Summary Statistics by Gender

In [138]:

# Create the 'outputs' directory if it doesn't already exist
os.makedirs('../over_45/outputs_45/', exist_ok=True)

# Descriptive statistics of 'Age' per gender, computed once and reused for both
# the saved file and the displayed table
age_stats_by_gender = df.groupby('gender')[['Age']].describe().round(1)

# Save the table as text. to_string() always renders the complete table, whereas
# str(DataFrame) follows the pandas display options and may wrap or truncate it.
with open('../over_45/outputs_45/gender_summary_stats_age.txt', 'w', encoding='utf-8') as f:
    f.write(age_stats_by_gender.to_string())

# Display the statistics in the notebook
age_stats_by_gender


Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,51.0,47.0,1.4,45.0,46.0,47.0,48.0,49.0
Male,81.0,47.1,1.5,45.0,46.0,47.0,49.0,49.0


In [139]:

# Make sure the output directory exists so this cell also works on its own
os.makedirs('../over_45/outputs_45/', exist_ok=True)

# Descriptive statistics of 'visit_per_week' per gender, computed once and reused
# for both the saved file and the displayed table
visit_stats_by_gender = df.groupby('gender')[['visit_per_week']].describe().round(1)

# Save the table as text. to_string() always renders the complete table, whereas
# str(DataFrame) follows the pandas display options and may wrap or truncate it.
with open('../over_45/outputs_45/gender_summary_stats_visit.txt', 'w', encoding='utf-8') as f:
    f.write(visit_stats_by_gender.to_string())

# Display the statistics in the notebook
visit_stats_by_gender


Unnamed: 0_level_0,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week,visit_per_week
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,51.0,2.5,1.2,1.0,2.0,2.0,3.0,5.0
Male,81.0,2.7,1.2,1.0,2.0,3.0,3.0,5.0


In [140]:
# Descriptive statistics of 'avg_time_in_gym' within each gender, one decimal place
time_stats_by_gender = df.groupby('gender')[['avg_time_in_gym']].describe().round(1)

# Save the text form of the table (the outputs folder is created by an earlier cell)
with open('../over_45/outputs_45/gender_summary_stats_time.txt', 'w') as stats_file:
    stats_file.write(str(time_stats_by_gender))

# Last expression of the cell, so the notebook renders the table
time_stats_by_gender

Unnamed: 0_level_0,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym,avg_time_in_gym
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Female,51.0,99.0,43.9,35.0,57.0,91.0,137.0,177.0
Male,81.0,100.7,45.7,30.0,57.0,93.0,138.0,180.0


In [141]:

# Summary of the membership type ('abonoment_type') per gender.
# For this text column describe() reports count / unique / top / freq.
abo_stats_by_gender = df.groupby('gender')[['abonoment_type']].describe().round(1)

# Save the text form of the table (the outputs folder is created by an earlier cell)
with open('../over_45/outputs_45/gender_summary_stats_abo.txt', 'w') as stats_file:
    stats_file.write(str(abo_stats_by_gender))

# Last expression of the cell, so the notebook renders the table
abo_stats_by_gender

Unnamed: 0_level_0,abonoment_type,abonoment_type,abonoment_type,abonoment_type
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,51,2,Premium,30
Male,81,2,Premium,44


In [142]:
# Summary of group-lesson attendance ('attend_group_lesson') per gender:
# count, number of distinct values, most frequent value and its frequency
attend_stats_by_gender = df.groupby('gender')[['attend_group_lesson']].describe().round(1)

# Save the text form of the table (the outputs folder is created by an earlier cell)
with open('../over_45/outputs_45/gender_summary_stats_attend.txt', 'w') as stats_file:
    stats_file.write(str(attend_stats_by_gender))

# Last expression of the cell, so the notebook renders the table
attend_stats_by_gender

Unnamed: 0_level_0,attend_group_lesson,attend_group_lesson,attend_group_lesson,attend_group_lesson
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,51,2,False,26
Male,81,2,False,42


In [143]:
# Summary of the drink subscription flag ('drink_abo') per gender:
# count, number of distinct values, most frequent value and its frequency
drink_stats_by_gender = df.groupby('gender')[['drink_abo']].describe().round(1)

# Save the text form of the table (the outputs folder is created by an earlier cell)
with open('../over_45/outputs_45/gender_summary_stats_drink.txt', 'w') as stats_file:
    stats_file.write(str(drink_stats_by_gender))

# Last expression of the cell, so the notebook renders the table
drink_stats_by_gender

Unnamed: 0_level_0,drink_abo,drink_abo,drink_abo,drink_abo
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,51,2,True,30
Male,81,2,True,46


In [144]:
# Summary of the personal-training flag ('personal_training') per gender:
# count, number of distinct values, most frequent value and its frequency
personal_stats_by_gender = df.groupby('gender')[['personal_training']].describe().round(1)

# Save the text form of the table (the outputs folder is created by an earlier cell)
with open('../over_45/outputs_45/gender_summary_stats_personal.txt', 'w') as stats_file:
    stats_file.write(str(personal_stats_by_gender))

# Last expression of the cell, so the notebook renders the table
personal_stats_by_gender

Unnamed: 0_level_0,personal_training,personal_training,personal_training,personal_training
Unnamed: 0_level_1,count,unique,top,freq
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,51,2,True,33
Male,81,2,True,47


In [145]:

# Number of male members assigned to each personal trainer
is_male = df['gender'] == "Male"
male_trainers_counts = df.loc[is_male, 'name_personal_trainer'].value_counts()

# Bar chart: one bar per trainer, bar height = number of male clients
bar_labels = {'x': 'Personal Trainer', 'y': 'Number of Male Clients'}
fig = px.bar(
    x=male_trainers_counts.index,
    y=male_trainers_counts.values,
    title='Number of Male Clients per Personal Trainer',
    labels=bar_labels,
)

# Tilt the trainer names and fix the canvas size
bar_layout = dict(xaxis_tickangle=-45, height=600, width=800)
fig.update_layout(**bar_layout)

# Show inline, then keep an interactive copy on disk
# (assumes ../over_45/vis_45/ already exists — TODO confirm)
iplot(fig)
fig.write_html('../over_45/vis_45/male_trainers_distribution.html')

In [146]:

# Number of female members assigned to each personal trainer
is_female = df['gender'] == "Female"
female_trainers_counts = df.loc[is_female, 'name_personal_trainer'].value_counts()

# Bar chart: one bar per trainer, bar height = number of female clients
bar_labels = {'x': 'Personal Trainer', 'y': 'Number of Female Clients'}
fig = px.bar(
    x=female_trainers_counts.index,
    y=female_trainers_counts.values,
    title='Number of Female Clients per Personal Trainer',
    labels=bar_labels,
)

# Tilt the trainer names and fix the canvas size
bar_layout = dict(xaxis_tickangle=-45, height=600, width=800)
fig.update_layout(**bar_layout)

# Show inline, then keep an interactive copy on disk
# (assumes ../over_45/vis_45/ already exists — TODO confirm)
iplot(fig)
fig.write_html('../over_45/vis_45/female_trainers_distribution.html')

In [147]:
# How many female members do / do not use the sauna
is_female = df['gender'] == "Female"
female_sauna_counts = df.loc[is_female, 'uses_sauna'].value_counts()

# Donut chart of the shares (hole=0.3 cuts out the centre of the pie)
fig = px.pie(
    names=female_sauna_counts.index,
    values=female_sauna_counts.values,
    title='Female Clients Sauna Usage',
    hole=0.3,
)

# Fix the canvas size
pie_layout = dict(height=600, width=800)
fig.update_layout(**pie_layout)

# Show inline, then keep an interactive copy on disk
# (assumes ../over_45/vis_45/ already exists — TODO confirm)
iplot(fig)
fig.write_html('../over_45/vis_45/females_using_sauna.html')

In [148]:
# How many male members do / do not use the sauna
is_male = df['gender'] == "Male"
male_sauna_counts = df.loc[is_male, 'uses_sauna'].value_counts()

# Donut chart of the shares (hole=0.3 cuts out the centre), drawn in pastel colours
fig = px.pie(
    names=male_sauna_counts.index,
    values=male_sauna_counts.values,
    title='Male Clients Sauna Usage',
    hole=0.3,
    color_discrete_sequence=px.colors.qualitative.Pastel,
)

# Fix the canvas size
pie_layout = dict(height=600, width=800)
fig.update_layout(**pie_layout)

# Show inline, then keep an interactive copy on disk
# (assumes ../over_45/vis_45/ already exists — TODO confirm)
iplot(fig)
fig.write_html('../over_45/vis_45/males_using_sauna.html')

In [149]:
def plot_lessons_count(gender):
    """Plot how often each group lesson is named as a favourite by one gender.

    Reads the notebook-level ``df``: rows whose 'gender' equals ``gender`` are
    kept, every 'fav_group_lesson' string is split on ', ', and the individual
    lesson names are tallied. The resulting bar chart (most frequent lesson
    first) is shown inline and also written to
    ``../over_45/vis_45/{gender}_Lessons_Count.html``.

    Parameters
    ----------
    gender : str
        Value to match in the 'gender' column, e.g. "Female" or "Male".

    Returns
    -------
    None
    """
    # Imported here so the function does not rely on an import made in another cell
    from collections import Counter

    favourites = df.loc[df['gender'] == gender, 'fav_group_lesson']

    lessons = []
    for lesson_row in favourites.values:
        # Entries that are not strings (missing favourites) carry no lesson names
        if not isinstance(lesson_row, str):
            continue
        lessons.extend(lesson_row.split(', '))

    # most_common() sorts by descending count and keeps first-seen order among ties
    sorted_lessons_count = dict(Counter(lessons).most_common())

    fig = go.Figure(data=[go.Bar(
        x=list(sorted_lessons_count.keys()),
        y=list(sorted_lessons_count.values()),
        marker_color='skyblue'
    )])

    fig.update_layout(
        title=f'Lessons Count by {gender}',
        xaxis_title='Lesson',
        yaxis_title=f'Count of {gender}',
        template='plotly_white'
    )

    iplot(fig)
    # assumes ../over_45/vis_45/ already exists — TODO confirm
    fig.write_html(f"../over_45/vis_45/{gender}_Lessons_Count.html")

# Lesson-popularity bar chart for the female members
plot_lessons_count(gender="Female")

In [150]:
# Lesson-popularity bar chart for the male members
plot_lessons_count(gender="Male")

### Favorite Group Lessons by Age Group

In [151]:
# Helper that turns one comma-separated lesson string into a list of names.
# NOTE(review): not called anywhere in the visible part of this notebook.
def split_group_lessons(lessons):
    """Split a comma-separated string of lessons into stripped lesson names.

    Parameters
    ----------
    lessons : str or any
        Text such as "Yoga, BodyBalance". Anything that is not a string
        (for example the NaN pandas uses for a missing favourite) is treated
        as "no lessons".

    Returns
    -------
    list of str
        The individual names with surrounding whitespace removed; an empty
        list when ``lessons`` is not a string.
    """
    # Missing values would otherwise fail with AttributeError on .split
    if not isinstance(lessons, str):
        return []
    return [lesson.strip() for lesson in lessons.split(',')]

# One row per (member, favourite lesson): split the ', '-separated string into a
# list, then explode the list so every lesson gets its own row.
df_exploded = df.assign(
    fav_group_lesson=df['fav_group_lesson'].str.split(', ')
).explode('fav_group_lesson')

# Number of rows for every (age group, lesson) combination
group_lesson_counts = (
    df_exploded
    .groupby(['age_group', 'fav_group_lesson'], observed=False)
    .size()
    .reset_index(name='count')
)

# Heatmap of the counts: lessons on the x axis, age groups on the y axis.
# px is already imported in the notebook's import cell, so it is not re-imported here.
fig_heatmap = px.density_heatmap(
    group_lesson_counts,
    x='fav_group_lesson',
    y='age_group',
    z='count',
    title='Distribution of Favorite Group Lessons Across Age Groups',
    labels={'count': 'Number of Members', 'fav_group_lesson': 'Group Lesson'},
    color_continuous_scale='Viridis'
)

fig_heatmap.update_layout(
    xaxis_title='Group Lesson',
    yaxis_title='Age Group',
    template='plotly_white',
    height=600,
    width=1000
)

iplot(fig_heatmap)

# Ten most frequently named lessons, summed over all age groups
print("Top 10 Group Lessons by Total Count:")
print(group_lesson_counts.groupby('fav_group_lesson')['count'].sum().sort_values(ascending=False).head(10))

Top 10 Group Lessons by Total Count:
fav_group_lesson
BodyPump       20
Running        13
BodyBalance    12
HIT            12
LesMiles       12
Kickboxen      11
Zumba          11
Pilates        10
XCore          10
Yoga           10
Name: count, dtype: int64


### Number of Check-ins per Hour

In [152]:
def get_people_hour_count(start_hour, data=None):
    """Count members whose average check-in time falls within one clock hour.

    Parameters
    ----------
    start_hour : int
        Hour of the day that opens the one-hour window. Values outside 0-23
        are wrapped with ``% 24``.
    data : pandas.DataFrame, optional
        Frame with an 'avg_time_check_in' column holding 'HH:MM:SS' strings or
        ``datetime.time`` objects. Defaults to the notebook-level ``df``.

    Returns
    -------
    int
        Number of rows with start_hour:00:00 <= check-in < (start_hour + 1):00:00.
        Rows with a missing check-in time are not counted.

    Notes
    -----
    The frame is left untouched: the times are parsed into a local series
    instead of overwriting the 'avg_time_check_in' column on every call.
    """
    if data is None:
        data = df

    # Ensure start_hour is within the 0-23 range
    start_hour = start_hour % 24

    raw_times = data['avg_time_check_in']
    # Render every non-missing value as text so 'HH:MM:SS' strings and
    # datetime.time objects go through the same strict parser; missing values stay missing.
    as_text = raw_times.where(raw_times.isna(), raw_times.astype(str))
    check_in = pd.to_datetime(as_text, format='%H:%M:%S')

    # Lying inside [h:00:00, (h+1):00:00) is the same as having hour == h,
    # which also covers the 23:00-23:59 window without a special case.
    return int((check_in.dt.hour == start_hour).sum())

# Check-in counts for every hour from 08:00 through 23:00, keyed by the hour label
hourly_attendance = {f"{hour:02d}:00": get_people_hour_count(hour) for hour in range(8, 24)}

hour_labels = list(hourly_attendance.keys())
people_counts = list(hourly_attendance.values())

# Axis labels and layout shared by the bar chart and the line chart.
# px is already imported in the notebook's import cell, so it is not re-imported here.
attendance_labels = {'x': 'Time', 'y': 'Number of People'}
attendance_layout = dict(
    xaxis_title='Time',
    yaxis_title='Number of People',
    template='plotly_white',
    xaxis_tickangle=-45,
    height=600,
    width=1000,
)

# Bar chart of hourly attendance (displayed only, not written to disk)
fig = px.bar(
    x=hour_labels,
    y=people_counts,
    title='Hourly Gym Attendance (8:00 AM - 11:59 PM)',
    labels=attendance_labels
)
fig.update_layout(**attendance_layout)
iplot(fig)

# Line chart of the same counts to show the trend across the day
fig_line = px.line(
    x=hour_labels,
    y=people_counts,
    title='Hourly Gym Attendance Trend (8:00 AM - 11:59 PM)',
    labels=attendance_labels
)
fig_line.update_layout(**attendance_layout)
iplot(fig_line)

# Keep an interactive copy of the trend chart
# (assumes ../over_45/vis_45/ already exists — TODO confirm)
fig_line.write_html('../over_45/vis_45/Hourly_Gym_Attendance_Trend.html')

### Members Who Spend the Most Time at the Gym

In [153]:
# Ten members with the largest 'avg_time_in_gym', longest stay first
print("Top members by average time in gym:")
by_longest_stay = df.sort_values(by='avg_time_in_gym', ascending=False)
by_longest_stay.head(10)

Top members by average time in gym:


Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
48,382,Male,1978-12-06,45,Premium,2,"Mon, Wed",True,"HIT, Spinning",14:20:00,17:20:00,180,True,black_currant,True,Hanna,True,36-45
102,791,Female,1977-06-26,47,Standard,3,"Sat, Thu, Tue",False,,16:49:00,19:46:00,177,True,"berry_boost, orange",False,,False,46-55
103,797,Male,1978-01-21,46,Standard,5,"Fri, Mon, Sat, Tue, Wed",True,"Spinning, Yoga",09:41:00,12:38:00,177,True,orange,True,Chantal,True,46-55
116,868,Female,1979-02-03,45,Standard,2,"Fri, Wed",True,"BodyBalance, Zumba",18:26:00,21:21:00,175,True,"orange, passion_fruit",True,Mike,False,36-45
79,605,Male,1979-02-18,45,Standard,3,"Mon, Sat, Tue",True,HIT,17:42:00,20:36:00,174,False,,False,,True,36-45
39,264,Male,1976-07-05,48,Standard,5,"Fri, Mon, Sat, Sun, Thu",False,,15:42:00,18:35:00,173,False,,True,Hanna,True,46-55
44,313,Male,1975-12-28,48,Standard,1,Tue,True,"BodyPump, HIT",19:29:00,22:21:00,172,True,"passion_fruit, coconut_pineapple",False,,True,46-55
125,945,Female,1975-11-29,48,Standard,2,"Thu, Wed",False,,17:14:00,20:05:00,171,False,,True,Jeffrey,True,46-55
47,372,Female,1978-10-10,46,Premium,5,"Fri, Sun, Thu, Tue, Wed",True,"Running, BodyPump",11:18:00,14:06:00,168,True,"coconut_pineapple, black_currant",True,Jeffrey,True,46-55
126,953,Female,1977-06-25,47,Standard,2,"Mon, Wed",True,"BodyPump, Zumba",18:47:00,21:35:00,168,True,"berry_boost, orange",False,,False,46-55


### Members Who Spend the Least Time at the Gym

In [154]:
# Ten members with the smallest 'avg_time_in_gym', shortest stay first
print("Least members by average time in gym:")
by_shortest_stay = df.sort_values(by='avg_time_in_gym', ascending=True)
by_shortest_stay.head(10)

Least members by average time in gym:


Unnamed: 0,id,gender,birthday,Age,abonoment_type,visit_per_week,days_per_week,attend_group_lesson,fav_group_lesson,avg_time_check_in,avg_time_check_out,avg_time_in_gym,drink_abo,fav_drink,personal_training,name_personal_trainer,uses_sauna,age_group
6,58,Male,1977-12-30,46,Premium,5,"Fri, Mon, Sat, Thu, Tue",True,"BodyBalance, BodyPump",12:19:00,12:49:00,30,False,,True,Chantal,True,46-55
28,202,Male,1975-05-25,49,Premium,4,"Mon, Sat, Tue, Wed",False,,08:21:00,08:52:00,31,True,black_currant,False,,False,46-55
91,698,Male,1975-03-21,49,Standard,3,"Fri, Mon, Tue",False,,14:52:00,15:25:00,33,True,"passion_fruit, black_currant",True,Chantal,False,46-55
31,218,Female,1978-01-12,46,Premium,3,"Sun, Thu, Wed",True,BodyPump,18:01:00,18:36:00,35,True,"coconut_pineapple, passion_fruit",False,,False,46-55
20,172,Female,1975-07-05,49,Standard,5,"Sat, Sun, Thu, Tue, Wed",True,Pilates,08:15:00,08:50:00,35,False,,False,,False,46-55
42,285,Male,1976-06-08,48,Premium,4,"Mon, Sun, Thu, Wed",False,,13:33:00,14:08:00,35,True,lemon,False,,False,46-55
23,176,Male,1976-02-24,48,Standard,4,"Fri, Thu, Tue, Wed",False,,09:34:00,10:11:00,37,False,,False,,False,46-55
92,724,Male,1977-07-15,47,Premium,2,"Mon, Tue",True,"HIT, Pilates",19:06:00,19:43:00,37,False,,True,Chantal,True,46-55
36,243,Female,1975-08-16,49,Premium,1,Sun,False,,13:58:00,14:35:00,37,True,berry_boost,True,Mike,False,46-55
30,216,Male,1975-04-11,49,Premium,3,"Fri, Mon, Wed",False,,12:37:00,13:15:00,38,True,lemon,True,Mike,False,46-55


## Correlations

In [155]:
# Names of the int64/float64 columns, with 'id' removed before the selection
features_without_id = df.drop(columns=['id'])
numerical_cols = features_without_id.select_dtypes(include=['int64', 'float64']).columns

# Pairwise correlations between those columns, rounded to two decimals
numeric_features = df[numerical_cols]
correlation_matrix = numeric_features.corr().round(2)

# Last expression of the cell, so the notebook renders the table
correlation_matrix

Unnamed: 0,Age,visit_per_week,avg_time_in_gym
Age,1.0,-0.09,-0.27
visit_per_week,-0.09,1.0,-0.1
avg_time_in_gym,-0.27,-0.1,1.0


In [156]:
# Recompute the rounded correlation table for the int64/float64 columns
# ('id' left out) so this cell does not depend on the previous one.
numerical_cols = df.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = df[numerical_cols].corr().round(2)

# Heatmap of the correlation matrix.
# zmin/zmax pin the colour range to [-1, 1] so the neutral colour of the
# diverging RdBu_r scale sits exactly at zero correlation; without them the
# range is taken from the data and the midpoint drifts away from zero.
# px is already imported in the notebook's import cell, so it is not re-imported here.
fig = px.imshow(
    correlation_matrix,
    title='Correlation Heatmap of Numerical Columns',
    color_continuous_scale='RdBu_r',  # Red-Blue diverging color scale
    zmin=-1,
    zmax=1,
    text_auto=True  # Show correlation values
)

# Square canvas
fig.update_layout(
    width=800,
    height=800
)

iplot(fig)

# Plain-text copy of the matrix below the figure
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Optional helper: list the feature pairs whose correlation is strong
def get_high_correlations(corr_matrix, threshold=0.5):
    """Return the column pairs whose absolute correlation exceeds ``threshold``.

    Only the cells below the diagonal are inspected, so each unordered pair is
    reported at most once and a column is never paired with itself.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation table; the pair names are taken from its columns.
    threshold : float, default 0.5
        Strict lower bound on ``abs(correlation)``; a value equal to the
        threshold is not reported.

    Returns
    -------
    list of tuple
        ``(later_column, earlier_column, correlation)`` triples, in row-by-row order.
    """
    names = corr_matrix.columns
    return [
        (names[row], names[col], corr_matrix.iloc[row, col])
        for row in range(len(names))
        for col in range(row)
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]

# Report every pair above the helper's default threshold of 0.5;
# the header is printed even when no pair qualifies.
high_correlations = get_high_correlations(correlation_matrix)
print("\nHighly Correlated Features (|correlation| > 0.5):")
for pair in high_correlations:
    feat1, feat2, corr_value = pair
    print(f"{feat1} - {feat2}: {corr_value:.2f}")


Correlation Matrix:
                  Age  visit_per_week  avg_time_in_gym
Age              1.00           -0.09            -0.27
visit_per_week  -0.09            1.00            -0.10
avg_time_in_gym -0.27           -0.10             1.00

Highly Correlated Features (|correlation| > 0.5):
