In [1]:
# Libraries used throughout the notebook for data handling and visualisation.

import pandas as pd  # tabular data loading, cleaning and aggregation
import plotly.express as px  # high-level interactive charts
import plotly.io as pio  # global Plotly settings (default template)
import plotly.graph_objects as go  # lower-level, fully customisable figures

# Make 'plotly_white' the default template so figures get a clean, minimal look.
pio.templates.default = "plotly_white"



In [None]:
# Read 'bounce-rate.csv' into a DataFrame, then preview the first five rows
# to check the column names and how the values are formatted.
data = pd.read_csv("bounce-rate.csv")
print(data.head(n=5))

      Client ID  Sessions Avg. Session Duration Bounce Rate
0  5.778476e+08       367              00:01:35      87.19%
1  1.583822e+09       260              00:01:04      29.62%
2  1.030699e+09       237              00:00:02      99.16%
3  1.025030e+09       226              00:02:22      25.66%
4  1.469968e+09       216              00:01:23      46.76%


The dataset contains the following columns:

- Client ID: A unique identifier assigned to each user or device.
- Sessions: The number of sessions recorded for each user.
- Avg. Session Duration: The average duration of a session for each user, presented in HH:MM:SS format.
- Bounce Rate: The percentage of sessions where the user leaves the website after viewing only one page.
This data represents user behavior metrics, with the Client ID showing a unique user identifier, the number of sessions, how long users stay on average per session, and their respective bounce rates.

In [4]:
# Tally the null entries in each column; a non-zero count would flag a
# column that needs cleaning before analysis.
missing_per_column = data.isna().sum()
print(missing_per_column)

Client ID                0
Sessions                 0
Avg. Session Duration    0
Bounce Rate              0
dtype: int64


This result suggests that the dataset has no missing or null values in any of the columns.

In [5]:
# Summarise the DataFrame: number of rows, non-null entries per column,
# column dtypes and memory usage.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Client ID              999 non-null    float64
 1   Sessions               999 non-null    int64  
 2   Avg. Session Duration  999 non-null    object 
 3   Bounce Rate            999 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 31.3+ KB
None



This output provides an overview of the dataset using the data.info() method in Pandas. Here's a breakdown of the information:

Class Type: The data is a Pandas DataFrame, which is a 2-dimensional labeled data structure commonly used for data manipulation in Python.

RangeIndex: The dataset contains 999 entries (rows), indexed from 0 to 998.

Data Columns: The dataset has 4 columns:

Client ID: This column contains 999 non-null entries with the data type float64, meaning it stores floating-point numbers.
Sessions: This column also has 999 non-null entries, with an int64 data type, indicating it holds integer values.
Avg. Session Duration: This column contains 999 non-null entries with the data type object. This means the values are stored as strings (text) representing time durations in HH:MM:SS format, so they must be parsed before any numerical analysis.
Bounce Rate: This column also has 999 non-null entries with the data type object, indicating it is stored as a string, likely representing percentage values.
Dtypes: The columns have mixed data types:

float64 for numerical values (Client ID).
int64 for numerical values (Sessions).
object for text-based data (Avg. Session Duration and Bounce Rate).
Memory Usage: The dataset occupies approximately 31.3 KB of memory.

This summary helps assess the structure of the dataset, including column types and any potential issues with data formatting (e.g., 'Avg. Session Duration' and 'Bounce Rate' stored as objects).

In [6]:
# Clean the 'Avg. Session Duration' and 'Bounce Rate' columns and convert them to numbers.

# Parse the HH:MM:SS strings into timedeltas. The full string is passed through unchanged:
# pd.to_timedelta accepts zero-padded hours, and slicing off the first character would
# silently corrupt any duration of ten hours or more (e.g. '10:01:35' -> '0:01:35').
data['Avg. Session Duration'] = pd.to_timedelta(data['Avg. Session Duration'])

# Express the duration as a float number of minutes for easier interpretation
data['Avg. Session Duration'] = data['Avg. Session Duration'] / pd.Timedelta(minutes=1)

# Strip the trailing '%' from 'Bounce Rate' and convert the values to floats
data['Bounce Rate'] = data['Bounce Rate'].str.rstrip('%').astype('float')

# Print the updated DataFrame to verify the changes
print(data)


        Client ID  Sessions  Avg. Session Duration  Bounce Rate
0    5.778476e+08       367               1.583333        87.19
1    1.583822e+09       260               1.066667        29.62
2    1.030699e+09       237               0.033333        99.16
3    1.025030e+09       226               2.366667        25.66
4    1.469968e+09       216               1.383333        46.76
..            ...       ...                    ...          ...
994  1.049263e+09        17               7.733333        41.18
995  1.145806e+09        17               5.616667        47.06
996  1.153811e+09        17               0.200000        94.12
997  1.182133e+09        17               1.216667        88.24
998  1.184187e+09        17               2.566667        64.71

[999 rows x 4 columns]


In [7]:
# Summary statistics for every numeric column: count of non-null entries, mean,
# standard deviation, minimum, quartiles (25% / 50% / 75%) and maximum.
# Gives a quick read on the distribution and spread of each metric.
summary_stats = data.describe()
print(summary_stats)


          Client ID    Sessions  Avg. Session Duration  Bounce Rate
count  9.990000e+02  999.000000             999.000000   999.000000
mean   1.036401e+09   32.259259               3.636520    65.307978
std    6.151503e+08   24.658588               4.040562    22.997270
min    1.849182e+05   17.000000               0.000000     4.880000
25%    4.801824e+08   21.000000               0.891667    47.370000
50%    1.029507e+09   25.000000               2.466667    66.670000
75%    1.587982e+09   35.000000               4.816667    85.190000
max    2.063338e+09  367.000000              30.666667   100.000000


Explanation of the results from data.describe():

Client ID:
- Count: 999 non-null values.
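- Mean: The average Client ID is around 1.036 billion. Note that Client ID is an identifier rather than a measurement, so this and the other Client ID statistics only describe the numeric range of the IDs and carry no analytical meaning.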
- Std: The standard deviation of about 615 million indicates significant variability in the Client ID values.
- Min: The smallest Client ID value is 184,918, which is quite low compared to the average, indicating that IDs are not sequential or numerically close.
- Max: The largest Client ID value is approximately 2.06 billion, showing the large range of identifiers.

Sessions:
- Count: 999 non-null values.
- Mean: The average number of sessions per user is 32.26.
- Std: The standard deviation of 24.66 indicates that there is considerable variability in the number of sessions among users.
- Min: The smallest number of sessions recorded for a user is 17.
- Max: The maximum number of sessions for a user is 367, showing that some users are much more engaged than others.
- Percentiles: 25% of users have 21 sessions or fewer, 50% (the median) have 25 sessions or fewer, and 75% have 35 sessions or fewer.

Avg. Session Duration:
- Count: 999 non-null values.
- Mean: The average session duration is approximately 3.64 minutes.
- Std: The standard deviation is 4.04 minutes, showing a wide spread of session lengths.
- Min: The lowest average session duration is 0 minutes, indicating that some users' sessions were extremely brief or possibly not properly logged.
- Max: The highest average session duration is about 30.67 minutes, far above the mean, suggesting some users have much longer engagement times.
- Percentiles: 25% of users have an average session duration below about 0.89 minutes, the median is 2.47 minutes, and 75% of users have an average session duration below 4.82 minutes.

Bounce Rate:
- Count: 999 non-null values.
- Mean: The average bounce rate is 65.31%, indicating that, on average, users leave the site without interacting much after viewing only one page.
- Std: The standard deviation of about 23.00 percentage points shows significant variation in bounce rates among users.
- Min: The lowest recorded bounce rate is 4.88%, meaning some users interact significantly before leaving.
- Max: The highest bounce rate is 100%, indicating some users leave after just viewing the landing page.
- Percentiles: The 25th percentile shows that 25% of users have a bounce rate lower than 47.37%, the median is 66.67%, and the 75th percentile shows that 75% of users have a bounce rate lower than 85.19%.

These statistics give a snapshot of user engagement, showing variability in session behavior and bounce rates across the dataset. The large spread in session durations and bounce rates suggests diverse user behavior.

In [8]:
# Exclude the 'Client ID' column: it is an identifier, not a measurement
data_without_id = data.drop(columns='Client ID')

# Correlate only the three original metrics. Naming them explicitly keeps this cell
# re-runnable after later cells add derived or non-numeric columns to `data`
# (segment labels, total duration), which would otherwise break or alter corr().
metric_columns = ['Sessions', 'Avg. Session Duration', 'Bounce Rate']
correlation_matrix = data_without_id[metric_columns].corr()

# Visualize the correlation matrix as a heatmap
correlation_fig = px.imshow(correlation_matrix,
                            labels=dict(x='Features',
                                        y='Features',
                                        color='Correlation'))
correlation_fig.update_layout(title='Correlation Matrix')
correlation_fig.show()

In [9]:
# Define the thresholds for high, medium, and low bounce rates (in percent)
high_bounce_rate_threshold = 70
low_bounce_rate_threshold = 30

# Segment the clients based on bounce rates using left-closed bins:
#   Low = [0, 30), Medium = [30, 70), High = [70, inf)
# The last edge is infinity rather than 100: with right=False the upper edge of a bin
# is excluded, so an edge of 100 would leave clients with a 100% bounce rate
# unassigned (NaN) and missing from the 'High' count.
data['Bounce Rate Segment'] = pd.cut(
    data['Bounce Rate'],
    bins=[0, low_bounce_rate_threshold, high_bounce_rate_threshold, float('inf')],
    labels=['Low', 'Medium', 'High'],
    right=False,
)

# Count the number of clients in each segment, ordered Low -> Medium -> High
segment_counts = data['Bounce Rate Segment'].value_counts().sort_index()

# Visualize the segments
segment_fig = px.bar(segment_counts, labels={'index': 'Bounce Rate Segment',
                                             'value': 'Number of Clients'},
                     title='Segmentation of Clients based on Bounce Rates')
segment_fig.show()

In [10]:
# Calculate the average session duration (minutes) for each bounce-rate segment.
# 'Bounce Rate Segment' is categorical; observed=False is stated explicitly so that all
# three categories are always returned in Low/Medium/High order (keeping the positional
# colour list below aligned) and so pandas does not warn about the changing default.
segment_avg_duration = (
    data.groupby('Bounce Rate Segment', observed=False)['Avg. Session Duration'].mean()
)

# Create a bar chart to compare user engagement across segments.
# Colours are positional: green = Low, yellow = Medium, red = High.
engagement_fig = go.Figure(data=go.Bar(
    x=segment_avg_duration.index,
    y=segment_avg_duration,
    text=segment_avg_duration.round(2),
    textposition='auto',
    marker=dict(color=['#2ECC40', '#FFDC00', '#FF4136'])
))

engagement_fig.update_layout(
    title='Comparison of User Engagement by Bounce Rate Segment',
    xaxis=dict(title='Bounce Rate Segment'),
    yaxis=dict(title='Average Session Duration (minutes)'),
)

engagement_fig.show()

In [11]:
# Total time on site per client = number of sessions x average session duration
data['Total Session Duration'] = data['Sessions'].mul(data['Avg. Session Duration'])

# Rank clients from the largest to the smallest total session duration
df_sorted = data.sort_values(by='Total Session Duration', ascending=False)

# Show the ten clients with the highest total session duration
df_sorted.head(n=10)

Unnamed: 0,Client ID,Sessions,Avg. Session Duration,Bounce Rate,Bounce Rate Segment,Total Session Duration
20,1884620000.0,93,30.666667,16.13,Low,2852.0
54,1041722000.0,67,20.5,22.39,Low,1373.5
262,875655700.0,34,29.966667,26.47,Low,1018.866667
10,1461865000.0,117,8.45,48.72,Medium,988.65
173,184918.2,40,24.416667,17.5,Low,976.666667
15,1049234000.0,99,9.716667,34.34,Medium,961.95
310,2026953000.0,31,22.116667,35.48,Medium,685.616667
24,1903206000.0,90,7.016667,36.67,Medium,631.5
211,2054569000.0,37,16.25,35.14,Medium,601.25
402,622093500.0,28,21.3,39.29,Medium,596.4


In [12]:
# Plot bounce rate against average session duration, with an OLS trendline
# to show the overall direction of the relationship.
scatter_fig = px.scatter(
    data,
    x='Bounce Rate',
    y='Avg. Session Duration',
    trendline='ols',
    title='Relationship between Bounce Rate and Avg. Session Duration',
)

# Set the axis titles
scatter_fig.update_layout(
    xaxis_title='Bounce Rate',
    yaxis_title='Avg. Session Duration',
)

scatter_fig.show()

In [13]:
# Define the retention segments based on number of sessions
def get_retention_segment(row, threshold=32):
    """Classify a user as frequent or occasional from their session count.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Sessions' entry.
        threshold: Minimum number of sessions for a user to count as frequent.
            Defaults to 32, approximately the dataset mean of 32.26 sessions.

    Returns:
        'Frequent Users' if row['Sessions'] >= threshold, otherwise 'Occasional Users'.
    """
    if row['Sessions'] >= threshold:
        return 'Frequent Users'
    return 'Occasional Users'

# Label every row with its retention segment, then store the labels as a new column
retention_labels = data.apply(get_retention_segment, axis=1)
data['Retention Segment'] = retention_labels

# Show the DataFrame including the new column
print(data)

        Client ID  Sessions  Avg. Session Duration  Bounce Rate  \
0    5.778476e+08       367               1.583333        87.19   
1    1.583822e+09       260               1.066667        29.62   
2    1.030699e+09       237               0.033333        99.16   
3    1.025030e+09       226               2.366667        25.66   
4    1.469968e+09       216               1.383333        46.76   
..            ...       ...                    ...          ...   
994  1.049263e+09        17               7.733333        41.18   
995  1.145806e+09        17               5.616667        47.06   
996  1.153811e+09        17               0.200000        94.12   
997  1.182133e+09        17               1.216667        88.24   
998  1.184187e+09        17               2.566667        64.71   

    Bounce Rate Segment  Total Session Duration Retention Segment  
0                  High              581.083333    Frequent Users  
1                   Low              277.333333    Frequent

In [14]:
# Mean bounce rate per retention segment, kept as a two-column DataFrame for plotting
segment_bounce_rates = data.groupby('Retention Segment', as_index=False)['Bounce Rate'].mean()

# Bar chart comparing the average bounce rate of the retention segments
bar_fig = px.bar(
    segment_bounce_rates,
    x='Retention Segment',
    y='Bounce Rate',
    labels={'Retention Segment': 'Retention Segment', 'Bounce Rate': 'Average Bounce Rate'},
    title='Average Bounce Rate by Retention Segment',
)

bar_fig.show()

In [15]:
# Number of users in each retention segment
segment_counts = data['Retention Segment'].value_counts()

# Pastel palette for the slices
colors = ['#FFB6C1', '#87CEFA']

# Pie chart of the share of users in each retention segment
fig = px.pie(
    segment_counts,
    names=segment_counts.index,
    values=segment_counts.values,
    color=segment_counts.index,
    color_discrete_sequence=colors,
    title='User Retention Rate',
)

# Print percentage and label inside each slice and hide the legend
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.update_layout(showlegend=False)
fig.show()