In [1]:
import pandas as pd

# Load the customer purchasing dataset (path is relative to the working
# directory) into the DataFrame that the rest of the notebook works on.
df = pd.read_csv('data/Customer Purchasing Behaviors.csv')

In [2]:
# Inspect column names, non-null counts, and dtypes to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             238 non-null    int64  
 1   age                 238 non-null    int64  
 2   annual_income       238 non-null    int64  
 3   purchase_amount     238 non-null    int64  
 4   loyalty_score       238 non-null    float64
 5   region              238 non-null    object 
 6   purchase_frequency  238 non-null    int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 13.1+ KB


In [3]:
# Summary statistics for the numeric columns, transposed so each variable is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,238.0,119.5,68.848868,1.0,60.25,119.5,178.75,238.0
age,238.0,38.676471,9.351118,22.0,31.0,39.0,46.75,55.0
annual_income,238.0,57407.563025,11403.875717,30000.0,50000.0,59000.0,66750.0,75000.0
purchase_amount,238.0,425.630252,140.052062,150.0,320.0,440.0,527.5,640.0
loyalty_score,238.0,6.794118,1.899047,3.0,5.5,7.0,8.275,9.5
purchase_frequency,238.0,19.798319,4.562884,10.0,17.0,20.0,23.0,28.0


In [4]:
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go

# Columns summarized in the statistics table
columns_of_interest = ['age', 'annual_income', 'purchase_amount', 'purchase_frequency']

# describe() labels the median as '50%'. Keep only the rows we need and rename
# that label in the same chain (no in-place mutation), so re-running the cell
# always rebuilds the table from scratch.
statistics = (
    df[columns_of_interest]
    .describe()
    .loc[['mean', '50%', 'std', 'min', 'max']]
    .rename(index={'50%': 'median'})
)

# First table column holds the statistic names; the remaining columns hold the
# values for each summarized variable.
fig = go.Figure(data=[go.Table(
    header=dict(values=['Statistic'] + list(statistics.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[statistics.index] + [statistics[col].values for col in statistics.columns],
               fill_color='lavender',
               align='left'))
])

# write_image does not create missing folders, so make sure the output
# directory exists before exporting (a fresh checkout has no 'figures/').
Path('figures').mkdir(parents=True, exist_ok=True)
fig.write_image('figures/statistics_table.png')

# Show the table inline as well
fig.show()

In [5]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

Histograms

In [6]:
# One histogram per numeric variable, described as (column, chart title).
histogram_specs = [
    ('age', 'Age Distribution'),
    ('annual_income', 'Annual Income Distribution'),
    ('purchase_amount', 'Purchase Amount Distribution'),
    ('purchase_frequency', 'Purchase Frequency Distribution'),
]

# Build and display the figures in the order listed above.
for column_name, chart_title in histogram_specs:
    fig = px.histogram(df, x=column_name, title=chart_title)
    fig.show()

Box Plots

In [7]:
# First and third quartiles of annual income; these are the bracket boundaries.
Q1, Q3 = df['annual_income'].quantile([0.25, 0.75])


def categorize_income(income):
    """Label an income as 'low', 'medium', or 'high' relative to Q1 and Q3.

    Incomes below Q1 are 'low'; incomes from Q1 up to (but not including) Q3
    are 'medium'; everything else is 'high'.
    """
    if income < Q1:
        return 'low'
    if income < Q3:
        return 'medium'
    return 'high'


# Store the bracket label for every customer in a new 'income_bracket' column.
df['income_bracket'] = df['annual_income'].apply(categorize_income)

# Preview the first rows to confirm the new column is present.
df.head()

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency,income_bracket
0,1,25,45000,200,4.5,North,12,low
1,2,34,55000,350,7.0,South,18,medium
2,3,45,65000,500,8.0,West,22,medium
3,4,22,30000,150,3.0,East,10,low
4,5,29,47000,220,4.8,North,13,low


In [8]:
# Age-group labelling used to segment customers
def categorize_age(age):
    """Return the age-group label for a single age.

    Ages below 35 are 'young adult'; ages from 35 up to (but not including)
    50 are 'middle age'; everything else is 'senior'.
    """
    if age < 35:
        return 'young adult'
    return 'middle age' if age < 50 else 'senior'

# Label every customer's age with categorize_age and store the result in a
# new 'age_group' column (this mutates df; later cells read this column).
df['age_group'] = df['age'].apply(categorize_age)

# Preview the first rows to confirm the new column is present.
df.head()

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency,income_bracket,age_group
0,1,25,45000,200,4.5,North,12,low,young adult
1,2,34,55000,350,7.0,South,18,medium,young adult
2,3,45,65000,500,8.0,West,22,medium,middle age
3,4,22,30000,150,3.0,East,10,low,young adult
4,5,29,47000,220,4.8,North,13,low,young adult


In [9]:
# Box plots of a purchase metric split by customer segment,
# described as (segment column, metric column, chart title).
box_plot_specs = [
    ('age_group', 'purchase_amount', 'Purchase Amount by Age Group'),
    ('income_bracket', 'purchase_frequency', 'Purchase Frequency by Income Bracket'),
]

for segment_column, metric_column, chart_title in box_plot_specs:
    fig = px.box(df, x=segment_column, y=metric_column, title=chart_title)
    fig.show()

In [10]:
# Logical display order for the ordinal segment labels. groupby() sorts its
# keys, so without an explicit order the bars would be drawn alphabetically
# ('middle age', 'senior', 'young adult' and 'high', 'low', 'medium').
age_group_order = {'age_group': ['young adult', 'middle age', 'senior']}
income_bracket_order = {'income_bracket': ['low', 'medium', 'high']}

# Mean purchase amount and purchase frequency per age group
age_group_agg = df.groupby('age_group').agg({
    'purchase_amount': 'mean',
    'purchase_frequency': 'mean'
}).reset_index()

# Mean purchase amount and purchase frequency per income bracket
income_bracket_agg = df.groupby('income_bracket').agg({
    'purchase_amount': 'mean',
    'purchase_frequency': 'mean'
}).reset_index()

# Bar charts for age_group
fig1 = px.bar(age_group_agg, x='age_group', y='purchase_amount',
              title='Average Purchase Amount by Age Group',
              category_orders=age_group_order)
fig1.show()

fig2 = px.bar(age_group_agg, x='age_group', y='purchase_frequency',
              title='Average Purchase Frequency by Age Group',
              category_orders=age_group_order)
fig2.show()

# Bar charts for income_bracket
fig3 = px.bar(income_bracket_agg, x='income_bracket', y='purchase_amount',
              title='Average Purchase Amount by Income Bracket',
              category_orders=income_bracket_order)
fig3.show()

fig4 = px.bar(income_bracket_agg, x='income_bracket', y='purchase_frequency',
              title='Average Purchase Frequency by Income Bracket',
              category_orders=income_bracket_order)
fig4.show()

In [11]:
import pandas as pd
import statsmodels.api as sm

# Pairwise correlations between the numeric customer attributes in df
numeric_columns = ['age', 'annual_income', 'purchase_amount', 'purchase_frequency']
correlation_matrix = df[numeric_columns].corr()

# Heading and matrix on separate lines
print("Correlation Matrix:", correlation_matrix, sep="\n")

Correlation Matrix:
                         age  annual_income  purchase_amount  \
age                 1.000000       0.974833         0.986140   
annual_income       0.974833       1.000000         0.984208   
purchase_amount     0.986140       0.984208         1.000000   
purchase_frequency  0.980532       0.982920         0.993579   

                    purchase_frequency  
age                           0.980532  
annual_income                 0.982920  
purchase_amount               0.993579  
purchase_frequency            1.000000  


In [12]:
import pandas as pd
import plotly.express as px

# Recompute the correlation matrix from df so this cell can run on its own
heatmap_columns = ['age', 'annual_income', 'purchase_amount', 'purchase_frequency']
correlation_matrix = df[heatmap_columns].corr()

# Heatmap settings: text_auto=True prints each coefficient inside its cell
heatmap_options = dict(
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    title='Correlation Matrix Heatmap',
)

fig = px.imshow(correlation_matrix, **heatmap_options)
fig.show()

Key Insights
Correlation Analysis:

The correlation matrix reveals strong positive correlations between age, annual_income, purchase_amount, and purchase_frequency.
Notably, purchase_amount has a very high correlation with both age (0.986) and annual_income (0.984), indicating that older age and higher income are associated with higher purchase amounts.
Purchase_frequency also shows strong correlations with age (0.981), annual_income (0.983), and purchase_amount (0.994), suggesting that these factors are closely related to how frequently purchases are made.

In [13]:
# Regression Analysis
# Design matrix: age and annual income plus the constant (intercept) column
X = sm.add_constant(df[['age', 'annual_income']])

# Response variables, one OLS model each
y_amount = df['purchase_amount']
y_frequency = df['purchase_frequency']

# Purchase amount explained by age and annual income
model_amount = sm.OLS(y_amount, X).fit()
print("\nRegression Analysis for Purchase Amount:")
print(model_amount.summary())

# Purchase frequency explained by the same predictors
model_frequency = sm.OLS(y_frequency, X).fit()
print("\nRegression Analysis for Purchase Frequency:")
print(model_frequency.summary())


Regression Analysis for Purchase Amount:
                            OLS Regression Results                            
Dep. Variable:        purchase_amount   R-squared:                       0.983
Model:                            OLS   Adj. R-squared:                  0.983
Method:                 Least Squares   F-statistic:                     6799.
Date:                Wed, 11 Sep 2024   Prob (F-statistic):          1.11e-208
Time:                        17:58:16   Log-Likelihood:                -1028.5
No. Observations:                 238   AIC:                             2063.
Df Residuals:                     235   BIC:                             2073.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
cons

Regression Analysis:

- Purchase Amount:
The regression model for purchase_amount has an R-squared value of 0.983, indicating that 98.3% of the variance in purchase_amount can be explained by age and annual_income.
Both age and annual_income are significant predictors of purchase_amount with p-values less than 0.05.
The coefficients suggest that for each additional year of age, the purchase_amount increases by 8.0462 units, and for each unit increase in annual_income, the purchase_amount increases by 0.0057 units.
- Purchase Frequency:
The regression model for purchase_frequency has an R-squared value of 0.976, indicating that 97.6% of the variance in purchase_frequency can be explained by age and annual_income.
Both age and annual_income are significant predictors of purchase_frequency with p-values less than 0.05.
The coefficients suggest that for each additional year of age, the purchase_frequency increases by 0.2194 units, and for each unit increase in annual_income, the purchase_frequency increases by 0.0002 units.

# Hypothesis Testing: Impact of Income and Age on Purchase Behavior

## 1. Introduction

This notebook explores the hypothesis that age and income significantly impact purchase behavior, specifically purchase amount and purchase frequency. We will use hypothesis testing to assess these relationships and visualize the results using Plotly.

## 2. Setup

### 2.1. Import Libraries

In [14]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.linear_model import LinearRegression


In [15]:
# Correlation of age with each purchase metric
age_values = df['age']
correlation_age_purchase_amount = age_values.corr(df['purchase_amount'])
correlation_age_purchase_frequency = age_values.corr(df['purchase_frequency'])

# Report both coefficients, one per line
print(
    f'Correlation between Age and Purchase Amount: {correlation_age_purchase_amount}',
    f'Correlation between Age and Purchase Frequency: {correlation_age_purchase_frequency}',
    sep='\n',
)

Correlation between Age and Purchase Amount: 0.9861399758870352
Correlation between Age and Purchase Frequency: 0.9805324896239003


In [16]:
# Correlation of annual income with each purchase metric
income_values = df['annual_income']
correlation_income_purchase_amount = income_values.corr(df['purchase_amount'])
correlation_income_purchase_frequency = income_values.corr(df['purchase_frequency'])

# Report both coefficients, one per line
print(
    f'Correlation between Income and Purchase Amount: {correlation_income_purchase_amount}',
    f'Correlation between Income and Purchase Frequency: {correlation_income_purchase_frequency}',
    sep='\n',
)


Correlation between Income and Purchase Amount: 0.9842084473825974
Correlation between Income and Purchase Frequency: 0.9829204016501978


In [17]:
# Predictors and response for the purchase-amount regression
X = df[['age', 'annual_income']]
y = df['purchase_amount']

# fit() returns the estimator, so construction and fitting are chained
model = LinearRegression().fit(X, y)

# One fitted slope per predictor, labelled with the predictor's column name
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
print(coefficients)

# Scatter plot of purchase amount against each predictor with an OLS trendline,
# described as (predictor column, chart title)
scatter_specs = [
    ('age', 'Purchase Amount vs Age'),
    ('annual_income', 'Purchase Amount vs Annual Income'),
]

for predictor, chart_title in scatter_specs:
    fig = px.scatter(df, x=predictor, y='purchase_amount', trendline='ols', title=chart_title)
    fig.show()


               Coefficient
age               8.046245
annual_income     0.005655


The coefficients in a linear regression model represent the relationship between each independent variable and the dependent variable. In this case, the model is predicting purchase_amount based on age and annual_income.

- age (8.046245): This coefficient indicates that for each additional year of age, the purchase_amount increases by approximately 8.046245 units, assuming annual_income remains constant.
- annual_income (0.005655): This coefficient indicates that for each additional unit of annual income, the purchase_amount increases by approximately 0.005655 units, assuming age remains constant.

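In summary, the raw coefficient for age is larger than the one for annual_income, but the two are not directly comparable because the predictors are measured on very different scales (years vs. income units). Scaled by each predictor's standard deviation (about 9.35 years and about 11,404 income units), a one-standard-deviation increase corresponds to roughly 75 units of purchase_amount for age and roughly 64 units for annual_income, so age has a somewhat larger effect. Because age and annual_income are themselves very highly correlated (0.975), the split of the effect between the two predictors should be interpreted with caution.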

In [18]:
# Prepare data for regression.
# X is rebuilt here rather than reused from the notebook namespace: X is also
# assigned in the statsmodels cell with an extra constant column, so relying on
# whichever assignment ran last would make this cell's result order-dependent.
X = df[['age', 'annual_income']]
y = df['purchase_frequency']

# Fit a fresh estimator so the cell does not depend on a `model` object
# created (and already fitted to purchase_amount) by another cell.
model = LinearRegression()
model.fit(X, y)

# One fitted slope per predictor, labelled with the predictor's column name
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefficients)

# Scatter plots of purchase frequency against each predictor with an OLS trendline
fig = px.scatter(df, x='age', y='purchase_frequency', trendline='ols', title='Purchase Frequency vs Age')
fig.show()

fig = px.scatter(df, x='annual_income', y='purchase_frequency', trendline='ols', title='Purchase Frequency vs Annual Income')
fig.show()


               Coefficient
age               0.219420
annual_income     0.000218


The coefficients in the linear regression model represent the relationship between each independent variable and the dependent variable, which in this case is purchase_frequency.

- age (0.219420): This coefficient indicates that for each additional year of age, the purchase_frequency increases by approximately 0.219420 units, assuming annual_income remains constant.
- annual_income (0.000218): This coefficient indicates that for each additional unit of annual income, the purchase_frequency increases by approximately 0.000218 units, assuming age remains constant.

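In summary, the raw coefficient for age is larger than the one for annual_income, but the two are not directly comparable because the predictors are measured on very different scales (years vs. income units). Scaled by each predictor's standard deviation (about 9.35 years and about 11,404 income units), a one-standard-deviation increase corresponds to roughly 2.1 units of purchase_frequency for age and roughly 2.5 units for annual_income, so the coefficient magnitudes alone do not show that age matters more. Because age and annual_income are themselves very highly correlated (0.975), the split of the effect between the two predictors should be interpreted with caution.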

In [19]:
# One sample of purchase amounts per age group, in order of first appearance
amount_by_age_group = [
    df.loc[df['age_group'] == group, 'purchase_amount']
    for group in df['age_group'].unique()
]
anova_age_purchase_amount = stats.f_oneway(*amount_by_age_group)
print(f'ANOVA results for Purchase Amount by Age Group: {anova_age_purchase_amount}')

# One sample of purchase amounts per income bracket, in order of first appearance
amount_by_income_bracket = [
    df.loc[df['income_bracket'] == group, 'purchase_amount']
    for group in df['income_bracket'].unique()
]
anova_income_purchase_amount = stats.f_oneway(*amount_by_income_bracket)
print(f'ANOVA results for Purchase Amount by Income Bracket: {anova_income_purchase_amount}')


ANOVA results for Purchase Amount by Age Group: F_onewayResult(statistic=575.5295743402377, pvalue=2.761318967757114e-91)
ANOVA results for Purchase Amount by Income Bracket: F_onewayResult(statistic=903.9312052119108, pvalue=4.439944437785204e-111)


The ANOVA (Analysis of Variance) results provide insights into whether there are statistically significant differences in the purchase amounts across different age groups and income brackets.

Interpretation of Results
ANOVA for Purchase Amount by Age Group:

F-statistic: 575.5295743402377
p-value: 2.761318967757114e-91
The very high F-statistic and extremely low p-value (much less than 0.05) indicate that there are significant differences in purchase amounts across different age groups. This means that age groups have a statistically significant effect on purchase amounts.

ANOVA for Purchase Amount by Income Bracket:

F-statistic: 903.9312052119108
p-value: 4.439944437785204e-111
Similarly, the very high F-statistic and extremely low p-value (much less than 0.05) indicate that there are significant differences in purchase amounts across different income brackets. This means that income brackets have a statistically significant effect on purchase amounts.

Conclusion
Both age groups and income brackets significantly influence purchase amounts, as evidenced by the ANOVA results.

In [20]:
# One sample of purchase frequencies per age group, in order of first appearance
frequency_by_age_group = [
    df.loc[df['age_group'] == group, 'purchase_frequency']
    for group in df['age_group'].unique()
]
anova_age_purchase_frequency = stats.f_oneway(*frequency_by_age_group)
print(f'ANOVA results for Purchase Frequency by Age Group: {anova_age_purchase_frequency}')

# One sample of purchase frequencies per income bracket, in order of first appearance
frequency_by_income_bracket = [
    df.loc[df['income_bracket'] == group, 'purchase_frequency']
    for group in df['income_bracket'].unique()
]
anova_income_purchase_frequency = stats.f_oneway(*frequency_by_income_bracket)
print(f'ANOVA results for Purchase Frequency by Income Bracket: {anova_income_purchase_frequency}')


ANOVA results for Purchase Frequency by Age Group: F_onewayResult(statistic=555.2526727909417, pvalue=9.045573011582664e-90)
ANOVA results for Purchase Frequency by Income Bracket: F_onewayResult(statistic=755.3150454718719, pvalue=4.689083986986747e-103)


The ANOVA results for purchase frequency by age group and income bracket indicate whether there are statistically significant differences in purchase frequency across different age groups and income brackets.

### Interpretation of Results

1. **ANOVA for Purchase Frequency by Age Group**:
   - **F-statistic**: 555.2526727909417
   - **p-value**: 9.045573011582664e-90

   The very high F-statistic and extremely low p-value (much less than 0.05) indicate that there are significant differences in purchase frequency across different age groups. This means that age groups have a statistically significant effect on purchase frequency.

2. **ANOVA for Purchase Frequency by Income Bracket**:
   - **F-statistic**: 755.3150454718719
   - **p-value**: 4.689083986986747e-103

   Similarly, the very high F-statistic and extremely low p-value (much less than 0.05) indicate that there are significant differences in purchase frequency across different income brackets. This means that income brackets have a statistically significant effect on purchase frequency.

### Conclusion
Both age groups and income brackets significantly influence purchase frequency, as evidenced by the ANOVA results.

In [21]:
# List the current columns, including the derived 'income_bracket' and 'age_group'.
df.columns

Index(['user_id', 'age', 'annual_income', 'purchase_amount', 'loyalty_score',
       'region', 'purchase_frequency', 'income_bracket', 'age_group'],
      dtype='object')

In [22]:
def calculate_avg_loyalty_by_region(df):
    """
    Compute the mean loyalty score for each region.

    Parameters:
    df (pd.DataFrame): Customer data; must contain 'region' and 'loyalty_score' columns.

    Returns:
    pd.DataFrame: One row per region, with columns 'region' and 'avg_loyalty_score'.
    """
    # Name the aggregated series first so reset_index() emits the final column names.
    mean_scores = df.groupby('region')['loyalty_score'].mean()
    return mean_scores.rename('avg_loyalty_score').reset_index()

# Example usage: compute the per-region averages for the loaded customers
# and display the resulting table as the cell output.
avg_loyalty_df = calculate_avg_loyalty_by_region(df)
avg_loyalty_df


Unnamed: 0,region,avg_loyalty_score
0,East,5.4
1,North,5.865385
2,South,6.725974
3,West,7.911688
