In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

sns.set_style("whitegrid")

In [2]:
# Starter Code: Generate Synthetic CLV Data
np.random.seed(110)
n_customers = 500

# Generate predictor variables
avg_purchase_value = np.random.normal(75, 25, n_customers) # Avg $75, std $25
purchase_frequency = np.random.normal(5, 2, n_customers)   # Avg 5 purchases/yr, std 2
customer_tenure = np.random.uniform(0.5, 10, n_customers)  # Tenure between 0.5 and 10 years

# Ensure non-negative values where logical
avg_purchase_value = np.maximum(avg_purchase_value, 10)
purchase_frequency = np.maximum(purchase_frequency, 0.5)

# Generate CLV based on a linear relationship + noise
# CLV = Base + (Effect of AvgPurchase * Freq * Tenure) - somewhat multiplicative effect logic + noise
# Let's simplify to additive for clarity in MLR interpretation for this assignment:
# CLV = Base + Effect(AvgPurchase) + Effect(Freq) + Effect(Tenure) + Noise
clv = 150 + 10 * avg_purchase_value + 80 * purchase_frequency + 50 * customer_tenure + np.random.normal(0, 250, n_customers)
clv = np.maximum(clv, 50) # Ensure minimum CLV

# Create DataFrame
df_clv = pd.DataFrame({
    'AvgPurchaseValue': avg_purchase_value,
    'PurchaseFrequency': purchase_frequency,
    'CustomerTenure': customer_tenure,
    'CLV': clv
})

print("Synthetic Customer Data Head:")
print(df_clv.head()) # (first few rows)
print("\nData Description:")
print(df_clv.describe()) # basic statistics

Synthetic Customer Data Head:
   AvgPurchaseValue  PurchaseFrequency  CustomerTenure          CLV
0         83.214928           2.649825        2.762106  1503.484718
1         55.095036           3.438132        7.472734  1492.435762
2        110.078096           3.054568        9.820457  1367.956073
3         36.305181           3.581917        6.321292  1010.988008
4        104.168257           3.359105        8.689112  1997.209326

Data Description:
       AvgPurchaseValue  PurchaseFrequency  CustomerTenure          CLV
count        500.000000         500.000000      500.000000   500.000000
mean          75.546159           5.005602        5.229268  1561.155591
std           24.889128           1.954198        2.792919   393.090770
min           10.000000           0.500000        0.510736   401.844341
25%           58.085492           3.722779        2.783082  1281.604601
50%           76.024690           5.031070        5.120830  1571.990603
75%           93.576606           6.313

In [None]:
#2 Define features X and target Y
x_feat = df_clv[['AvgPurchaseValue', 'PurchaseFrequency', 'CustomerTenure']] # Feature DataFrame (2D)
y_feat = df_clv['CLV']           # Target Series (1D)

print("\nShape of x_feat:", x_feat.shape)
print("Shape of y_feat:", y_feat.shape)

#3 Split data into a training set (75%) and a test set (25%) random_state = 110 (Train Test Split)
x_feat_train, x_feat_test, y_feat_train, y_feat_test = train_test_split(
    x_feat, y_feat,  # The features and target variables
    test_size=0.25,    # Proportion of data to allocate to the test set
    random_state=110   
)

print("Original data size:", x_feat.shape[0])
print("Training set size (X):", x_feat_train.shape[0])
print("Test set size (X):", x_feat_test.shape[0])
print("Training set size (y):", y_feat_train.shape[0])
print("Test set size (y):", y_feat_test.shape[0]) #split into 75% and 25%




Shape of x_feat: (500, 3)
Shape of y_feat: (500,)
Original data size: 500
Training set size (X): 375
Test set size (X): 125
Training set size (y): 375
Test set size (y): 125


In [26]:
#4 Create Linear Regression Model + #5 fit the model using training data

slr_model = LinearRegression()
slr_model.fit(x_feat_train, y_feat_train) # Train ONLY on the training data (#5)

print("\nSLR Model trained successfully.")

#6 Printing intercepts and coefficients

slr_intercept = slr_model.intercept_
slr_coefficient_pv = slr_model.coef_[0]
slr_coefficient_pf = slr_model.coef_[1]
slr_coefficient_ct = slr_model.coef_[2]

print(f"Intercept B0: {slr_intercept:.2f}")
print(f"Coefficients: {slr_coefficient_pv:.2f}, {slr_coefficient_pf:.2f}, {slr_coefficient_ct:.2f}")
print(f"Model Equation: CLV ≈ {slr_intercept:.2f} + {slr_coefficient_pv:.2f}(AvgPurchaseValue) + {slr_coefficient_pf:.2f}(PurchaseFrequency) + {slr_coefficient_ct:.2f}(CustomerTenure)")


SLR Model trained successfully.
Intercept B0: 189.65
Coefficients: 9.64, 75.15, 49.80
Model Equation: CLV ≈ 189.65 + 9.64(AvgPurchaseValue) + 75.15(PurchaseFrequency) + 49.80(CustomerTenure)


# Interpretation
For each $1 increase in **average purchase value**, the CLV increases by approximately `$9.64`. For each additional **purchase per period**, CLV increases by approximately `$75.15`, and for each year the customer is **retained**, CLV increases by approximately `$49.80`.

In [28]:
y_feat_pred_test = slr_model.predict(x_feat_test)

# Calculate evaluation metrics
r2_test = r2_score(y_feat_test, y_feat_pred_test)
mse_test = mean_squared_error(y_feat_test, y_feat_pred_test) # Calculate MSE first
rmse_test = np.sqrt(mse_test) # Then take the square root

print(f"\n--- SLR Model Evaluation (Test Set) ---")
print(f"R-squared (R²): {r2_test:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse_test:.2f}")
print(f"Mean Absolute Error (MAE): {mse_test:.2f}")

# Optional: Compare with training set performance (often better)
r2_train = slr_model.score(x_feat_train, y_feat_train)
print(f"R-squared (R²) on Training Set: {r2_train:.3f}")


--- SLR Model Evaluation (Test Set) ---
R-squared (R²): 0.582
Root Mean Squared Error (RMSE): 253.95
Mean Absolute Error (MAE): 64493.09
R-squared (R²) on Training Set: 0.620


# Interpretation
**R-Squared (R²) ≈ 58.2%**: About 58.2% of the variability in Customer Lifetime Value can be explained by the linear model. For the training set 62% of the variability can be explained by the model.

**RMSE**: On average, the model's predictions are off by $253.95.

**MAE** On average, the model's predictions for CLV differ from the actual sales by $64,493

In [None]:
customer = pd.DataFrame({
    'AvgPurchaseValue': [85],
    'PurchaseFrequency': [6],
    'CustomerTenure': [4]
}) # data needs to be in brackets because pandas expects a list

predict_clv = slr_model.predict(customer)
print(f"Predicted CLV for a New Customer: {predict_clv[0]:.2f}") # results are in line with the data from the first few rows of the clv dataframe
# the data only has one 

IndexError: index 1 is out of bounds for axis 0 with size 1

Tasks:

1. Run the starter code to generate the df_clv DataFrame and briefly examine the first few rows (.head()) and basic statistics (.describe()).

2. Define your features X (should include AvgPurchaseValue, PurchaseFrequency, CustomerTenure) and your target y (CLV).

3. Split the data into a training set (75% of data) and a test set (25% of data). Use random_state=110 for reproducibility.

4. Create an instance of the LinearRegression model.

5. Train (fit) the MLR model using the training data.

6. Print the model’s intercept (β₀) and the coefficients (β) for each predictor variable. Pair the coefficients with their feature names for clarity.

7. Evaluate the model’s performance on the test set by calculating and printing the R-squared (R²) and Root Mean Squared Error (RMSE).

8. Make a prediction for the CLV of a new hypothetical customer with the following profile:

AvgPurchaseValue = $85

PurchaseFrequency = 6 purchases/year

CustomerTenure = 4 years Print the predicted CLV for this customer.