In [19]:
# Imports

from azureml.core import Workspace, Dataset, Experiment, Run
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import base64
from io import BytesIO
import numpy as np

# Connect to workspace
ws = Workspace.from_config()
datastore = ws.get_default_datastore()

# Load dataset
data_ref = [(datastore, 'clv-data/customer_features.csv')]
dataset = Dataset.Tabular.from_delimited_files(data_ref)
customer_data = dataset.to_pandas_dataframe()

# Helper: Convert figure to base64 <img> tag
def fig_to_base64_img(fig, dpi=120):
    buf = BytesIO()
    fig.savefig(buf, format='png', dpi=dpi, bbox_inches='tight')
    buf.seek(0)
    base64_img = base64.b64encode(buf.read()).decode('utf-8')
    plt.close(fig)
    return f'<img src="data:image/png;base64,{base64_img}" style="max-width:100%; height:auto;"/>'

# 1️⃣ Cumulative Distribution of CLV (focused on 0–10000, wider layout)
fig1, ax1 = plt.subplots(figsize=(12, 5))  # Wider figure

# Filter CLV to 0–1000
filtered_clv = customer_data['CLV'][(customer_data['CLV'] >= 0) & (customer_data['CLV'] <= 1000)]
sorted_clv = np.sort(filtered_clv)
cumulative = np.arange(1, len(sorted_clv)+1) / len(sorted_clv)

# Plot
ax1.plot(sorted_clv, cumulative, color='green')
ax1.set_title("Cumulative CLV Distribution (0–1,000)", fontsize=14)
ax1.set_xlabel("CLV", fontsize=12)
ax1.set_ylabel("Cumulative Fraction of Customers", fontsize=12)
ax1.set_xlim(0, 1000)  # Focus view
ax1.grid(True)
fig1.tight_layout()
img1 = fig_to_base64_img(fig1)


# 2️⃣ Frequency vs CLV – Bubble Plot (0.5 Bins, Larger Size Diff, CLV ≤ 600)
fig2, ax2 = plt.subplots(figsize=(14, 7))

# Filter data
filtered_data = customer_data[
    (customer_data['Frequency'] >= 0) & (customer_data['Frequency'] <= 20) &
    (customer_data['CLV'] > 0) & (customer_data['CLV'] <= 600)
]

# Define 0.5-width frequency bins
bin_edges = np.arange(0, 20.5, 0.5)
filtered_data['FreqBin'] = pd.cut(filtered_data['Frequency'], bins=bin_edges, right=False)

# Ensure all bins are present
bin_labels = pd.IntervalIndex.from_breaks(bin_edges, closed='left')
grouped = (
    filtered_data.groupby('FreqBin')
    .agg(AvgCLV=('CLV', 'mean'), Count=('CustomerID', 'count'))
    .reindex(bin_labels)
    .reset_index()
)

# Midpoints for plotting
grouped['FreqMid'] = grouped['FreqBin'].apply(lambda x: x.left + 0.25)

# Use log scale for bubble size, with higher multiplier for contrast
grouped['Size'] = np.sqrt(grouped['Count'].fillna(0)) * 50  

# Bubble plot
sc = ax2.scatter(
    grouped['FreqMid'],
    grouped['AvgCLV'],
    s=grouped['Size'],
    alpha=0.75,
    color='cornflowerblue',
    edgecolor='black'
)

# Annotations: Number of customers per bubble
for _, row in grouped.iterrows():
    if row['Count'] > 0 and not np.isnan(row['AvgCLV']):
        ax2.text(
            row['FreqMid'],
            row['AvgCLV'] + 25,  # Slightly above bubble
            f"{int(row['Count'])}",
            ha='center',
            va='bottom',
            fontsize=8
        )

# Labels & layout
ax2.set_title("Avg CLV by Frequency Bin (0–600 CLV, Log Bubble Size)", fontsize=16, pad=10)
ax2.set_xlabel("Frequency Bin Midpoint", fontsize=14)
ax2.set_ylabel("Average CLV", fontsize=14)
ax2.set_xlim(0, 20)
ax2.set_ylim(0, 600)
ax2.grid(True, linestyle='--', alpha=0.5)
ax2.tick_params(labelsize=12)

fig2.tight_layout()
img2 = fig_to_base64_img(fig2)


# 3️⃣ Median CLV by Country (Horizontal Bar Plot)
fig3, ax3 = plt.subplots(figsize=(10, 10))  # Wider height for better spacing

# Filter CLV ≤ 10K and compute median per country
median_clv = customer_data[customer_data['CLV'] <= 10000] \
    .groupby('Country')['CLV'].median().sort_values(ascending=True)  # Ascending for horizontal bars

# Plot horizontal bars
ax3.barh(median_clv.index, median_clv.values, color='salmon')
ax3.set_title("Median CLV by Country (0–10K)", fontsize=14)
ax3.set_xlabel("Median CLV", fontsize=12)
ax3.set_ylabel("Country", fontsize=12)

fig3.tight_layout()
img3 = fig_to_base64_img(fig3)


# Generate HTML dashboard
with open('clv_dashboard.html', 'w') as f:
    f.write("<html><head><title>CLV Dashboard</title></head><body style='font-family:sans-serif;'>")
    f.write("<h2>Cumulative CLV Distribution</h2>" + img1)
    f.write("<h2>Frequency vs CLV (Hexbin)</h2>" + img2)
    f.write("<h2>Median CLV by Country</h2>" + img3)
    f.write("</body></html>")

# Upload to Azure ML
exp = Experiment(ws, 'clv-project')
run = exp.start_logging()
run.upload_file(name='clv_dashboard.html', path_or_stream='clv_dashboard.html')
run.complete()

print("✅ Simplified, readable CLV dashboard generated and uploaded.")


{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe'}
{'infer_column_types': 'False', 'activity': 'to_pandas_dataframe', 'activityApp': 'TabularDataset'}
✅ Simplified, readable CLV dashboard generated and uploaded.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['FreqBin'] = pd.cut(filtered_data['Frequency'], bins=bin_edges, right=False)
