In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load the dataset and show its first rows as a quick sanity check.
data_censored = pd.read_csv("data_censored.csv")
print(data_censored.head())

# Columns used as clustering features; work on a copy so the loaded frame is
# left untouched by the preprocessing below.
clustering_vars = ["age", "x1", "x2", "x3"]
data_for_clustering = data_censored[clustering_vars].copy()

# Standardise the features, then wrap the resulting array back into a
# DataFrame so the column names are preserved.
scaler = StandardScaler()
data_for_clustering_scaled = pd.DataFrame(
    scaler.fit_transform(data_for_clustering),
    columns=clustering_vars,
)


def calculate_avg_silhouette(k, data):
    """Return the average silhouette score of a ``k``-cluster KMeans fit.

    A KMeans model with ``k`` clusters, 25 initialisations and a fixed
    ``random_state`` of 42 is fitted on ``data``; the resulting labels are
    scored against the same ``data`` with ``silhouette_score``.

    Args:
        k: Number of clusters to fit.
        data: Feature matrix passed to ``KMeans.fit_predict`` (the caller in
            this file passes the standardised feature DataFrame).

    Returns:
        The value returned by ``silhouette_score`` for the fitted labels.
    """
    labels = KMeans(n_clusters=k, n_init=25, random_state=42).fit_predict(data)
    return silhouette_score(data, labels)

# Candidate cluster counts: 2 through 10 inclusive.
k_values = range(2, 11)

# Average silhouette score for each candidate, in the same order as k_values.
avg_silhouettes = [
    calculate_avg_silhouette(candidate_k, data_for_clustering_scaled)
    for candidate_k in k_values
]

# np.argmax returns the first position of the maximum, so a tie is resolved in
# favour of the smaller k.
best_position = np.argmax(avg_silhouettes)
optimal_k = k_values[best_position]
print("Optimal number of clusters (Silhouette):", optimal_k)

# Silhouette curve over the candidate cluster counts.
plt.figure(figsize=(8, 6))
plt.plot(k_values, avg_silhouettes, marker='o', linestyle='-')
plt.title("Silhouette Analysis for Optimal k")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Average Silhouette Width")
plt.show()

# Fit the final model with the silhouette-selected k. KMeans is seeded through
# random_state, so the global NumPy seed does not influence it; the call is
# kept only so the global RNG state seen by any later code is unchanged.
np.random.seed(42)
kmeans_result = KMeans(n_clusters=optimal_k, n_init=25, random_state=42)
data_censored['cluster'] = pd.Categorical(kmeans_result.fit_predict(data_for_clustering_scaled))

# Per-cluster means of the original (unscaled) clustering variables.
# 'cluster' is categorical, and the pandas default for `observed` differs
# between major versions, so it is passed explicitly.
cluster_summary = (
    data_censored.groupby('cluster', observed=True)[clustering_vars].mean().reset_index()
)

# Size the panel grid from the number of variables instead of assuming exactly
# four; with four variables this is the same 2x2, 12x10 figure as before.
n_cols = 2
n_rows = max(1, -(-len(clustering_vars) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(12, 5 * n_rows), squeeze=False
)
axes = axes.flatten()

# One bar chart per clustering variable: mean value within each cluster.
for ax, var in zip(axes, clustering_vars):
    ax.bar(cluster_summary['cluster'].astype(str), cluster_summary[var], color='skyblue')
    ax.set_xlabel('Cluster')
    ax.set_ylabel(f'Mean {var}')
    ax.set_title(f'Mean {var} per Cluster')

# Hide panels left over when the number of variables does not fill the grid.
for unused_ax in axes[len(clustering_vars):]:
    unused_ax.set_visible(False)

# Cluster-by-treatment row counts. Combinations with no rows are reported as 0
# regardless of the pandas version in use.
print(
    data_censored.groupby(['cluster', 'treatment'], observed=True)
    .size()
    .unstack(fill_value=0)
)

plt.tight_layout()
plt.show()

# Print descriptive statistics of the outcome per treatment value, one cluster
# at a time. Clusters are visited in the order they first appear in the data.
for cluster_id in data_censored['cluster'].unique():
    print(f"Cluster {cluster_id} stats:")
    members = data_censored.loc[data_censored['cluster'] == cluster_id]
    outcome_by_treatment = members.groupby('treatment')['outcome']
    print(outcome_by_treatment.describe())
    print()

# Descriptive statistics of the outcome for every cluster/treatment cell,
# pivoted so that each statistic has one column per treatment value.
# 'cluster' is categorical, so `observed` is passed explicitly rather than
# relying on a default that differs between pandas major versions.
outcome_summary = (
    data_censored.groupby(['cluster', 'treatment'], observed=True)['outcome']
    .describe()
    .unstack()
)

# Cells with no observations are left as NaN so that no bar is drawn for them.
# Filling them with 0 would be indistinguishable from a true mean of zero.
mean_outcome_summary = outcome_summary['mean']

fig, ax = plt.subplots(figsize=(10, 6))

# After unstacking, the index holds one entry per cluster.
cluster_labels = [str(cluster) for cluster in mean_outcome_summary.index]
treatment_0_means = mean_outcome_summary[0]
treatment_1_means = mean_outcome_summary[1]

# Two bars per cluster, offset by half a bar width on either side of the tick.
x = np.arange(len(cluster_labels))
width = 0.35

rects1 = ax.bar(x - width/2, treatment_0_means, width, label='Treatment 0', color='skyblue')
rects2 = ax.bar(x + width/2, treatment_1_means, width, label='Treatment 1', color='salmon')

ax.set_ylabel('Mean Outcome')
ax.set_xlabel('Cluster')
ax.set_title('Mean Outcome by Cluster and Treatment Group')
ax.set_xticks(x)
ax.set_xticklabels(cluster_labels)
ax.legend()

plt.tight_layout()
plt.show()

The best place to apply a clustering algorithm is after the expanded dataset has been loaded and the weights have been calculated, but before the Marginal Structural Model (MSM) is fitted. In other words, clustering should be performed after the data expansion process.

![Silhouette.png](attachment:Silhouette.png)

The value of k (the number of clusters) with the highest average silhouette score is taken as the optimal k. In the chart, the highest silhouette score occurs at k = 4.

![Cluster_summary.png](attachment:Cluster_summary.png)

The first chart shows the average age of individuals within each cluster. All clusters have a relatively high average age (around 45-50), suggesting that age alone isn't a strong differentiator between these clusters. There is no obvious separation across the groups; no cluster is clearly older or younger than the others.

For the second chart (Mean x1 per Cluster), clusters 1 and 2 have a high mean value for x1 (close to 1), while clusters 0 and 3 are essentially zero. This indicates that x1 is a very significant feature in distinguishing between these two pairs of clusters.

In the "Mean x2 per Cluster" chart, clusters 0 and 3 have more negative mean values for x2, while clusters 1 and, especially, 2 are less negative. This suggests x2 also plays a role in differentiating the clusters, potentially working in opposition to x1 in some cases.

The "Mean x3 per Cluster" chart shows that clusters 0 and 1 have high mean x3 values, while clusters 2 and, especially, 3 are close to zero. This feature helps separate clusters (0, 1) from clusters (2, 3).


![Outcome_summary.png](attachment:Outcome_summary.png)

The treatment effect is not uniform across the clusters. It appears to be negative (beneficial) in all clusters where we have data, but the magnitude of the effect varies considerably.