# In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ------------------------------------------------------------
# LOAD & CLEAN DATA
# ------------------------------------------------------------

# Load the quarterly panel; the first CSV line supplies the column names.
df = pd.read_csv("/mnt/data/FRED-QD_2025m01.csv")

# Drop metadata rows: the first two rows after the header are not observations
# (presumably FRED-QD's "factors" and "transform" rows — confirm against the file).
df = df.drop(index=[0, 1]).reset_index(drop=True)

# Convert columns: the date column becomes datetime, every other column is
# coerced to numeric so that unparseable entries turn into NaN.
df["sasdate"] = pd.to_datetime(df["sasdate"])
value_columns = df.columns[1:]  # assumes "sasdate" is the first column
df[value_columns] = df[value_columns].apply(pd.to_numeric, errors="coerce")

# ------------------------------------------------------------
# SELECT VARIABLES
# ------------------------------------------------------------

features = [
    "PCECC96",   # Consumption
    "GPDIC1",    # Investment
    "FPIx",      # Fixed investment (FRED-QD lists this as real private fixed
                 # investment, not a price index — verify against the appendix)
    "S&P 500",   # Stock Market
    "S&P div yield",
    "S&P PE ratio",
]

# Require completeness only in the date and the modelled columns. A blanket
# dropna() would discard a quarter whenever ANY series in the panel is missing,
# including series that are never used below, shrinking the sample needlessly.
df = df.dropna(subset=["sasdate", *features])

X = df[features]

# ------------------------------------------------------------
# TRAIN/TEST SPLIT (80% train, 20% test)
# ------------------------------------------------------------

# shuffle=False keeps the rows in their original order, so the test set is
# the final 20% of the rows rather than a random sample.
X_train, X_test = train_test_split(X, shuffle=False, test_size=0.2)

# ------------------------------------------------------------
# SCALE FEATURES (IMPORTANT FOR K-MEANS)
# ------------------------------------------------------------

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------------------------
# K-MEANS CLUSTERING (TRAIN ON 80%, ASSIGN 20%)
# ------------------------------------------------------------

# n_init is pinned explicitly: the library default changed between
# scikit-learn releases (10 restarts -> 'auto'), so leaving it implicit makes
# the clustering depend on the installed version and triggers a FutureWarning
# on the releases in between. Ten restarts with a fixed seed keeps the result
# reproducible and less sensitive to a poor initialisation.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_train_scaled)

# Assign clusters: training labels come from the fit itself; test rows are
# mapped to the nearest centroid learned on the training data.
train_clusters = kmeans.labels_
test_clusters = kmeans.predict(X_test_scaled)

# Attach cluster labels to copies of the unscaled feature frames so that the
# summaries are reported in the original units and X_train / X_test stay
# unmodified.
df_train = X_train.copy()
df_train["cluster"] = train_clusters

df_test = X_test.copy()
df_test["cluster"] = test_clusters

# ------------------------------------------------------------
# DISPLAY RESULTS
# ------------------------------------------------------------

# Compute every summary table first, then print them in a fixed order.
train_counts = df_train["cluster"].value_counts()
test_counts = df_test["cluster"].value_counts()
train_means = df_train.groupby("cluster")[features].mean()
test_means = df_test.groupby("cluster")[features].mean()

print("=== K-MEANS RESULTS (80% Train / 20% Test) ===\n")

# Number of observations that fall into each cluster, per partition.
print("Training set cluster distribution:")
print(train_counts, "\n")

print("Test set cluster distribution:")
print(test_counts, "\n")

# Per-cluster averages of the unscaled features, per partition.
print("Train Cluster Means:")
print(train_means, "\n")

print("Test Cluster Means:")
print(test_means)
