# In [None]:
import pandas as pd

# Load both datasets. No index column is requested, so each frame gets the
# default positional index.
df1 = pd.read_csv("FRED-QD_2025m01.csv")
df2 = pd.read_csv("FRED-QD_2025m02.csv")

# NOTE: the concat below aligns rows on the index (i.e. on row position here),
# not on a date column. If the two files can differ in row order or length,
# align them first, e.g. by sorting both on the date column or by using that
# column as the index.
# NOTE(review): the two files presumably share column names, which would leave
# duplicate column labels in combined_df - confirm this is intended.

# Combine the datasets side-by-side (axis=1)
combined_df = pd.concat([df1, df2], axis=1)

# Drop non-numeric columns (important for ML). "number" keeps every numeric
# dtype rather than only float64/int64, so numeric columns stored with another
# width or signedness are not silently discarded.
combined_df = combined_df.select_dtypes(include="number")

# Drop every row that still contains at least one missing value
combined_df = combined_df.dropna()

print("Combined dataset shape:", combined_df.shape)

# Fail here with a clear message instead of letting a later modelling step
# trip over an empty frame.
if combined_df.empty:
    raise ValueError(
        "No usable data left after dropping non-numeric columns and rows "
        "with missing values; check the input files and their alignment."
    )

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Features only: every remaining column is used as an input, there is no
# target column.
X = combined_df

# 80/20 split; the fixed random_state makes the split reproducible
X_train, X_test = train_test_split(
    X, test_size=0.2, random_state=42
)

# Scale data (VERY important for clustering). The scaler is fitted on the
# training rows only and then reused unchanged on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# K-Means model. n_init is set explicitly because its default changed from 10
# to "auto" in scikit-learn 1.4 (with a FutureWarning in 1.2-1.3); pinning it
# keeps the number of restarts, and so the result, independent of the
# installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_train_scaled)

# Predict cluster labels for test data
test_clusters = kmeans.predict(X_test_scaled)

print("K-Means cluster labels for test set:")
print(test_clusters)
