In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# 1. Load the dataset
# The CSV location is kept in one named constant so it is easy to spot and change.
# The path is relative, so it is resolved against the current working directory.
DATA_PATH = "../data/data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# 2. Preprocess Categorical Data
# One-hot encode Outlook, Windy and Play into dummy/indicator columns.
# drop_first=True omits the first level of each variable, so a k-level column
# becomes k-1 indicator columns. Every other column of df passes through unchanged.
#
# Cluster_Label is written onto df by the results cell further down. It is dropped
# here (ignored when absent) so that re-running this cell after clustering does not
# feed the previous cluster labels back in as an input feature.
#
# NOTE(review): Play looks like an outcome rather than a weather attribute, yet it
# is part of the feature matrix and therefore influences the clusters - confirm
# that this is intended.
df_features = df.drop(columns=['Cluster_Label'], errors='ignore')
df_encoded = pd.get_dummies(df_features, columns=['Outlook', 'Windy', 'Play'], drop_first=True)

In [4]:
# 3. Feature Scaling (Crucial for K-Means)
# K-Means works on distances, so a column with a wider numeric range would
# dominate one with a narrower range (Temperature spans 12-35, Humidity 30-95).
# Standardising every encoded column puts them on a comparable footing.
# Fit first to learn the per-column statistics, then apply them to the same data.
scaler = StandardScaler().fit(df_encoded)
X_scaled = scaler.transform(df_encoded)

In [5]:
# 4. Apply K-Means
# We'll look for 3 clusters (e.g., "Stormy," "Dry/Hot," "Mild")
#
# n_init is pinned explicitly. scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (a single initialisation when init='k-means++'), and versions
# 1.2/1.3 emit a FutureWarning when it is left out. Without pinning, the cluster
# assignment can differ depending on the installed version. Several restarts also
# make the result less sensitive to one unlucky initialisation.
# random_state fixes the seed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

In [6]:
# 5. Results
# Attach each row's cluster id to the original (un-encoded) frame so the
# assignments can be read next to the raw weather values.
df['Cluster_Label'] = clusters

# Columns shown in the report, in display order.
report_columns = ['Outlook', 'Temp_Num', 'Humidity_Num', 'Cluster_Label']
print(
    "--- Weather Data with Cluster Assignments ---",
    df.loc[:, report_columns],
    sep="\n",
)

--- Weather Data with Cluster Assignments ---
     Outlook  Temp_Num  Humidity_Num  Cluster_Label
0      Rainy        25            85              1
1      Rainy        22            90              1
2   Overcast        30            78              2
3      Sunny        35            40              0
4      Sunny        32            45              0
5      Sunny        18            50              0
6   Overcast        15            60              2
7      Rainy        20            80              1
8      Rainy        12            70              1
9      Sunny        28            30              0
10     Rainy        24            65              2
11  Overcast        26            95              2


In [7]:
# 6. Check Cluster Centers (to see what each cluster represents)
# One row per cluster. Values are in the scaled feature space the model was
# fitted on, not in the original units.
print("\nCluster Centroids (Scaled):", kmeans.cluster_centers_, sep="\n")


Cluster Centroids (Scaled):
[[ 0.65644218 -1.2159456  -0.84515425  1.41421356 -0.3380617   0.1767767 ]
 [-0.6311944   0.77604719  1.18321596 -0.70710678 -0.3380617  -0.88388348]
 [-0.02524778  0.43989841 -0.3380617  -0.70710678  0.6761234   0.70710678]]
