# W09 — Hierarchical Clustering

**Notebook template** following the W09 assignment. Replace `NIM` and `Name` in the filename before submission.

This notebook includes:
- Data loading & preprocessing (Part A)
- EDA visualizations (Part B)
- Data preparation and silhouette experiments (Part C)
- Two hierarchical clustering models + dendrograms (Part D)
- Comparison, interpretation, and managerial insights (Part E)

---

**Dataset:** property_data_clustering_clean.csv

**Note:** This notebook is ready to run on your machine or Google Colab (internet access required to download the dataset).

In [None]:
# Part A – Data Preprocessing

import pandas as pd
import numpy as np

# Load the property dataset directly from GitHub (needs internet access).
url = 'https://raw.githubusercontent.com/NathaliaMinoque/datasets/refs/heads/main/property_data_clustering_clean.csv'
df = pd.read_csv(url)

# 1) Initial inspection: dimensions, dtypes and non-null counts.
print('Shape:', df.shape)
print('\nInfo:')
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# 2) Missing / inconsistent values
print('\nMissing values per column:')
print(df.isna().sum())

# 3) Unique values for categorical columns
# Columns that are absent from the file are skipped rather than raising KeyError.
cat_cols = ['Nama Daerah','Terjual/Belum','Arah Hadap Rumah','Posisi Rumah','Lebar Jalan Depan Rumah (ROW)']
for c in cat_cols:
    if c in df.columns:
        print(f'\nUnique values in {c}:')
        # dropna=False so that missing entries show up as their own category.
        print(df[c].value_counts(dropna=False))

# Show head
print('\nFirst 5 rows:')
print(df.head())


In [None]:
# Part B – Exploratory Data Analysis (2 visualizations)
import matplotlib.pyplot as plt
import numpy as np

# Ensure numeric cols are correct
num_cols = ['Luas Tanah (m2)','Luas Bangunan (m2)','Jumlah Kamar','Jumlah Kamar Mandi','Tingkat/Lantai','Harga Penawaran (dari Owner)']
# Only the columns that actually exist in df are used below, so the coercion loop,
# the correlation matrix and the tick labels all agree on the same column set.
present_num_cols = [c for c in num_cols if c in df.columns]
for c in present_num_cols:
    # Unparseable entries become NaN instead of raising.
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Viz 1: Correlation heatmap (using matplotlib only)
if len(present_num_cols) >= 2:
    corr = df[present_num_cols].corr()
    plt.figure(figsize=(6,5))
    plt.imshow(corr, interpolation='nearest')
    plt.title('Correlation matrix (numeric features)')
    plt.xticks(range(len(present_num_cols)), present_num_cols, rotation=45, ha='right')
    plt.yticks(range(len(present_num_cols)), present_num_cols)
    # Annotate every cell with its coefficient; (i, j) is (row, column), so the
    # text is placed at x=j, y=i.
    for (i, j), val in np.ndenumerate(corr.values):
        plt.text(j, i, f"{val:.2f}", ha='center', va='center')
    plt.colorbar()
    plt.tight_layout()
    plt.show()
else:
    print('Fewer than two numeric columns found — skipping correlation heatmap.')

# Viz 2: Boxplot of Harga Penawaran by Terjual/Belum (if present)
price_col = 'Harga Penawaran (dari Owner)'
if 'Terjual/Belum' not in df.columns:
    print('Column Terjual/Belum not found — skipping second plot.')
elif price_col not in df.columns:
    # The plot indexes the price column as well, so it needs the same guard.
    print(f'Column {price_col} not found — skipping second plot.')
else:
    # Drop rows where either the status or the price is missing.
    groups = df[['Terjual/Belum', price_col]].dropna()
    labels = groups['Terjual/Belum'].unique().tolist()
    # One array of prices per status label, in the same order as `labels`.
    data = [groups.loc[groups['Terjual/Belum']==lab, price_col].values for lab in labels]
    plt.figure(figsize=(6,4))
    plt.boxplot(data)
    # Boxplot positions are 1-based.
    plt.xticks(range(1,len(labels)+1), labels)
    plt.ylabel('Harga Penawaran (IDR)')
    plt.title('Harga Penawaran by Terjual/Belum')
    plt.tight_layout()
    plt.show()


In [None]:
# Part C – Data Preparation for Clustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from itertools import product
import numpy as np

# Select features to use for clustering (numerical + encoded categorical)
num_features = ['Luas Tanah (m2)','Luas Bangunan (m2)','Jumlah Kamar','Jumlah Kamar Mandi','Tingkat/Lantai','Harga Penawaran (dari Owner)']
cat_features = [c for c in ['Nama Daerah','Arah Hadap Rumah','Posisi Rumah','Lebar Jalan Depan Rumah (ROW)','Terjual/Belum'] if c in df.columns]

# Build preprocessing pipeline: standardise numeric columns, one-hot encode
# categorical ones, drop everything else.
# sparse_threshold=0 forces a dense output array; AgglomerativeClustering does not
# accept sparse input, and one-hot encoding could otherwise make the result sparse.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [c for c in num_features if c in df.columns]),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='drop',
    sparse_threshold=0,
)

X = preprocessor.fit_transform(df)
print('Processed feature matrix shape:', X.shape)

# Experiment with parameters and record silhouette scores
linkages = ['complete','average','single','ward']
dist_metrics = ['euclidean','manhattan']
cluster_range = [2,3,4,5]

results = []
for n_clusters, link, metric in product(cluster_range, linkages, dist_metrics):
    # Ward linkage is only defined for Euclidean distance.
    if link == 'ward' and metric != 'euclidean':
        continue
    try:
        # `metric` is the current name of this parameter; the former `affinity`
        # keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage=link, metric=metric)
        labels = model.fit_predict(X)
        sil = silhouette_score(X, labels, metric=metric)
    except Exception as e:
        # Keep the search best-effort, but report what was skipped and why
        # instead of silently producing an empty result table.
        print(f'Skipping n_clusters={n_clusters}, linkage={link}, metric={metric}: {e}')
        continue
    results.append({'n_clusters':n_clusters,'linkage':link,'metric':metric,'silhouette':sil})

# Explicit columns keep the frame sortable even when every configuration failed.
res_df = pd.DataFrame(
    results, columns=['n_clusters','linkage','metric','silhouette']
).sort_values('silhouette', ascending=False)
print('\nTop silhouette results:')
print(res_df.head())


In [None]:
# Part D – Hierarchical Clustering Modeling
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

# Choose best params from res_df (sorted by silhouette, best first).
# Model 2 below is always Ward, so Model 1 is taken from the non-Ward rows;
# otherwise a Ward winner would make both models identical and the comparison empty.
candidates = res_df[res_df['linkage'] != 'ward'] if not res_df.empty else res_df
best = candidates.iloc[0] if not candidates.empty else None
if best is not None:
    best_n = int(best['n_clusters'])
    best_link = best['linkage']
    best_metric = best['metric']
else:
    # Fallback when the parameter search produced no usable rows.
    best_n, best_link, best_metric = 3, 'average', 'euclidean'

# scipy's linkage() and AgglomerativeClustering both need a dense array.
X_dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

# scipy names the L1 distance 'cityblock'; it does not recognise 'manhattan'.
scipy_metric = {'manhattan': 'cityblock'}.get(best_metric, best_metric)

# Model 1 dendrogram
Z1 = linkage(X_dense, method=best_link, metric=scipy_metric)
plt.figure(figsize=(12,4))
# Show only the top 5 levels of the tree to keep the plot readable.
dendrogram(Z1, truncate_mode='level', p=5)
plt.title(f'Dendrogram — Model 1 ({best_link} - {best_metric})')
plt.show()

# Model 2 (Ward)
Z2 = linkage(X_dense, method='ward')
plt.figure(figsize=(12,4))
dendrogram(Z2, truncate_mode='level', p=5)
plt.title('Dendrogram — Model 2 (ward)')
plt.show()

# Fit Agglomerative models.
# `metric` replaces the `affinity` keyword (deprecated in scikit-learn 1.2,
# removed in 1.4). Both models use the same number of clusters.
model1 = AgglomerativeClustering(n_clusters=best_n, linkage=best_link, metric=best_metric)
labels1 = model1.fit_predict(X_dense)
model2 = AgglomerativeClustering(n_clusters=best_n, linkage='ward', metric='euclidean')
labels2 = model2.fit_predict(X_dense)

# Merge labels: attach each model's cluster assignment to the original rows.
df['cluster_model1'] = labels1
df['cluster_model2'] = labels2
print(df[['cluster_model1','cluster_model2']].head())


In [None]:
# Part E – Comparison & Interpretation
# Numeric features that are actually present in df; these are the columns profiled
# per cluster below.
numeric = [col for col in num_features if col in df.columns]

# Print the per-cluster mean of every numeric feature, once for each model's labels.
for heading, cluster_col in (
    ('Model 1 cluster means:', 'cluster_model1'),
    ('\nModel 2 cluster means:', 'cluster_model2'),
):
    print(heading)
    cluster_means = df.groupby(cluster_col)[numeric].mean()
    print(cluster_means)

# Managerial insights (static example text, not derived from the results above).
print('\nManagerial insights (example):')
for insight in (
    '- Use high-price cluster characteristics to craft premium listings and marketing.',
    '- Identify underperforming areas (low sale rate) to run promotions or adjust pricing.',
):
    print(insight)


## Submission

- Save this notebook as `W09 - NIM - Name.ipynb` (replace with your NIM and Name).
- Push to GitHub and submit the repository link as requested by the assignment.

---
*Notebook generated automatically. Review & adapt the analysis, interpretations, and plots before final submission.*