# Assignment 4 — Task 1: Embeddings and Projections

**Goal:** Construct property embeddings and apply dimensionality reduction to obtain 2D projections

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import json

# Report the library versions in use for this run.
numpy_version = np.__version__
pandas_version = pd.__version__
print(f"NumPy {numpy_version}")
print(f"Pandas {pandas_version}")

In [None]:
# Load the cleaned property records from the local parquet file.
df = pd.read_parquet('sf_property_data_clean.parquet')

# Filter to 2015-2023 (both bounds inclusive; rows with a missing year are dropped)
df = df[df['year'].between(2015, 2023)]

# Keep records with essential fields
required_cols = ['latitude', 'longitude', 'total_assessed_value', 'property_area',
                 'year', 'neighborhood', 'number_of_bedrooms', 'number_of_bathrooms']
# .copy() detaches the filtered rows from the frame they were selected from, so
# the column assignments below write to an independent DataFrame instead of a
# slice (the pattern that triggers pandas' SettingWithCopyWarning).
df = df.dropna(subset=required_cols).copy()

# Compute building age (fill missing year_property_built with median)
if 'year_property_built' in df.columns:
    median_year_built = df['year_property_built'].median()
    df['year_property_built'] = df['year_property_built'].fillna(median_year_built)
    # Age is measured relative to 2023 for every row (not relative to the row's
    # own `year`), then clamped to the range [0, 150].
    df['building_age'] = (2023 - df['year_property_built']).clip(0, 150)
else:
    # No construction-year column at all: fall back to a constant age of 50.
    df['building_age'] = 50

# Filter outliers using IQR method
def remove_outliers(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[col]`` and
    ``IQR = Q3 - Q1``. Rows where ``col`` is NaN fail the comparison and are
    therefore removed as well.

    Args:
        df: Input frame; it is not modified.
        col: Name of the numeric column used to compute the fences.

    Returns:
        A new, independent DataFrame (original index labels preserved) holding
        only the in-range rows. Because it is a copy rather than a slice,
        callers can add or overwrite columns on it without triggering
        ``SettingWithCopyWarning``.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    in_range = df[col].between(lower, upper)
    return df[in_range].copy()

# Apply the IQR filter column by column; the second pass computes its
# quartiles on the rows that survived the first.
for outlier_col in ('total_assessed_value', 'property_area'):
    df = remove_outliers(df, outlier_col)

# Summary of what is left after cleaning.
print(f"Records after cleaning: {len(df):,}")
print(f"Years: {df['year'].min()} - {df['year'].max()}")
print(f"Neighborhoods: {df['neighborhood'].nunique()}")

## Feature Engineering

**Numerical Features (standardized):**
- Property value and area (log-transformed), bedrooms (capped at 10), bathrooms (capped at 8), building age (clipped to 0–150 years)

**Spatial Features:**
- Latitude, longitude (normalized)
- Distance to downtown (37.7749°N, 122.4194°W)

**Temporal Features:**
- Year (normalized 2015-2023)
- COVID period encoding (pre/during/post)

**Categorical Features (one-hot encoded):**
- Neighborhood (41 districts)

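**Final dimensionality:** 12 numerical, spatial and temporal columns plus one one-hot column per neighborhood (53 columns if there are 41 neighborhoods; the exact count is printed by the feature-matrix cell below)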

In [None]:
# Numerical features with log transform
# log1p(x) = log(1 + x), so a value of 0 maps to 0 instead of -inf.
df['log_value'] = np.log1p(df['total_assessed_value'])
df['log_area'] = np.log1p(df['property_area'])
# Cap room counts so extreme values cannot dominate after scaling.
df['bedrooms_capped'] = df['number_of_bedrooms'].clip(0, 10)
df['bathrooms_capped'] = df['number_of_bathrooms'].clip(0, 8)

# Spatial features
# Distance to the downtown reference point as a great-circle (haversine)
# distance in kilometres. A plain Euclidean norm on raw degrees is not used
# because a degree of longitude spans less ground than a degree of latitude
# away from the equator, which would overstate east-west distances.
downtown_lat, downtown_lon = 37.7749, -122.4194
earth_radius_km = 6371.0  # mean Earth radius
lat_rad = np.radians(df['latitude'])
lon_rad = np.radians(df['longitude'])
downtown_lat_rad = np.radians(downtown_lat)
downtown_lon_rad = np.radians(downtown_lon)
haversine_term = (
    np.sin((lat_rad - downtown_lat_rad) / 2) ** 2
    + np.cos(lat_rad) * np.cos(downtown_lat_rad)
    * np.sin((lon_rad - downtown_lon_rad) / 2) ** 2
)
# clip guards arcsin against rounding that pushes the term just outside [0, 1]
df['distance_to_downtown'] = 2 * earth_radius_km * np.arcsin(
    np.sqrt(haversine_term.clip(0, 1))
)

# Temporal features
# Year rescaled so that 2015 -> 0.0 and 2023 -> 1.0.
df['year_normalized'] = (df['year'] - 2015) / (2023 - 2015)
# Three mutually exclusive 0/1 period indicators: <2020, 2020-2021, >=2022.
df['pre_covid'] = (df['year'] < 2020).astype(int)
df['covid_era'] = ((df['year'] >= 2020) & (df['year'] <= 2021)).astype(int)
df['post_covid'] = (df['year'] >= 2022).astype(int)

# One-hot encode neighborhood
neighborhood_dummies = pd.get_dummies(df['neighborhood'], prefix='nbhd')

# Assemble feature matrix
numerical_features = [
    'log_value', 'log_area', 'bedrooms_capped', 'bathrooms_capped', 'building_age',
    'latitude', 'longitude', 'distance_to_downtown',
    'year_normalized', 'pre_covid', 'covid_era', 'post_covid'
]

# Both pieces share df's index, so the column-wise concat aligns row for row.
X = pd.concat([df[numerical_features], neighborhood_dummies], axis=1)

print(f"Feature matrix shape: {X.shape}")
print(f"Features: {X.shape[1]}")

In [None]:
# Count NaNs across the whole feature matrix before imputing.
missing_before = X.isnull().sum().sum()
print(f"Missing values before cleaning: {missing_before}")

# Impute each column's NaNs with that column's mean.
column_means = X.mean()
X = X.fillna(column_means)

# Confirm the imputation left nothing behind.
missing_after = X.isnull().sum().sum()
print(f"Missing values after cleaning: {missing_after}")

# Standardize every column; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

overall_mean = X_scaled.mean()
overall_std = X_scaled.std()
print(f"Scaled embeddings shape: {X_scaled.shape}")
print(f"Mean: {overall_mean:.4f}, Std: {overall_std:.4f}")

## Dimensionality Reduction

### Iteration 1: PCA only (raw features)
**Issue:** Value and area dominated due to scale differences; neighborhoods didn't separate

### Iteration 2: Normalized + PCA + t-SNE
**Improvement:** Clear neighborhood clustering in t-SNE; PCA shows value gradients

### Iteration 3: Added temporal features
**Result:** COVID-era properties form more distinct patterns in both PCA and t-SNE projections

In [None]:
# PCA (global structure): project the scaled features onto the first two
# principal components.
pca = PCA(n_components=2, random_state=99)
X_pca = pca.fit_transform(X_scaled)

# Share of total variance captured by each of the two components.
variance_ratios = pca.explained_variance_ratio_
print(f"PCA variance explained: {variance_ratios}")
print(f"Total variance: {variance_ratios.sum():.2%}")

In [None]:
# t-SNE (local neighborhoods)
# t-SNE is fitted on a random subset of at most `max_tsne_points` rows. The
# sample size is capped at the number of available rows because
# choice(..., replace=False) raises ValueError when asked for more items than
# exist.
max_tsne_points = 100000
sample_size = min(max_tsne_points, len(X_scaled))
sample_idx = np.random.RandomState(999).choice(len(X_scaled), size=sample_size, replace=False)

tsne = TSNE(n_components=2, perplexity=30, random_state=999)
X_tsne_sample = tsne.fit_transform(X_scaled[sample_idx])

# Create full array with NaNs for non-sampled points, so X_tsne stays
# row-aligned with X_scaled and the rows that were not sampled are explicit.
X_tsne = np.full((len(X_scaled), 2), np.nan)
X_tsne[sample_idx] = X_tsne_sample

print(f"t-SNE computed for {sample_size:,} points")

In [None]:
# Assemble output dataframe: a positional id, the descriptive columns copied
# from df, then the two 2D projections.
passthrough_cols = [
    'neighborhood', 'year', 'total_assessed_value', 'property_area',
    'number_of_bedrooms', 'number_of_bathrooms', 'building_age',
    'latitude', 'longitude',
]

output_columns = {'property_id': range(len(df))}
# .values strips df's index so every column lines up positionally with the
# projection arrays.
for col_name in passthrough_cols:
    output_columns[col_name] = df[col_name].values
output_columns['pca_x'] = X_pca[:, 0]
output_columns['pca_y'] = X_pca[:, 1]
# tsne_* is NaN for rows that were not part of the t-SNE sample.
output_columns['tsne_x'] = X_tsne[:, 0]
output_columns['tsne_y'] = X_tsne[:, 1]

embeddings_2d = pd.DataFrame(output_columns)

print(f"Output dataframe: {embeddings_2d.shape}")
embeddings_2d.head()

In [None]:
# Write the projections twice: as a JSON list of row records for the web
# interface, and as a CSV without the index column.
json_path = 'embeddings_2d.json'
csv_path = 'embeddings_2d.csv'

embeddings_2d.to_json(json_path, orient='records')
print("Saved embeddings_2d.json")

embeddings_2d.to_csv(csv_path, index=False)
print("Saved embeddings_2d.csv")