# Median Income vs House Prices Correlation Analysis

This notebook analyzes the relationship between each municipality's median income and its house prices.

In [40]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats

## Load Data

In [41]:
# Read both input tables in one pass: the cleaned property listings first,
# then the raw median-income table.
properties, income = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/cleaned/cleaned_dataset_v2.csv',
        '../data/raw/median_income.csv',
    )
)

# Quick sanity check on how many rows each table holds.
print(f"Properties: {len(properties)} rows")
print(f"Income data: {len(income)} municipalities")

Properties: 10946 rows
Income data: 580 municipalities


## Prepare Data

In [42]:
# Normalise municipality names (lowercase, trimmed) so the two datasets can be
# joined by name.
income['municipality_clean'] = income['municipality'].str.lower().str.strip()

# median_income is expected to be numeric already (thousands of euros, dot as
# decimal separator). Coerce it so any unparseable value becomes NaN and is
# removed by the dropna() further down.
income['median_income'] = pd.to_numeric(income['median_income'], errors='coerce')

# Collapse the income table to exactly one row per normalised name. Without
# this, two income rows that share a name after normalisation would duplicate
# that municipality in the merge, inflating the matched-municipality count,
# double-counting its properties and over-weighting it in the correlation.
# Duplicates are averaged (NaN values are ignored by mean()); when names are
# already unique this step leaves the values unchanged.
income_by_municipality = (
    income.groupby('municipality_clean', as_index=False)['median_income'].mean()
)

# Price statistics per locality: mean, median and number of listings.
properties['locality_clean'] = properties['locality'].str.lower().str.strip()
avg_prices = (
    properties.groupby('locality_clean')['price']
    .agg(['mean', 'median', 'count'])
    .reset_index()
)
avg_prices.columns = ['municipality_clean', 'avg_price', 'median_price', 'property_count']

# Inner join keeps only names present in both datasets. Both sides are unique
# on the key at this point; validate= makes that assumption explicit.
merged = avg_prices.merge(
    income_by_municipality,
    on='municipality_clean',
    how='inner',
    validate='one_to_one',
)

# Remove any rows with missing values (e.g. income that failed numeric coercion).
merged = merged.dropna()

print(f"Matched municipalities: {len(merged)}")
print(f"Total properties in matched municipalities: {merged['property_count'].sum()}")

Matched municipalities: 321
Total properties in matched municipalities: 5041


## Visualization

In [43]:
# Pearson correlation between median income and average price; shown in the title.
income_price_r = merged["median_income"].corr(merged["avg_price"])

# Human-readable axis titles keyed by column name.
axis_labels = {
    'median_income': 'Median Income (k€)',
    'avg_price': 'Average House Price (€)',
}

# Scatter of the merged table (hover shows the municipality name) with an OLS
# trendline overlaid.
fig = px.scatter(
    merged,
    x='median_income',
    y='avg_price',
    hover_name='municipality_clean',
    labels=axis_labels,
    title=f'Income vs Price Correlation (r = {income_price_r:.3f})',
    trendline='ols',
)

# Marker styling is applied to every trace in the figure, then the layout is set.
marker_style = dict(size=8, opacity=0.6, color='blue')
fig.update_traces(marker=marker_style)
fig.update_layout(height=500, template='plotly_white')
fig.show()

## Summary

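The analysis shows the correlation between median income and house prices across Belgian municipalities: the scatter plot above relates each municipality's average house price to its median income, with an OLS trendline and the Pearson correlation coefficient (r) reported in the plot title. Note that in the recorded run only 321 of the 580 municipalities in the income data could be matched by name, covering 5,041 of the 10,946 properties, so the result reflects less than half of the listings.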