# Population vs House Price Analysis

This notebook analyzes the relationship between municipal population and house prices in Belgium.

In [36]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

## 1. Load Data

In [37]:
# Load population data
pop_df = pd.read_csv('../data/raw/TF_SOC_POP_STRUCT_2025.csv')

# Load property data
prop_df = pd.read_csv('../data/cleaned/clean_dataset.csv')

print(f"Population data shape: {pop_df.shape}")
print(f"Property data shape: {prop_df.shape}")

Population data shape: (466822, 21)
Property data shape: (10946, 18)


## 2. Process Population Data

Aggregate population by municipality by summing all MS_POPULATION values.

In [38]:
# Group by municipality and sum population
# Use TX_DESCR_NL (Dutch names) as primary identifier
pop_by_municipality = pop_df.groupby('TX_DESCR_NL').agg({
    'MS_POPULATION': 'sum',
    'TX_DESCR_FR': 'first',  # Keep French name for reference
    'CD_REFNIS': 'first'  # Keep municipality code
}).reset_index()

pop_by_municipality.columns = ['municipality_nl', 'total_population', 'municipality_fr', 'refnis_code']

print(f"Number of municipalities: {len(pop_by_municipality)}")
print(f"\nTop 10 municipalities by population:")
print(pop_by_municipality.nlargest(10, 'total_population')[['municipality_nl', 'total_population']])

Number of municipalities: 565

Top 10 municipalities by population:
    municipality_nl  total_population
19        Antwerpen            562002
182            Gent            272657
92        Charleroi            205763
83          Brussel            198314
323            Luik            197323
443      Schaarbeek            129775
13       Anderlecht            128724
81           Brugge            120283
362           Namen            115029
299          Leuven            104906


## 3. Process Property Data

Calculate median and mean house prices by municipality.

In [39]:
# Remove properties with missing or zero prices
prop_df_clean = prop_df[prop_df['price'].notna() & (prop_df['price'] > 0)].copy()

# Normalize municipality names (lowercase for matching)
prop_df_clean['municipality_lower'] = prop_df_clean['locality'].str.lower().str.strip()

# Group by municipality and calculate price statistics
price_by_municipality = prop_df_clean.groupby('municipality_lower').agg({
    'price': ['median', 'mean', 'count', 'std']
}).reset_index()

price_by_municipality.columns = ['municipality_lower', 'median_price', 'mean_price', 'property_count', 'price_std']

print(f"Number of municipalities with properties: {len(price_by_municipality)}")
print(f"Total properties analyzed: {price_by_municipality['property_count'].sum()}")
print(f"\nTop 10 municipalities by median price:")
print(price_by_municipality.nlargest(10, 'median_price')[['municipality_lower', 'median_price', 'property_count']])

Number of municipalities with properties: 842
Total properties analyzed: 10781

Top 10 municipalities by median price:
             municipality_lower  median_price  property_count
579  ophain-bois-seigneur-isaac     2950000.0               1
94                    boekhoute     2498000.0               1
467                   linkebeek     2290000.0               1
188                     donceel     1999999.0               2
106                    borgloon     1999000.0               1
117                      brakel     1950000.0               3
168                     de-haan     1840000.0               1
135                      celles     1750000.0               1
573                      ooigem     1750000.0               1
818            wortegem-petegem     1650000.0               2


## 4. Merge Population and Price Data

In [40]:
# Normalize population municipality names for matching
pop_by_municipality['municipality_lower'] = pop_by_municipality['municipality_nl'].str.lower().str.strip()

# Also try French names for better matching
pop_by_municipality['municipality_fr_lower'] = pop_by_municipality['municipality_fr'].str.lower().str.strip()

# Merge datasets
merged_df = price_by_municipality.merge(
    pop_by_municipality[['municipality_lower', 'total_population', 'municipality_nl']],
    on='municipality_lower',
    how='inner'
)

# Try to match unmatched municipalities using French names
unmatched = price_by_municipality[~price_by_municipality['municipality_lower'].isin(merged_df['municipality_lower'])]
additional_matches = unmatched.merge(
    pop_by_municipality[['municipality_fr_lower', 'total_population', 'municipality_nl']],
    left_on='municipality_lower',
    right_on='municipality_fr_lower',
    how='inner'
).drop('municipality_fr_lower', axis=1)

# Combine matches
merged_df = pd.concat([merged_df, additional_matches], ignore_index=True)

print(f"Successfully matched municipalities: {len(merged_df)}")
print(f"Total properties in matched municipalities: {merged_df['property_count'].sum()}")
print(f"\nSample of merged data:")
print(merged_df.head(10))

Successfully matched municipalities: 338
Total properties in matched municipalities: 5074

Sample of merged data:
  municipality_lower  median_price     mean_price  property_count  \
0              aalst      325000.0  361097.673913             184   
1             aalter      425000.0  488333.333333               3   
2           aarschot      326500.0  329250.000000               4   
3           affligem      295000.0  315000.000000               3   
4     aiseau-presles      364000.0  364000.000000               2   
5              alken      650000.0  650000.000000               1   
6               amay      210000.0  244500.000000               4   
7            andenne      307000.0  297333.333333               6   
8         anderlecht      299000.0  327607.407407              27   
9          anderlues      199999.0  199999.000000               1   

       price_std  total_population municipality_nl  
0  222237.675538             92131           Aalst  
1  136503.968196    

## 5. Statistical Analysis

In [41]:
# Calculate correlation
correlation_median = merged_df['total_population'].corr(merged_df['median_price'])
correlation_mean = merged_df['total_population'].corr(merged_df['mean_price'])

# Perform linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(
    merged_df['total_population'], 
    merged_df['median_price']
)

print("=" * 50)
print("CORRELATION ANALYSIS")
print("=" * 50)
print(f"Correlation (Population vs Median Price): {correlation_median:.4f}")
print(f"Correlation (Population vs Mean Price): {correlation_mean:.4f}")
print(f"\nLinear Regression (Population vs Median Price):")
print(f"  R-squared: {r_value**2:.4f}")
print(f"  P-value: {p_value:.6f}")
print(f"  Slope: {slope:.4f}")
print(f"  Interpretation: {'Statistically significant' if p_value < 0.05 else 'Not statistically significant'}")

# Summary statistics
print("\n" + "=" * 50)
print("SUMMARY STATISTICS")
print("=" * 50)
print(merged_df[['total_population', 'median_price', 'mean_price', 'property_count']].describe())

CORRELATION ANALYSIS
Correlation (Population vs Median Price): -0.0801
Correlation (Population vs Mean Price): 0.0125

Linear Regression (Population vs Median Price):
  R-squared: 0.0064
  P-value: 0.141849
  Slope: -0.7712
  Interpretation: Not statistically significant

SUMMARY STATISTICS
       total_population  median_price    mean_price  property_count
count        338.000000  3.380000e+02  3.380000e+02      338.000000
mean       24404.491124  4.018242e+05  4.248571e+05       15.011834
std        26862.755852  2.587246e+05  2.533887e+05       40.026547
min         1743.000000  4.000000e+04  4.000000e+04        1.000000
25%        10975.000000  2.605000e+05  2.928889e+05        1.000000
50%        16896.500000  3.489000e+05  3.709500e+05        2.500000
75%        28167.250000  4.743750e+05  4.987500e+05        5.000000
max       272657.000000  2.290000e+06  2.290000e+06      295.000000


## 6. Visualizations

In [42]:
# Simple scatter plot
fig1 = px.scatter(
    merged_df,
    x='total_population',
    y='median_price',
    title='Population vs Median House Price',
    labels={
        'total_population': 'Population',
        'median_price': 'Median Price (€)'
    },
    hover_name='municipality_nl',
    hover_data={
        'municipality_nl': False,  # Don't show in hover box since it's the name
        'total_population': ':,',  # Format with comma separators
        'median_price': ':,.0f',   # Format with comma separators, no decimals
        'property_count': True
    }
)

fig1.show()

# Simple scatter plot with log scale
fig2 = px.scatter(
    merged_df,
    x='total_population',
    y='median_price',
    title='Population vs Median House Price (Log Scale)',
    labels={
        'total_population': 'Population (log)',
        'median_price': 'Median Price (€)'
    },
    log_x=True,
    hover_name='municipality_nl',
    hover_data={
        'municipality_nl': False,  # Don't show in hover box since it's the name
        'total_population': ':,',  # Format with comma separators
        'median_price': ':,.0f',   # Format with comma separators, no decimals
        'property_count': True
    }
)

fig2.show()

In [43]:
# Population categories
merged_df['pop_category'] = pd.cut(
    merged_df['total_population'], 
    bins=[0, 10000, 50000, 100000, float('inf')],
    labels=['<10K', '10K-50K', '50K-100K', '>100K']
)

# Simple histogram of population
fig1 = px.histogram(
    merged_df,
    x='total_population',
    title='Distribution of Municipal Population',
    labels={'total_population': 'Population'}
)
fig1.show()

# Simple histogram of prices
fig2 = px.histogram(
    merged_df,
    x='median_price',
    title='Distribution of Median House Prices',
    labels={'median_price': 'Median Price (€)'}
)
fig2.show()

# Simple bar chart by category
pop_category_prices = merged_df.groupby('pop_category', observed=True)['median_price'].median()
fig3 = px.bar(
    x=pop_category_prices.index.astype(str),
    y=pop_category_prices.values,
    title='Median Price by Population Category',
    labels={'x': 'Population Category', 'y': 'Median Price (€)'}
)
fig3.show()

# Simple box plot
fig4 = px.box(
    merged_df,
    x='pop_category',
    y='median_price',
    title='Price Distribution by Population Category',
    labels={'pop_category': 'Population Category', 'median_price': 'Median Price (€)'}
)
fig4.show()

## 7. Top and Bottom Municipalities

In [44]:
# Top 10 most expensive municipalities
print("=" * 80)
print("TOP 10 MOST EXPENSIVE MUNICIPALITIES")
print("=" * 80)
top_10_expensive = merged_df.nlargest(10, 'median_price')[['municipality_nl', 'total_population', 'median_price', 'property_count']]
print(top_10_expensive.to_string(index=False))

print("\n" + "=" * 80)
print("TOP 10 LEAST EXPENSIVE MUNICIPALITIES")
print("=" * 80)
bottom_10_expensive = merged_df.nsmallest(10, 'median_price')[['municipality_nl', 'total_population', 'median_price', 'property_count']]
print(bottom_10_expensive.to_string(index=False))

print("\n" + "=" * 80)
print("TOP 10 MOST POPULOUS MUNICIPALITIES")
print("=" * 80)
top_10_populous = merged_df.nlargest(10, 'total_population')[['municipality_nl', 'total_population', 'median_price', 'property_count']]
print(top_10_populous.to_string(index=False))

TOP 10 MOST EXPENSIVE MUNICIPALITIES
 municipality_nl  total_population  median_price  property_count
       Linkebeek              4646     2290000.0               1
         Donceel              3128     1999999.0               2
          Brakel             15127     1950000.0               3
          Celles              5775     1750000.0               1
Wortegem-Petegem              6545     1650000.0               2
        Kraainem             13940     1600000.0               3
         Lebbeke             20298     1030000.0               1
       Zonnebeke             12769      995000.0               1
       Rixensart             23052      905000.0               2
          Brecht             30664      895000.0               3

TOP 10 LEAST EXPENSIVE MUNICIPALITIES
  municipality_nl  total_population  median_price  property_count
      Ecaussinnes             11622       40000.0               1
            Eupen             20053       50000.0               1
          A

## 8. Conclusions

This analysis examines the relationship between municipal population and house prices:

1. **Correlation**: The correlation coefficient indicates the strength and direction of the linear relationship
2. **Statistical Significance**: The p-value tells us if the relationship is statistically significant
3. **Price Distribution**: Shows how house prices vary across municipalities of different population sizes
4. **Regional Patterns**: Identifies which municipalities have the highest and lowest prices relative to their population