# Correlation Matrix Analysis

How are the variables correlated with each other? Can we find groups of variables that are correlated with one another?

In [48]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

In [49]:
# Read the cleaned dataset (CSV, path relative to this notebook) into `df`
df = pd.read_csv(filepath_or_buffer='../data/cleaned/cleaned_dataset_v2.csv')

In [50]:
# Build the frame used for the correlation analysis.
# Work on a copy so the loaded frame `df` is left untouched.
df_analysis = df.copy()

# Derived feature: price divided by area
price_col, area_col = df_analysis['price'], df_analysis['area']
df_analysis['price_per_area'] = price_col / area_col

# Every object-dtype column of the raw frame is treated as categorical
categorical_cols = list(df.select_dtypes(include=['object']).columns)
print(f"Categorical columns to encode: {categorical_cols}\n")

# Integer-encode each categorical column into a new '<name>_encoded' column.
# pd.factorize numbers the distinct values in order of first appearance and
# uses -1 for missing values.
# NOTE(review): these codes carry no natural ordering, and the list printed
# above includes identifier-like columns (property_id, url) - correlations
# involving *_encoded columns should be interpreted with care.
for source_col in categorical_cols:
    encoded_name = f"{source_col}_encoded"
    df_analysis[encoded_name] = pd.factorize(df_analysis[source_col])[0]

# Keep every numeric column: the original ones, price_per_area and the encodings
numerical_cols = df_analysis.select_dtypes(include=[np.number]).columns.tolist()
df_numerical = df_analysis.loc[:, numerical_cols]

print(f"\nTotal columns for correlation analysis: {len(numerical_cols)}")
print(f"Numerical columns: {numerical_cols}")

Categorical columns to encode: ['property_id', 'url', 'property_type', 'state', 'property_subtype', 'locality', 'sale_type', 'has_swimming_pool', 'has_equipped_kitchen']


Total columns for correlation analysis: 19
Numerical columns: ['price', 'rooms', 'area', 'has_open_fire', 'postal_code', 'facades_number', 'is_furnished', 'has_terrace', 'has_garden', 'price_per_area', 'property_id_encoded', 'url_encoded', 'property_type_encoded', 'state_encoded', 'property_subtype_encoded', 'locality_encoded', 'sale_type_encoded', 'has_swimming_pool_encoded', 'has_equipped_kitchen_encoded']


In [51]:
# Pairwise correlation of all numeric columns (DataFrame.corr with its defaults)
correlation_matrix = df_numerical.corr()

# Interactive heatmap: diverging palette centred on zero, each cell labelled
# with its coefficient to two decimals, drawn on a fixed 900x800 canvas.
fig = px.imshow(
    correlation_matrix,
    title='Correlation Matrix',
    color_continuous_scale='RdBu_r',
    color_continuous_midpoint=0,
    text_auto='.2f',
    aspect='auto',
).update_layout(width=900, height=800)
fig.show()

In [52]:
# Find strong correlations (|r| >= 0.5)
# Only the upper triangle (j > i) is visited, so each pair is reported once and
# the diagonal (a variable against itself) is skipped. Pairs whose coefficient
# is NaN fail the comparison and are left out.
strong_corr = []
variable_names = correlation_matrix.columns
for i in range(len(variable_names)):
    for j in range(i + 1, len(variable_names)):
        corr_value = correlation_matrix.iloc[i, j]
        if abs(corr_value) >= 0.5:
            strong_corr.append({
                'Variable 1': variable_names[i],
                'Variable 2': variable_names[j],
                'Correlation': round(corr_value, 3)
            })

# Name the columns explicitly: when no pair clears the threshold the list is
# empty, and a frame built from it would otherwise have no 'Correlation'
# column, making sort_values raise KeyError instead of yielding an empty table.
strong_corr_df = pd.DataFrame(
    strong_corr, columns=['Variable 1', 'Variable 2', 'Correlation']
).sort_values('Correlation', ascending=False, key=abs)  # strongest |r| first
print(f"\nStrong correlations (|r| >= 0.5): {len(strong_corr_df)} pairs found\n")
strong_corr_df


Strong correlations (|r| >= 0.5): 10 pairs found



Unnamed: 0,Variable 1,Variable 2,Correlation
6,property_id_encoded,url_encoded,1.0
9,property_type_encoded,property_subtype_encoded,0.956
2,rooms,area,0.729
7,property_id_encoded,locality_encoded,0.702
8,url_encoded,locality_encoded,0.702
1,price,area,0.617
3,has_open_fire,facades_number,0.612
0,price,rooms,0.592
4,has_garden,property_type_encoded,-0.591
5,has_garden,property_subtype_encoded,-0.562


## Correlation Matrix by Property Type

Split the analysis by property type (House vs. Apartment), with one correlation heatmap per type.

In [53]:
# List the strongly correlated pairs held in strong_corr_df, one per line,
# in the order the rows appear in that frame.
if strong_corr_df.shape[0] == 0:
    print("No strong correlations found (threshold |r| >= 0.5)")
else:
    print("Variables that correlate together:\n")
    for row_label, pair in strong_corr_df.iterrows():
        left, right, r = pair['Variable 1'], pair['Variable 2'], pair['Correlation']
        print(f"- {left} <-> {right}: {r}")

Variables that correlate together:

- property_id_encoded <-> url_encoded: 1.0
- property_type_encoded <-> property_subtype_encoded: 0.956
- rooms <-> area: 0.729
- property_id_encoded <-> locality_encoded: 0.702
- url_encoded <-> locality_encoded: 0.702
- price <-> area: 0.617
- has_open_fire <-> facades_number: 0.612
- price <-> rooms: 0.592
- has_garden <-> property_type_encoded: -0.591
- has_garden <-> property_subtype_encoded: -0.562


In [54]:
# Print every pair stored in strong_corr_df as "- A <-> B: r".
# NOTE(review): this prints the same listing as the cell before it.
if len(strong_corr_df) > 0:
    pair_lines = [
        f"- {var_a} <-> {var_b}: {r_value}"
        for var_a, var_b, r_value in zip(
            strong_corr_df['Variable 1'],
            strong_corr_df['Variable 2'],
            strong_corr_df['Correlation'],
        )
    ]
    print("Variables that correlate together:\n")
    print("\n".join(pair_lines))
else:
    print("No strong correlations found (threshold |r| >= 0.5)")

Variables that correlate together:

- property_id_encoded <-> url_encoded: 1.0
- property_type_encoded <-> property_subtype_encoded: 0.956
- rooms <-> area: 0.729
- property_id_encoded <-> locality_encoded: 0.702
- url_encoded <-> locality_encoded: 0.702
- price <-> area: 0.617
- has_open_fire <-> facades_number: 0.612
- price <-> rooms: 0.592
- has_garden <-> property_type_encoded: -0.591
- has_garden <-> property_subtype_encoded: -0.562


## Key Findings

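**Groups of correlated variables identified (|r| >= 0.5, all property types):**

- **Size and price:** `rooms`–`area` (0.729), `price`–`area` (0.617), `price`–`rooms` (0.592).
- **Property type:** `property_type_encoded`–`property_subtype_encoded` (0.956); `has_garden` is negatively correlated with both (-0.591 and -0.562).
- **Building features:** `has_open_fire`–`facades_number` (0.612).
- **Identifier columns:** `property_id_encoded`–`url_encoded` (1.0), and each of them with `locality_encoded` (0.702). These are label-encoded identifiers, so the values most likely reflect the order of rows in the file rather than a characteristic of the properties.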

In [55]:
# Filter for Apartments: keep only the numeric analysis columns of those rows
df_apartments = df_analysis[df_analysis['property_type'] == 'Apartment'][numerical_cols]
corr_apartments = df_apartments.corr()

# Within a single property type some columns are constant (property_type_encoded
# holds one value for every row here), so their correlations are undefined and
# come out as NaN. Plot only the variables that have at least one defined
# correlation; the complete matrix remains available in corr_apartments.
corr_apartments_plot = (
    corr_apartments
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
if corr_apartments_plot.empty:
    # Nothing is defined (e.g. no matching rows): fall back to the full matrix
    corr_apartments_plot = corr_apartments

# Create heatmap for Apartments
fig_apartments = px.imshow(corr_apartments_plot,
                           text_auto='.2f',
                           aspect='auto',
                           color_continuous_scale='RdBu_r',
                           color_continuous_midpoint=0,
                           title='Correlation Matrix - Apartments Only')

fig_apartments.update_layout(width=900, height=800)
fig_apartments.show()

In [56]:
# Filter for Houses: keep only the numeric analysis columns of those rows
df_houses = df_analysis[df_analysis['property_type'] == 'House'][numerical_cols]
corr_houses = df_houses.corr()

# Within a single property type some columns are constant (property_type_encoded
# holds one value for every row here), so their correlations are undefined and
# come out as NaN. Plot only the variables that have at least one defined
# correlation; the complete matrix remains available in corr_houses.
corr_houses_plot = (
    corr_houses
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
if corr_houses_plot.empty:
    # Nothing is defined (e.g. no matching rows): fall back to the full matrix
    corr_houses_plot = corr_houses

# Create heatmap for Houses
fig_houses = px.imshow(corr_houses_plot,
                       text_auto='.2f',
                       aspect='auto',
                       color_continuous_scale='RdBu_r',
                       color_continuous_midpoint=0,
                       title='Correlation Matrix - Houses Only')

fig_houses.update_layout(width=900, height=800)
fig_houses.show()

In [57]:
# Show how many rows of the raw frame fall under each property type
print("Unique property types:", df['property_type'].value_counts(), sep="\n")

Unique property types:
property_type
House        5488
Apartment    4716
Other         742
Name: count, dtype: int64
