# Correlation Matrix Analysis

How are the variables correlated to each other? Can we find groups of variables that are correlated together?

In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

In [8]:
# Load data
df = pd.read_csv('../data/cleaned/clean_dataset.csv')

In [9]:
# Prepare data for correlation - include categorical variables
df_analysis = df.copy()

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns to encode: {categorical_cols}\n")

# Label encode categorical variables using pandas factorize
for col in categorical_cols:
    if col in df_analysis.columns:
        df_analysis[col + '_encoded'] = pd.factorize(df_analysis[col])[0]

# Select all numerical columns (original + encoded)
numerical_cols = df_analysis.select_dtypes(include=[np.number]).columns.tolist()
df_numerical = df_analysis[numerical_cols]

print(f"\nTotal columns for correlation analysis: {len(numerical_cols)}")
print(f"Numerical columns: {numerical_cols}")

Categorical columns to encode: ['property_id', 'url', 'property_type', 'state', 'property_subtype', 'locality', 'sale_type']


Total columns for correlation analysis: 18
Numerical columns: ['price', 'rooms', 'area', 'facades_number', 'is_furnished', 'has_terrace', 'has_garden', 'has_equipped_kitchen', 'has_swimming_pool', 'has_open_fire', 'postal_code', 'property_id_encoded', 'url_encoded', 'property_type_encoded', 'state_encoded', 'property_subtype_encoded', 'locality_encoded', 'sale_type_encoded']


In [10]:
# Calculate correlation matrix
correlation_matrix = df_numerical.corr()

# Create interactive heatmap
fig = px.imshow(correlation_matrix, 
                text_auto='.2f',
                aspect='auto',
                color_continuous_scale='RdBu_r',
                color_continuous_midpoint=0,
                title='Correlation Matrix')

fig.update_layout(width=900, height=800)
fig.show()

In [11]:
# Find strong correlations (|r| >= 0.5)
strong_corr = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_value = correlation_matrix.iloc[i, j]
        if abs(corr_value) >= 0.5:
            strong_corr.append({
                'Variable 1': correlation_matrix.columns[i],
                'Variable 2': correlation_matrix.columns[j],
                'Correlation': round(corr_value, 3)
            })

strong_corr_df = pd.DataFrame(strong_corr).sort_values('Correlation', ascending=False, key=abs)
print(f"\nStrong correlations (|r| >= 0.5): {len(strong_corr_df)} pairs found\n")
strong_corr_df


Strong correlations (|r| >= 0.5): 9 pairs found



Unnamed: 0,Variable 1,Variable 2,Correlation
5,property_id_encoded,url_encoded,1.0
8,property_type_encoded,property_subtype_encoded,0.956
3,rooms,area,0.729
6,property_id_encoded,locality_encoded,0.702
7,url_encoded,locality_encoded,0.702
4,facades_number,has_open_fire,0.63
1,price,area,0.617
0,price,rooms,0.592
2,price,has_swimming_pool,0.522


## Key Findings

**Groups of correlated variables identified:**

In [12]:
# Group correlated variables
if len(strong_corr_df) > 0:
    print("Variables that correlate together:\n")
    for idx, row in strong_corr_df.iterrows():
        print(f"- {row['Variable 1']} <-> {row['Variable 2']}: {row['Correlation']}")
else:
    print("No strong correlations found (threshold |r| >= 0.5)")

Variables that correlate together:

- property_id_encoded <-> url_encoded: 1.0
- property_type_encoded <-> property_subtype_encoded: 0.956
- rooms <-> area: 0.729
- property_id_encoded <-> locality_encoded: 0.702
- url_encoded <-> locality_encoded: 0.702
- facades_number <-> has_open_fire: 0.63
- price <-> area: 0.617
- price <-> rooms: 0.592
- price <-> has_swimming_pool: 0.522
