# Property Price Prediction - EDA and Model Training

This notebook explores the real estate data and trains a machine learning model to predict property prices in Poland.

In [None]:
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so NumPy-based randomness in this
# notebook is repeatable between runs.
# NOTE(review): the scikit-learn calls further down (train_test_split and
# RandomForestRegressor) are given random_state=42 explicitly.
np.random.seed(42)

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Read the raw listings CSV; its first column is used as the DataFrame index.
df = pd.read_csv('../data/raw/data_final.csv', index_col=0)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
# Last expression of the cell: the notebook renders the first five rows.
df.head()

## 2. Data Exploration

In [None]:
# High-level overview of the raw frame: schema, missing values, target
# distribution and the number of listings per voivodeship.
print("=== DATASET INFO ===")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

print("\n=== MISSING VALUES ===")
print(df.isnull().sum())

print("\n=== PRICE STATISTICS ===")
print(df['Price'].describe())

print("\n=== VOIVODESHIPS ===")
print(df['voivodeship'].value_counts())

In [None]:
# List the distinct values (and how many there are) of each categorical column.
categorical_overview = [
    ("Heating types", 'Heating'),
    ("Building materials", 'Building material'),
    ("Building types", 'Building type'),
    ("Market types", 'Market'),
]

for position, (label, column_name) in enumerate(categorical_overview):
    if position:
        # Blank separator line between consecutive columns (none after the last).
        print()
    column_values = df[column_name]
    print(f"{label}:", column_values.unique())
    print(f"Count: {column_values.nunique()}")

## 3. Data Cleaning and Preprocessing

In [None]:
# Work on a copy so the raw frame `df` stays untouched, and discard every row
# that lacks one of the columns the model cannot do without.
critical_columns = ['Price', 'Area (m²)', 'Number of rooms', 'year_const', 'voivodeship']
df_processed = df.copy().dropna(subset=critical_columns)

rows_before, rows_after = len(df), len(df_processed)
print(f"Shape after removing missing critical values: {df_processed.shape}")
print(f"Rows removed: {rows_before - rows_after}")

In [None]:
# Split a comma-separated address into its city and district components
def extract_city_district(address):
    """Return a ``(city, district)`` tuple parsed from a comma-separated address.

    The city is taken from the second-to-last comma-separated field and the
    district from the third-to-last one. ``'Unknown'`` is used for the district
    when the address has only two fields, and for both values when the address
    is missing or has fewer than two fields.
    """
    unknown = 'Unknown'

    if pd.isna(address):
        return unknown, unknown

    fields = [field.strip() for field in str(address).split(',')]
    if len(fields) < 2:
        return unknown, unknown

    district = fields[-3] if len(fields) >= 3 else unknown
    return fields[-2], district

# Derive the 'city' and 'district' columns from the free-text Address column.
# Each address is mapped to a (city, district) tuple and both columns are then
# built with a single DataFrame construction; creating a separate pd.Series for
# every row inside apply() is far slower on large frames.
city_district_pairs = df_processed['Address'].map(extract_city_district)
df_processed[['city', 'district']] = pd.DataFrame(
    city_district_pairs.tolist(),
    index=df_processed.index,  # keep row alignment with df_processed
    columns=['city', 'district'],
)

print("Sample cities and districts:")
print(df_processed[['Address', 'city', 'district']].head(10))
print(f"\nNumber of unique cities: {df_processed['city'].nunique()}")
print(f"Number of unique districts: {df_processed['district'].nunique()}")

In [None]:
# Trim extreme listings: keep only rows whose price AND area both lie inside
# the 1st-99th percentile band (bounds inclusive).
# NOTE: despite the *_q1 / *_q3 names, these hold the 1% and 99% quantiles,
# not the quartiles.
percentile_band = [0.01, 0.99]
price_q1, price_q3 = df_processed['Price'].quantile(percentile_band)
area_q1, area_q3 = df_processed['Area (m²)'].quantile(percentile_band)

print(f"Price range (1%-99%): {price_q1} - {price_q3}")
print(f"Area range (1%-99%): {area_q1} - {area_q3}")

price_in_band = df_processed['Price'].between(price_q1, price_q3)
area_in_band = df_processed['Area (m²)'].between(area_q1, area_q3)
df_processed = df_processed[price_in_band & area_in_band]

print(f"\nShape after removing outliers: {df_processed.shape}")

In [None]:
# Fill gaps in categorical features with a per-column default category
# ('inne' / 'inny' are Polish for "other", 'wtórny' for "secondary").
#
# The fill is a single DataFrame.fillna(dict) whose result is re-assigned.
# Calling `df_processed[col].fillna(..., inplace=True)` instead would modify a
# column selected from an already-filtered frame (chained assignment): pandas
# 2.x deprecates that form and under Copy-on-Write it silently leaves
# df_processed unchanged.
categorical_fill_values = {
    'Heating': 'inne',
    'Building material': 'inny',
    'Market': 'wtórny',
}
df_processed = df_processed.fillna(value=categorical_fill_values)

# NOTE(review): 'Building type' is included in the report below but is not
# filled here; if it contains NaN, the later label-encoding step may fail on
# mixed str/NaN values. Confirm against the data and add a default if needed.
print("Missing values after handling:")
print(df_processed[['Heating', 'Building material', 'Market', 'Building type']].isnull().sum())

## 4. Feature Selection and Engineering

In [None]:
# Model inputs, in the column order the model is trained on.
features = [
    # numeric columns
    'Area (m²)', 'Number of rooms', 'year_const',
    # categorical columns (label-encoded in the next step)
    'Heating', 'Building material', 'Building type', 'Market', 'voivodeship',
]

# Feature matrix X and target vector y, copied so that later encoding does not
# touch df_processed.
X = df_processed.loc[:, features].copy()
y = df_processed.loc[:, 'Price'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {X.columns.tolist()}")

In [None]:
# Replace every categorical column with integer codes, keeping one fitted
# LabelEncoder per column so the same mapping can be reused later.
categorical_features = ['Heating', 'Building material', 'Building type', 'Market', 'voivodeship']
label_encoders = {}

X_encoded = X.copy()

for column_name in categorical_features:
    encoder = LabelEncoder().fit(X_encoded[column_name])
    X_encoded[column_name] = encoder.transform(X_encoded[column_name])
    label_encoders[column_name] = encoder
    # Show the category -> code mapping learned for this column.
    code_by_category = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{column_name}: {code_by_category}")

print(f"\nEncoded features shape: {X_encoded.shape}")
print(X_encoded.head())

## 5. Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}")
print(f"Test set size: {n_test}")

# Quick check that both splits cover a comparable price range.
train_low, train_high = y_train.min(), y_train.max()
test_low, test_high = y_test.min(), y_test.max()
print(f"\nTraining set price range: {train_low} - {train_high}")
print(f"Test set price range: {test_low} - {test_high}")

## 6. Train Random Forest Model

In [None]:
# Fit a Random Forest regressor on the training split.
print("Training Random Forest model...")

rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,  # fixed seed for repeatable forests
    'n_jobs': -1,
    'verbose': 1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

print("Model training completed!")

## 7. Model Evaluation

In [None]:
# Score the fitted model on both splits; comparing training and test metrics
# shows how much the model overfits.
def _regression_metrics(y_true, y_pred):
    """Return (R², RMSE, MAE) for the given true and predicted values."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_r2, train_rmse, train_mae = _regression_metrics(y_train, y_train_pred)
test_r2, test_rmse, test_mae = _regression_metrics(y_test, y_test_pred)

print("=== MODEL PERFORMANCE ===")
metrics_by_split = (
    ("Training Set", train_r2, train_rmse, train_mae),
    ("Test Set", test_r2, test_rmse, test_mae),
)
for split_title, split_r2, split_rmse, split_mae in metrics_by_split:
    print(f"\n{split_title}:")
    print(f"  R² Score: {split_r2:.4f}")
    print(f"  RMSE: {split_rmse:,.2f} PLN")
    print(f"  MAE: {split_mae:,.2f} PLN")

In [None]:
# Rank the input features by the importance the fitted forest assigns to them.
feature_importance = pd.DataFrame(
    {'feature': features, 'importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\n=== FEATURE IMPORTANCE ===")
print(feature_importance)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
fig.tight_layout()
plt.show()

## 8. Save Model and Encoders

In [None]:
# Persist the trained model and everything needed to reproduce its inputs:
# the fitted label encoders, the ordered feature list and the cleaned data.

# Create the output directories first so that a fresh checkout does not fail
# with FileNotFoundError only after the whole training run has finished.
for output_dir in (Path('../models'), Path('../data/processed')):
    output_dir.mkdir(parents=True, exist_ok=True)

# Save model
joblib.dump(model, '../models/price_model.joblib')
print("Model saved to ../models/price_model.joblib")

# Save label encoders (dict: column name -> fitted LabelEncoder)
joblib.dump(label_encoders, '../models/label_encoders.joblib')
print("Label encoders saved to ../models/label_encoders.joblib")

# Save feature list (column order used for training)
joblib.dump(features, '../models/features.joblib')
print("Features list saved to ../models/features.joblib")

# Save processed data for reference
df_processed.to_csv('../data/processed/data_processed.csv')
print("Processed data saved to ../data/processed/data_processed.csv")

In [None]:
# Final recap of the run: data volume, split sizes and headline test metrics.
summary_lines = [
    "\n=== TRAINING SUMMARY ===",
    f"Dataset size: {len(df_processed)} properties",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    "\nModel: Random Forest Regressor",
    f"Test R² Score: {test_r2:.4f}",
    f"Test RMSE: {test_rmse:,.2f} PLN",
    "\nModel and encoders saved successfully!",
]
print("\n".join(summary_lines))