# Random Forest Model

## 1) Clean X selection and train-test splitting

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Load the CSV into a DataFrame.
# The relative path is resolved against the kernel's current working directory.
df = pd.read_csv("../data/raw/filtered_final_cleaned_data.csv")

def prepare_features_and_target(
    df,
    target="price (€)",
    columns_to_drop=("property_ID", "locality_name", "postal_code"),
    test_size=0.2,
    random_state=777,
):
    """
    Separates the target from the features, drops unwanted columns and
    returns X_train, X_test, y_train, y_test.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset containing the target column and every column listed
        in ``columns_to_drop``.
    target : str
        Name of the column to predict. It is always removed from X.
    columns_to_drop : iterable of str
        Additional columns to exclude from the features.
        Note: "price_per_square_meter" is not included in current CSV.
    test_size : float
        Fraction of rows held out for the test set.
    random_state : int
        Seed passed to train_test_split so the split is reproducible.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)

    Raises
    ------
    KeyError
        If the target or one of the columns to drop is missing from ``df``.
    """
    # The target must never remain among the features, even if the caller
    # overrides columns_to_drop and forgets to list it.
    to_drop = [target] + [col for col in columns_to_drop if col != target]

    # 1. Create X and y
    X = df.drop(columns=to_drop)
    y = df[target]

    # 2. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test



## 2) Imputation for numerical and categorical columns to handle NaNs

In [2]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Function to impute numerical columns
def impute_numeric_columns(X_train, X_test):
    """
    Imputes numerical columns in X_train and X_test using the mean of X_train.

    The imputer is fitted on X_train only and the same means are then applied
    to X_test, so no information from the test set is used. The input frames
    are not modified: imputed copies are returned.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that share the same columns.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Copies of the inputs with the numerical columns imputed.
    num_cols : pandas.Index
        Names of the int64/float64 columns of X_train that were selected.
    """
    # Work on copies so the caller's frames stay untouched and pandas does not
    # warn about assigning into a slice of another DataFrame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Select numerical columns automatically
    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

    # Fitting an imputer on zero columns raises in scikit-learn, so only run
    # it when there is something to impute.
    if len(num_cols) > 0:
        imputer = SimpleImputer(strategy="mean")

        # Fit on training data
        X_train[num_cols] = imputer.fit_transform(X_train[num_cols])

        # Transform test data using SAME means
        X_test[num_cols] = imputer.transform(X_test[num_cols])

    return X_train, X_test, num_cols

# Function to impute categorical column "state_of_building"
def impute_categorical_state(X_train, X_test, column="state_of_building", fill_value="unknown"):
    """
    Fills missing values in a categorical column with a placeholder category.

    The input frames are not modified: filled copies are returned.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that both contain ``column``.
    column : str
        Name of the column to fill (default "state_of_building").
    fill_value : str
        Category written in place of missing values (default "unknown").

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Copies of the inputs with no missing values left in ``column``.
    """
    # Work on copies so the caller's frames stay untouched and pandas does not
    # warn about assigning into a slice of another DataFrame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    X_train[column] = X_train[column].fillna(fill_value)
    X_test[column] = X_test[column].fillna(fill_value)

    return X_train, X_test

# Show each column's dtype: the numeric imputer above selects int64/float64
# columns, while object columns are handled separately.
display(df.dtypes)



property_ID                       object
locality_name                     object
postal_code                        int64
type                              object
subtype                           object
price (€)                          int64
number_of_bedrooms               float64
living_area (m²)                   int64
equiped_kitchen (yes:1, no:0)      int64
furnished (yes:1, no:0)            int64
open_fire (yes:1, no:0)            int64
terrace (yes:1, no:0)              int64
terrace_area (m²)                float64
garden (yes:1, no:0)               int64
number_facades                   float64
swimming_pool (yes:1, no:0)        int64
state_of_building                 object
province                          object
dtype: object

## 3) Encoding: LabelEncoding and OrdinalEncoding

In [3]:
from sklearn.preprocessing import LabelEncoder

def label_encode_columns(X_train, X_test, columns, unknown_value=-1):
    """
    Label-encodes multiple categorical columns using separate LabelEncoders.
    No data leakage: fits only on X_train.

    For every column ``c`` in ``columns`` a new column ``c + "_le"`` is added;
    the original column is kept. The input frames are not modified: encoded
    copies are returned.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that both contain every column in ``columns``.
    columns : iterable of str
        Names of the categorical columns to encode.
    unknown_value : int
        Code assigned to X_test values that never appeared in X_train
        (default -1). Without this fallback a single unseen category in the
        test split would abort the whole pipeline with a ValueError.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Copies of the inputs with the extra "<column>_le" columns.
    encoders : dict
        Maps each column name to its fitted LabelEncoder. Calling
        ``transform`` on these encoders directly still raises on unseen labels.
    """
    # Work on copies so the caller's frames stay untouched and pandas does not
    # warn about assigning into a slice of another DataFrame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    encoders = {}

    for col in columns:
        le = LabelEncoder()

        X_train[col + "_le"] = le.fit_transform(X_train[col])

        # A label's code is its position in the fitted classes_, so this
        # lookup reproduces le.transform for known labels while letting
        # unseen ones fall back to unknown_value instead of raising.
        code_by_label = {label: code for code, label in enumerate(le.classes_)}
        X_test[col + "_le"] = (
            X_test[col].map(code_by_label).fillna(unknown_value).astype("int64")
        )

        encoders[col] = le  # store encoder

    return X_train, X_test, encoders

from sklearn.preprocessing import OrdinalEncoder

def ordinal_encode_state(X_train, X_test, column="state_of_building"):
    """
    Applies Ordinal Encoding to the state_of_building column using a fixed custom order.
    Fits only on X_train (no leakage).

    A new column ``column + "_oe"`` is added that holds the position of each
    value in the fixed order below ("unknown" is first, "New" is last); the
    original column is kept. The input frames are not modified: encoded
    copies are returned.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that both contain ``column``.
    column : str
        Name of the column to encode (default "state_of_building").

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Copies of the inputs with the extra "<column>_oe" column.
    oe : OrdinalEncoder
        The fitted encoder.

    Notes
    -----
    The encoder is created with its default settings, so a value outside the
    fixed list (including a remaining NaN) is expected to raise - fill missing
    values with "unknown" before calling this function.
    """
    # Position in this list = encoded value.
    state_order = [[
        "unknown",
        "To demolish",
        "Under construction",
        "To restore",
        "To renovate",
        "To be renovated",
        "Normal",
        "Fully renovated",
        "Excellent",
        "New"
    ]]

    # Work on copies so the caller's frames stay untouched and pandas does not
    # warn about assigning into a slice of another DataFrame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    oe = OrdinalEncoder(categories=state_order)

    # The encoder works on 2-D input and returns a 2-D array with one column,
    # which is flattened back to 1-D before being stored.
    train_encoded = oe.fit_transform(X_train[[column]])
    test_encoded = oe.transform(X_test[[column]])

    X_train[column + "_oe"] = train_encoded.ravel()
    X_test[column + "_oe"] = test_encoded.ravel()

    return X_train, X_test, oe



## 4) Final Assembly

In [4]:
# Step 1: `df` was already loaded from the CSV in section 1 and is reused here.
# The split below starts again from `df`, so this cell can be re-run safely.

# Step 2: Prepare X and y
X_train, X_test, y_train, y_test = prepare_features_and_target(df)

# Step 3: Impute numerical values
X_train, X_test, num_cols = impute_numeric_columns(X_train, X_test)

# Step 4: Impute categorical missing values
X_train, X_test = impute_categorical_state(X_train, X_test)

# Step 5: LabelEncoding for "type", "subtype" and "province" columns
label_encoded_cols = ["type", "province", "subtype"]
X_train, X_test, le_encoders = label_encode_columns(
    X_train,
    X_test,
    columns=label_encoded_cols
)

# Step 6: OrdinalEncoding for "state_of_building" column
X_train, X_test, state_encoder = ordinal_encode_state(X_train, X_test, column="state_of_building")

# Step 7: Assemble everything together and drop the original columns from before encoding
# (the list is built once so train and test can never drift apart)
raw_categorical_cols = label_encoded_cols + ["state_of_building"]
X_train_final = X_train.drop(columns=raw_categorical_cols)
X_test_final = X_test.drop(columns=raw_categorical_cols)

# y_train & y_test need no further processing and are used as returned by the split.

# Safety checks: both splits expose the same features and stay aligned with their targets
assert list(X_train_final.columns) == list(X_test_final.columns), "train/test feature mismatch"
assert len(X_train_final) == len(y_train), "X_train_final and y_train differ in length"
assert len(X_test_final) == len(y_test), "X_test_final and y_test differ in length"

print("X_train_final shape:", X_train_final.shape)
print("X_test_final shape: ", X_test_final.shape)


X_train_final shape: (11636, 14)
X_test_final shape:  (2909, 14)


## 5) Training and Evaluation

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 1. Function to create and train the model
def train_random_forest(X_train, y_train, **rf_params):
    """
    Creates a RandomForestRegressor and fits it on the training data.

    Parameters
    ----------
    X_train : array-like or pandas.DataFrame
        Training features.
    y_train : array-like or pandas.Series
        Training target.
    **rf_params
        Optional keyword arguments forwarded to RandomForestRegressor. They
        override the defaults below, which makes it possible to try other
        hyperparameters without editing this function.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    params = {
        "n_estimators": 300,      # number of trees
        "max_depth": 15,          # limit tree depth
        "min_samples_split": 10,  # require more samples to split
        "min_samples_leaf": 5,    # prevent tiny leaf nodes
        "random_state": 888,      # fixed seed for reproducible forests
        "n_jobs": -1,             # use all CPU cores
    }
    params.update(rf_params)

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    return model

# 2. Function to evaluate the model
def _regression_metrics(y_true, y_pred):
    """Returns MAE, RMSE and R2 for one set of predictions as plain Python floats."""
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        # RMSE is the square root of the MSE, so it is in the same unit as the target
        "RMSE": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "R2": float(r2_score(y_true, y_pred)),
    }


def evaluate_random_forest(model, X_train, y_train, X_test, y_test):
    """
    Computes regression metrics of a fitted model on the train and test sets.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.

    Returns
    -------
    dict
        ``{"train": {...}, "test": {...}}`` where each inner dict holds the
        keys "MAE", "RMSE" and "R2". All values are plain floats, so the
        result prints uniformly.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    return {
        "train": _regression_metrics(y_train, y_pred_train),
        "test": _regression_metrics(y_test, y_pred_test),
    }
   

# Fit the random forest on the fully preprocessed training features
model_rf = train_random_forest(X_train=X_train_final, y_train=y_train)

# Score the fitted model on both splits so train and test metrics can be compared
results = evaluate_random_forest(
    model=model_rf,
    X_train=X_train_final,
    y_train=y_train,
    X_test=X_test_final,
    y_test=y_test,
)

results



{'train': {'MAE': 57324.674913963776,
  'RMSE': np.float64(107368.78375210226),
  'R2': 0.8189871043007264},
 'test': {'MAE': 79336.77402068197,
  'RMSE': np.float64(219089.9979201939),
  'R2': 0.5247791789404848}}

Conclusion: the model appears to be overfitting: the gap between the train R² (≈0.82) and the test R² (≈0.52) is far larger than 0.100.

Possible ways to improve the model: a grid search, adjusting the RF hyperparameters, or applying a log transform?
