In [1]:
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.display import display, Markdown
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
import sys

In [2]:
# Set float format for display (e.g., 2 decimal places)
pd.options.display.float_format = '{:.2f}'.format

# Suppress only the specific FutureWarning from pandas
warnings.filterwarnings("ignore")

In [3]:
#using DataCleaning Module

sys.path.append('/kaggle/input/modules/pyfiles')
from datacleaning import DataCleaner

In [4]:
df = pd.read_csv("/kaggle/input/london-house-price-prediction-advanced-techniques/train.csv")
df["price"] = df["price"]*(10**-6)

In [5]:
questions = """
Startegy to (1) wrangle data (1.1 data completeness)(check for each columns):
1. ✅ `fullAddress` - No change required.
2. ✅ To extract incode from `postcode`.
3. ✅ To drop `country` column. 
4. ✅ `Latitude`/`Logitude` - No change required.
5. ✅ To impute `floorAreaSqM` with most frequent values (mode).
6. ✅ To impute `bathrooms`/`bedrooms`/`living_room`/`tenure`/`property_type`/`currentEnergyRating` with most frequent values (mode).
7. ✅ To generate time features from sale_year+sale_month using DeterministicProcess and merge
8. ✅ To encode columns w.r.t mean/bins - street, postcode, outcode, tenure, latitudeBins, longitudeBins, propertyType, currentEnergyRating
"""
display(Markdown(questions))


Startegy to (1) wrangle data (1.1 data completeness)(check for each columns):
1. ✅ `fullAddress` - No change required.
2. ✅ To extract incode from `postcode`.
3. ✅ To drop `country` column. 
4. ✅ `Latitude`/`Logitude` - No change required.
5. ✅ To impute `floorAreaSqM` with most frequent values (mode).
6. ✅ To impute `bathrooms`/`bedrooms`/`living_room`/`tenure`/`property_type`/`currentEnergyRating` with most frequent values (mode).
7. ✅ To generate time features from sale_year+sale_month using DeterministicProcess and merge
8. ✅ To encode columns w.r.t mean/bins - street, postcode, outcode, tenure, latitudeBins, longitudeBins, propertyType, currentEnergyRating


### Feature Selection and Modelling

In [6]:
train_df = pd.read_csv("/kaggle/input/london-house-price-prediction-advanced-techniques/train.csv")
test_df = pd.read_csv("/kaggle/input/london-house-price-prediction-advanced-techniques/test.csv")

train_df['sale_date'] = pd.to_datetime({
    'year': train_df['sale_year'],
    'month': train_df['sale_month'],
    'day': 1
})

test_df['sale_date'] = pd.to_datetime({
    'year': test_df['sale_year'],
    'month': test_df['sale_month'],
    'day': 1
})

sale_dates = pd.to_datetime(train_df['sale_date'].sort_values().unique())
sale_index = pd.date_range(start=sale_dates.min(), end=sale_dates.max(), freq='MS')  # 'MS' = Month Start

# TRAINING
train_cleaner = DataCleaner(train_df)
train_cleaner.extract_incode()
train_cleaner.impute_with_mode()
train_cleaner.generate_time_features(index=sale_index, fit=False)
train_cleaner.encode_features(target_col='price', fit=True)

train_data_final = train_cleaner.df

# TESTING
test_cleaner = DataCleaner(test_df)
test_cleaner.dp = train_cleaner.dp  # share fitted DP
test_cleaner.mean_encoders = train_cleaner.mean_encoders
test_cleaner.global_means = train_cleaner.global_means
test_cleaner.bin_edges = train_cleaner.bin_edges
test_cleaner.label_encoders = train_cleaner.label_encoders
test_cleaner.energy_encoder = train_cleaner.energy_encoder

test_cleaner.extract_incode()
test_cleaner.impute_with_mode()
test_cleaner.generate_time_features(index=sale_index, fit=True)  # use same DP
test_cleaner.encode_features(fit=False)

test_data_final = test_cleaner.df

Imputed 'bathrooms' with mode: 1.0
Imputed 'bedrooms' with mode: 2.0
Imputed 'floorAreaSqM' with mode: 55.0
Imputed 'livingRooms' with mode: 1.0
Imputed 'tenure' with mode: Leasehold
Imputed 'propertyType' with mode: Purpose Built Flat
Imputed 'currentEnergyRating' with mode: D
Imputed 'bathrooms' with mode: 1.0
Imputed 'bedrooms' with mode: 2.0
Imputed 'floorAreaSqM' with mode: 55.0
Imputed 'livingRooms' with mode: 1.0
Imputed 'tenure' with mode: Leasehold
Imputed 'propertyType' with mode: Purpose Built Flat
Imputed 'currentEnergyRating' with mode: D


In [7]:
X_train = train_data_final[['incode', 'outcode', 'latitude', 'longitude', 'bathrooms', 'bedrooms', 'floorAreaSqM',
    'livingRooms', 'tenure', 'propertyType', 'currentEnergyRating', 'sale_month', 'sale_year', 
    'const', 'trend', 'trend_squared', 'trend_cubed', 'trend**4', 'latitudeBins', 'longitudeBins']]

y_train = train_data_final[['price']]

# Split into training and validation
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.10, random_state=42
)

In [8]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Example feature set
X1_feature = ['trend', 'trend_squared', 'trend_cubed']  # Linear features
X2_feature = ['bathrooms', 'bedrooms', 'floorAreaSqM', 'livingRooms', 'latitude', 'longitude']  # Non-linear features

# Preprocessor for X2 (non-linear features, using scaling and encoding)
X2_preprocessor = ColumnTransformer([
    ('num', StandardScaler(), X2_feature),
])

# Preprocessor for X1 (linear features)
X1_preprocessor = ColumnTransformer([
    ('trend_scale', StandardScaler(), X1_feature)
])

# Custom Hybrid Model class that combines Ridge and CatBoost
class HybridModel(BaseEstimator, RegressorMixin):
    def __init__(self, trend_model, machine_model, trend_cols, machine_cols, trend_preprocessor, machine_preprocessor):
        self.trend_model = trend_model
        self.machine_model = machine_model
        self.trend_cols = trend_cols
        self.machine_cols = machine_cols
        self.trend_preprocessor = trend_preprocessor
        self.machine_preprocessor = machine_preprocessor

    def fit(self, X, y):
        # Fit Ridge (linear model) on trend features
        X1 = self.trend_preprocessor.fit_transform(X[self.trend_cols])
        X2 = self.machine_preprocessor.fit_transform(X[self.machine_cols])
        
        self.trend_model.fit(X1, y)
        
        # Compute residuals and fit CatBoost (non-linear model) on residuals
        residual = y - self.trend_model.predict(X1)
        self.machine_model.fit(X2, residual)
        return self

    def predict(self, X):
        # Predict with both models
        X1 = self.trend_preprocessor.transform(X[self.trend_cols])
        X2 = self.machine_preprocessor.transform(X[self.machine_cols])
        
        trend_pred = self.trend_model.predict(X1)
        machine_pred = self.machine_model.predict(X2)
        
        # Combine the predictions by adding (could also use a weighted average)
        return trend_pred + machine_pred


# Example data (replace with actual data)
# Let's simulate some data to illustrate
X = pd.DataFrame({
    'trend': np.linspace(1, 100, 1000),
    'trend_squared': np.linspace(1, 100, 1000)**2,
    'trend_cubed': np.linspace(1, 100, 1000)**3,
    'bathrooms': np.random.randint(1, 5, 1000),
    'bedrooms': np.random.randint(1, 7, 1000),
    'floorAreaSqM': np.random.uniform(30, 200, 1000),
    'livingRooms': np.random.randint(1, 5, 1000),
    'latitude': np.random.uniform(51, 52, 1000),
    'longitude': np.random.uniform(-0.5, 0.5, 1000),
})

# Simulated target variable (price)
y = np.random.uniform(100000, 1000000, 1000)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the hybrid model pipeline
model = Pipeline([
    ('HybridModel', HybridModel(
        trend_model=Ridge(alpha=1.0),  # Linear model
        machine_model=CatBoostRegressor(
            iterations=1000,  # Number of boosting rounds
            depth=6,  # Depth of each tree
            learning_rate=0.1,  # Learning rate
            cat_features=[],  # No categorical features in this example (you can add them if available)
            verbose=0  # Turn off CatBoost's verbose output
        ), 
        trend_cols=X1_feature,
        machine_cols=X2_feature,
        trend_preprocessor=X1_preprocessor,
        machine_preprocessor=X2_preprocessor
    ))
])

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Mean Absolute Error: 253177.20586469374


In [9]:
val_df = test_data_final[['incode', 'outcode',
    'latitude', 'longitude', 'bathrooms', 'bedrooms', 'floorAreaSqM',
    'livingRooms', 'tenure', 'propertyType', 'currentEnergyRating','sale_month', 'sale_year', 
    'const', 'trend', 'trend_squared', 'trend_cubed', 'trend**4', 'latitudeBins', 'longitudeBins']]

submission = pd.read_csv('/kaggle/input/london-house-price-prediction-advanced-techniques/sample_submission.csv')

# 💡 Hybrid Prediction (change weights if needed)
submission['price'] = model.predict(val_df)

submission.to_csv('submission.csv', index=False)