# Train LGBMRegressor(tuned) + catboost(default) + combined dataset + 10 split fold + labeled datasources + distance to cities

# Imports

In [None]:
!pip install polars

In [None]:
!pip install snoop

In [None]:
import polars as pl
from snoop import pp
from polars.testing import assert_frame_equal, assert_series_equal

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read the competition's train, test and sample-submission CSV files as polars frames.
train_pl, test_pl, sample_sub_pl = (
    pl.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e1/train.csv',
        '/kaggle/input/playground-series-s3e1/test.csv',
        '/kaggle/input/playground-series-s3e1/sample_submission.csv',
    )
)

# join Kaggle dataset with SKlearn dataset

In [None]:
from sklearn.datasets import fetch_california_housing

# Load the original California housing data through scikit-learn.
original_data = fetch_california_housing()
# Show the names of the feature columns and of the target column.
original_data.feature_names
original_data.target_names

In [None]:
# Turn the scikit-learn arrays into polars frames: one for the features, one
# for the target.
original_features_pl = pl.from_numpy(original_data.data, original_data.feature_names)
original_target_pl = pl.from_numpy(original_data.target, original_data.target_names)

# Put them side by side, flag every row as not generated (to tell the two data
# sources apart) and keep the target as the last column.
additional_data = (
    pl.concat([original_features_pl, original_target_pl], how='horizontal')
    .select([
        pl.all().exclude('MedHouseVal'),
        pl.lit(False).alias('is_generated'),
        'MedHouseVal',
    ])
)
additional_data.head()
additional_data.columns

In [None]:
# Flag the competition rows as generated and keep the target as the last column.
# `with_columns` replaces an already existing `is_generated` column instead of
# adding a second one, so this cell can be re-run without a duplicate-column
# error even though it reassigns `train_pl` from itself.
train_pl = (
    train_pl
    .with_columns([
        pl.lit(True).alias('is_generated'),
    ])
    .select([
        pl.all().exclude('MedHouseVal'),
        'MedHouseVal',
    ])
)
train_pl.head()


In [None]:
# Model inputs: the eight housing columns plus the data-source flag.
features = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
    'is_generated',
]
# Column the models predict.
target = 'MedHouseVal'

In [None]:
# Stack the competition rows on top of the original rows. Selecting only
# features + target leaves the `id` column out of the competition frame.
joined_columns = features + [target]
joined_columns
train_joined_pl = train_pl.select(joined_columns).vstack(additional_data)
train_joined_pl.shape
train_joined_pl.columns

# Feature: distance to cities
Thanks to @phongnguyen1, reference: https://www.kaggle.com/code/phongnguyen1/distance-to-cities-features-clustering?scriptVersionId=115694922&cellId=40

**About haversine_distances** [link](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html)

> The Haversine (or great circle) distance is the angular distance between two points on the surface of a sphere. The first coordinate of each point is assumed to be the latitude, the second is the longitude, given in radians. The dimension of the data must be 2.

> As the Earth is nearly spherical, the haversine formula provides a good approximation of the distance between two points of the Earth surface, with a less than 1% error on average.

In [None]:
def get_distance(lat1, long1, lat2, long2):
    """Return the great-circle (haversine) distance between two points in kilometers.

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere's surface in kilometers, using an Earth
        radius of 6371 km.

    Notes
    -----
    The previous version computed ``result * 6371000/1000`` but discarded the
    product, so it returned the angular distance in radians instead of
    kilometers.
    """
    from math import asin, cos, radians, sin, sqrt

    earth_radius_km = 6371000 / 1000

    phi1, phi2 = radians(lat1), radians(lat2)
    delta_phi = phi2 - phi1
    delta_lambda = radians(long2) - radians(long1)

    # Haversine formula: the angular distance is 2 * asin(sqrt(a)).
    a = sin(delta_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(delta_lambda / 2) ** 2
    # Clamp so floating-point rounding can never push the asin argument past 1.
    central_angle = 2 * asin(sqrt(min(1.0, a)))

    # Scale the angle (radians) by the Earth radius to get kilometers.
    return central_angle * earth_radius_km

In [None]:
# Reference cities as (latitude, longitude) pairs in decimal degrees; element 0
# is passed to get_distance as the latitude and element 1 as the longitude.
Sacramento = (38.576931, -121.494949)
SanFrancisco = (37.780080, -122.420160)
SanJose = (37.334789, -121.888138)
LosAngeles = (34.052235, -118.243683)
SanDiego = (32.715759, -117.163818)

In [None]:
# (output column, city coordinates) for every per-city distance feature.
city_distance_columns = [
    ('dist2Sacramento', Sacramento),
    ('dist2SanFrancisco', SanFrancisco),
    ('dist2SanJose', SanJose),
    ('dist2LosAngeles', LosAngeles),
    ('dist2SanDiego', SanDiego),
]
distance_column_names = [column_name for column_name, _ in city_distance_columns]


def _distance_to(city):
    """Build a row function giving the distance from the row's location to `city`."""
    return lambda row: get_distance(row['Latitude'], row['Longitude'], city[0], city[1])


def _across_cities(reduce_fn):
    """Build a row function applying `reduce_fn` to the row's per-city distances."""
    return lambda row: reduce_fn([row[column_name] for column_name in distance_column_names])


train_joined_pl_add_dist = (
    train_joined_pl
    # Step 1: one distance column per reference city.
    .with_columns([
        pl.struct(['Latitude', 'Longitude']).apply(_distance_to(city)).alias(column_name)
        for column_name, city in city_distance_columns
    ])
    # Step 2: row-wise minimum, maximum and sum of those five distances.
    .with_columns([
        pl.struct(distance_column_names).apply(_across_cities(min)).alias('dist2nearestCity'),
        pl.struct(distance_column_names).apply(_across_cities(max)).alias('dist2furthestCity'),
        pl.struct(distance_column_names).apply(_across_cities(sum)).alias('dist2allCity'),
    ])
)
train_joined_pl_add_dist

In [None]:
# Every column of the engineered frame except the target is a model input.
features = [
    column_name
    for column_name in train_joined_pl_add_dist.columns
    if column_name != 'MedHouseVal'
]

features, target
len(features)

# Training LGBMRegressor model

Let's begin by importing the modelling tools. The data is split into train and validation sets with 10-fold cross-validation in the training loop below.

In [None]:
from lightgbm.sklearn import LGBMRegressor
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

The variable that we will be predicting is the `MedHouseVal`. We will use the rest of the columns (minus the id column) for training.

In [None]:
# these parameters come from soupmonsters awesome notebook here: https://www.kaggle.com/code/soupmonster/simple-lightgbm-baseline
# LightGBM hyper-parameters, taken from soupmonster's notebook:
# https://www.kaggle.com/code/soupmonster/simple-lightgbm-baseline
params = dict(
    learning_rate=0.02,
    n_estimators=100_000,
    metric='rmse',
    lambda_l1=1.945,
    num_leaves=87,
    feature_fraction=0.79,
    bagging_fraction=0.93,
    bagging_freq=4,
    min_data_in_leaf=103,
    max_depth=17,
)

In [None]:
clfs_f64pl = []  # one fitted LGBMRegressor per fold (10 in total)
kf = KFold(n_splits=10, random_state=0, shuffle=True) # this line must be included in the same cell as the training block below
rmses = []  # validation RMSE of every fold

for train_index, val_index in kf.split(train_joined_pl_add_dist): # kf.split can work with pl.DataFrame
    # Select the fold's rows and cast everything (including the boolean
    # `is_generated` flag) to Float64 before handing it to LightGBM.
    X_train, X_val = train_joined_pl_add_dist[features][train_index].select(pl.all().cast(pl.Float64)), train_joined_pl_add_dist[features][val_index].select(pl.all().cast(pl.Float64))
    y_train, y_val = train_joined_pl_add_dist[target][train_index].cast(pl.Float64), train_joined_pl_add_dist[target][val_index].cast(pl.Float64)

    clf = LGBMRegressor(**params)
    clf.fit(X_train.to_numpy(),
            y_train.to_numpy(),
            eval_set=[(X_val.to_numpy(), y_val.to_numpy())],
            # stop once the validation metric has not improved for 85 rounds
            callbacks=[lgbm.early_stopping(85, verbose=True)])

    preds = clf.predict(X_val.to_numpy())

    clfs_f64pl.append(clf)
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the same RMSE on every version.
    rmses.append(np.sqrt(mean_squared_error(y_val.to_numpy(), preds)))

print(f'mean RMSE across all folds: {pl.Series(rmses).mean()}')

# Train a catboost model

In [None]:
from catboost import CatBoostRegressor
clfs_f64pl_cat = []  # one fitted CatBoostRegressor per fold
rmses = []  # validation RMSE of every fold
# A different random_state than the LightGBM loop, so the folds are split differently.
kf = KFold(n_splits=10, random_state=1, shuffle=True)
for train_index, val_index in kf.split(train_joined_pl_add_dist):
    # Select the fold's rows and cast every column to Float64.
    X_train = train_joined_pl_add_dist[features][train_index].select(pl.all().cast(pl.Float64))
    X_val = train_joined_pl_add_dist[features][val_index].select(pl.all().cast(pl.Float64))
    y_train = train_joined_pl_add_dist[target][train_index].cast(pl.Float64)
    y_val = train_joined_pl_add_dist[target][val_index].cast(pl.Float64)

    # Default CatBoost settings apart from the iteration cap; early stopping
    # on the validation fold decides the actual number of trees.
    clf = CatBoostRegressor(iterations=100_000, loss_function='RMSE')
    clf.fit(X_train.to_numpy(),
            y_train.to_numpy(),
            eval_set=(X_val.to_numpy(), y_val.to_numpy()),
            early_stopping_rounds=1000, verbose=False)

    preds = clf.predict(X_val.to_numpy())

    clfs_f64pl_cat.append(clf)
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the same RMSE on every version.
    rmses.append(np.sqrt(mean_squared_error(y_val.to_numpy(), preds)))
print(f'mean RMSE across all folds: {np.mean(rmses)}')

Let us now look at the variables that are important according to our model.

In [None]:
# Feature importances of `clf` (the most recently fitted model), together with
# each feature's share of the total importance, largest share first.
importance_pl = pl.DataFrame({
    "features": features,
    "importance": clf.feature_importances_,
})
importance_share = (pl.col('importance') / pl.col('importance').sum()).alias('ratio')
importance_pl.with_columns([importance_share]).sort('ratio', reverse=True)

# prepare test set

In [None]:
# Display the feature list used for training; the test frame built below is
# later indexed with exactly these columns.
features

In [None]:
# Output column -> city coordinates for the per-city distance features.
test_city_columns = {
    'dist2Sacramento': Sacramento,
    'dist2SanFrancisco': SanFrancisco,
    'dist2SanJose': SanJose,
    'dist2LosAngeles': LosAngeles,
    'dist2SanDiego': SanDiego,
}
test_distance_names = list(test_city_columns)

test_pl_adddist = (
    test_pl
    # The test rows come from the competition data, so they are flagged as generated.
    .select([
        pl.all(),
        pl.lit(True).alias('is_generated'),
    ])
    # One distance column per reference city (the default argument binds the
    # city of the current iteration to its own lambda).
    .with_columns([
        pl.struct(['Latitude', 'Longitude'])
        .apply(lambda row, city=city: get_distance(row['Latitude'], row['Longitude'], city[0], city[1]))
        .alias(column_name)
        for column_name, city in test_city_columns.items()
    ])
    # Row-wise minimum, maximum and sum of the five city distances.
    .with_columns([
        pl.struct(test_distance_names)
        .apply(lambda row, reduce_fn=reduce_fn: reduce_fn([row[name] for name in test_distance_names]))
        .alias(summary_name)
        for summary_name, reduce_fn in (
            ('dist2nearestCity', min),
            ('dist2furthestCity', max),
            ('dist2allCity', sum),
        )
    ])
)
test_pl_adddist

# Ensemble 

In [None]:
# Build the test matrix once (it is the same for every model) and cast it to
# Float64, exactly as the training folds were cast before fitting.
X_test = test_pl_adddist[features].select(pl.all().cast(pl.Float64)).to_numpy()

# One prediction vector per fitted model: the LightGBM folds, then the CatBoost folds.
test_preds = [model.predict(X_test) for model in (clfs_f64pl + clfs_f64pl_cat)]

# Unweighted mean over all models for every test row, as a plain list of floats.
# Averaging with numpy avoids depending on how polars builds a DataFrame from a
# list of arrays (the old transpose/explode/mean(axis=1) round-trip).
test_preds_mean_pl = np.mean(np.vstack(test_preds), axis=0).tolist()

# Make a submission

In [None]:
# Pair every test id with its ensembled prediction and write the submission file.
submission = pl.DataFrame({
    'id': test_pl['id'],
    'MedHouseVal': test_preds_mean_pl,
})

submission.write_csv('clfs_lgbm_cat_extsrc.csv')

This is shaping up to be a very exciting challenge! 🥳 

**If you found this notebook useful, please upvote! 🙏 Thank you!**

All the best in the competition!