In [1]:
import re
import warnings

import numpy as np
import pandas as pd

# new import for lightgbm, also requires numpy
from lightgbm import LGBMRegressor

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder

In [37]:
# Load the three CSV inputs used by this notebook, in a fixed order.
train, new_X, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

def extract_number(text):
    """Parse the leading numeric quantity out of a bathroom description.

    Parameters
    ----------
    text : object
        A raw ``bathrooms_text`` value, e.g. ``"1.5 baths"`` or
        ``"Shared half-bath"``. Non-string values are stringified before
        matching.

    Returns
    -------
    float
        * the first integer/decimal number found in ``text``;
        * ``0.5`` when the text contains no digits (treated as a half-bath);
        * ``nan`` when ``text`` itself is missing (``None`` / ``NaN``), so
          that a downstream imputer can fill it instead of the value being
          silently recorded as half a bathroom.
    """
    # A missing cell would otherwise be stringified to 'nan'/'None', fail the
    # digit match below, and be mislabelled as a half-bath.
    if pd.isna(text):
        return float('nan')

    match = re.search(r'(\d+\.?\d*)', str(text))
    if match:
        return float(match.group())
    return 0.5  # if it doesn't have a number, it is 'half-bath'

# --- Derived numeric columns -------------------------------------------------
# Bathroom count parsed from the free-text column.
train['baths'] = train['bathrooms_text'].apply(extract_number)

# Acceptance rate arrives as a percentage string ("97%"); store it as a fraction.
acceptance_pct = train['host_acceptance_rate'].str.rstrip('%').astype(float)
train['host_acceptance_rate'] = acceptance_pct / 100

# --- Categorical dtypes ------------------------------------------------------
category_columns = [
    'host_is_superhost',
    'host_location',
    'neighbourhood_cleansed',
    'property_type',
    'instant_bookable',
    'amenities',
    'host_response_time',
]
for column in category_columns:
    train[column] = train[column].astype('category')

# --- Target and feature matrix -----------------------------------------------
# Candidates considered but currently left out: host_location (maybe),
# host_response_time (maybe), room_type, maximum_maximum_nights,
# maximum_nights, number_of_reviews, number_of_reviews_ltm (check this).
feature_columns = [
    'host_acceptance_rate',
    'host_is_superhost',
    'neighbourhood_cleansed',
    'property_type',
    'accommodates',
    'baths',
    'beds',       # not bedrooms
    'amenities',  # or if it even has amenities, or pools vs hottub or pets
    'instant_bookable',
]

y = train[['price']]
X = train[feature_columns]

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=307,
)


In [38]:
# Columns routed to the numeric branch of the preprocessor.
# Currently excluded candidates: maximum_nights, maximum_maximum_nights,
# number_of_reviews, number_of_reviews_ltm.
numeric_features = ['accommodates',
                    'host_acceptance_rate',
                    'beds',
                    'baths',
                    ]

# Columns routed to the categorical branch of the preprocessor.
# Currently excluded candidates: host_location, host_response_time.
categorical_features = ['host_is_superhost',
                        'neighbourhood_cleansed',
                        'property_type',
                        'amenities',
                        'instant_bookable'
                        ]

# Numeric branch: fill gaps with the median, expand to degree-2 polynomial
# terms, standardize, then keep the top 40% of features by f_regression score.
numeric = Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                 ('polynomial', PolynomialFeatures(degree = 2, include_bias=False)),
                 ('standardize', StandardScaler()),
                 ('percent', SelectPercentile(f_regression, percentile=40))])

# Categorical branch: fill gaps with the mode, one-hot encode (categories not
# seen during fit are ignored at predict time), then keep the top 60% of the
# resulting indicator columns by f_regression score.
categorical = Pipeline(steps=[('impute2', SimpleImputer(strategy='most_frequent')),
                     ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
                     ('percent', SelectPercentile(f_regression, percentile=60))
                     ])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric, numeric_features),
        ("categorical", categorical, categorical_features)
    ]
)

models = {
    'KNN': KNeighborsRegressor(),
    'LightGBM': LGBMRegressor()
}

# Create a pipeline for each model.
# Every pipeline receives its own unfitted copy of the preprocessor: placing
# the same ColumnTransformer instance in several pipelines would make them
# share fitted state, so fitting one pipeline would silently refit the
# transformer used by the others.
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

# The target is held as a single-column DataFrame; the estimators and
# f_regression expect a 1-D target, so flatten it once for fitting.
y_train_1d = np.ravel(y_train)

# Fit and evaluate each pipeline
results = {}
for model_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train_1d)  # Fit the pipeline
    y_pred = pipeline.predict(X_test)  # Make predictions
    mae = mean_absolute_error(y_test, y_pred)  # Calculate MAE
    results[model_name] = mae

# Print results
for model_name, mae in results.items():
    print(f"{model_name}: MAE = {mae}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000352 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 671
[LightGBM] [Info] Number of data points in the train set: 12219, number of used features: 122
[LightGBM] [Info] Start training from score 294.987233
KNN: MAE = 176.6712929623568
LightGBM: MAE = 258.11097601797223
