# Соревнование для учащихся курса "Специалист по работе с данными / Data Scientist". Школа 21 СБЕР / Томский государственный университет

https://www.kaggle.com/competitions/sber-21-tgu-2022/data

Задача - предсказать стоимость аренды квартир в Лондоне. Метрика - MAE.

Предварительный анализ данных см. в ноутбуке EDA.ipynb

# Дополнительный отбор признаков

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
import chime

In [2]:
%load_ext chime

In [38]:
class Preprocess:
    """Join auxiliary tables onto the listings and scale / one-hot encode them.

    ``fit`` detects the date, numeric and categorical columns and fits a
    ``ColumnTransformer`` on them; ``transform`` replays the same steps on new
    rows. Both methods read ``calendar_availability.csv`` and
    ``reviews_stat.csv`` from the current working directory on every call.

    Parameters
    ----------
    max_categories : int, default=6
        Passed to ``OneHotEncoder(max_categories=...)``.
    n : int, default=10
        Categorical columns with more than ``n`` distinct values are replaced
        by a boolean "value is missing" indicator instead of being encoded
        by their values.

    Attributes set by ``fit``
    -------------------------
    date_features, numeric_features, cat_features : list of str
        Column names detected by dtype.
    long : pandas.Index
        High-cardinality categorical columns replaced by missing-indicators.
    scale_encode : ColumnTransformer
        The fitted scaler / encoder.
    """

    def __init__(self, max_categories=6, n=10):
        self.n = n
        self.max_categories = max_categories
        # NOTE: the two lists below are stored but not read anywhere in this
        # class; they are kept because they are public attributes.
        self.uncorrelated_features = ['latitude', 'longitude',
                                      'available', 'comments',
                                      'canceled', 'notice',
                                      'host_response_rate', 'host_since']
        self.correlated_features = ['accommodates', 'bathrooms', 'beds',
                                    'guests_included', 'security_deposit',
                                    'bedrooms', 'minimum_nights',
                                    'extra_people', 'square_feet',
                                    'cleaning_fee']

    def _load_and_clean(self, X):
        """Return a new frame: X joined with the auxiliary tables and cleaned."""
        calendar = pd.read_csv('calendar_availability.csv',
                               index_col='listing_id')
        reviews = pd.read_csv('reviews_stat.csv', index_col='listing_id')

        # ``join`` and ``drop`` both return new frames, so X is never mutated
        # and no explicit copy is needed.
        df = X.join([calendar, reviews], how='left').drop(columns='host_id')

        # Strip every non-digit character (e.g. the percent sign) and store
        # the response rate as a float.
        df['host_response_rate'] = (
            df['host_response_rate']
            .str.replace(r'\D+', '', regex=True)
            .astype('float')
        )
        return df

    def _make_encoder(self):
        """Build the dense one-hot encoder across scikit-learn versions."""
        options = dict(drop='first',
                       handle_unknown='infrequent_if_exist',
                       min_frequency=2,
                       max_categories=self.max_categories)
        try:
            # scikit-learn >= 1.2 spells the dense-output switch this way.
            return OneHotEncoder(sparse_output=False, **options)
        except TypeError:
            # Older releases only know the original ``sparse`` keyword.
            return OneHotEncoder(sparse=False, **options)

    def fit(self, X, y=None):
        """Learn the column roles from X and fit the scaler / encoder.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw listings; must contain ``host_id`` and ``host_response_rate``.
        y : ignored

        Returns
        -------
        self
        """
        df = self._load_and_clean(X)

        # Convert datetimes into integers so they can be scaled like any
        # other numeric column. This must happen before the numeric columns
        # are detected, otherwise the date columns would be left out.
        self.date_features = (df.select_dtypes(include='datetime')
                              .columns.to_list())
        df[self.date_features] = df[self.date_features].astype(np.int64)

        self.numeric_features = (df.select_dtypes(include='number')
                                 .columns.to_list())
        cat_columns = df.select_dtypes(include=['object', 'bool'])
        self.cat_features = cat_columns.columns.to_list()

        # Categorical features with more than n unique values become a binary
        # feature: value missing or not. They stay in ``cat_features`` and are
        # one-hot encoded as booleans.
        self.long = cat_columns.columns[cat_columns.nunique() > self.n]
        df[self.long] = df[self.long].isna()

        self.scale_encode = ColumnTransformer(
            transformers=[
                ('scale', StandardScaler(), self.numeric_features),
                ('get_dummies', self._make_encoder(), self.cat_features),
            ],
            remainder='passthrough',
            n_jobs=-1,
        ).fit(df)

        return self

    def transform(self, X, y=None):
        """Apply the fitted preprocessing to X.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw listings with the same columns that were passed to ``fit``.
        y : ignored

        Returns
        -------
        The output of ``ColumnTransformer.transform`` for the prepared frame.
        """
        df = self._load_and_clean(X)

        # Replay the conversions decided during ``fit``.
        df[self.date_features] = df[self.date_features].astype(np.int64)
        df[self.long] = df[self.long].isna()

        return self.scale_encode.transform(df)

In [15]:
# Load the training table indexed by listing id. 't'/'f' are read as booleans,
# the literal 'none' as missing, and host_since is parsed as a datetime.
train = pd.read_csv(
    'train.csv',
    index_col='id',
    parse_dates=['host_since'],
    true_values=['t'],
    false_values=['f'],
    na_values='none',
    low_memory=False,
)

In [5]:
# Separate the target (price) from the feature columns.
y = train['price']
X = train.drop(columns=['price'])

In [6]:
# Hold out 20% of the rows for validation; the seed keeps the shuffled split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=8,
)

In [7]:
X_val.shape

(7254, 41)

In [39]:
%%time
%%chime
preprocess = Preprocess().fit(X_train)

CPU times: total: 719 ms
Wall time: 55.3 s


In [40]:
%%time
%%chime
X_train_transf = preprocess.transform(X_train)

CPU times: total: 594 ms
Wall time: 27.6 s


In [41]:
X_train_transf.shape

(29016, 55)

In [42]:
%%time
%%chime
X_val_transf = preprocess.transform(X_val)

CPU times: total: 297 ms
Wall time: 1.96 s


In [43]:
X_val_transf.shape

(7254, 55)

In [44]:
# Cache the transformed features and the targets on disk (np.save writes
# <name>.npy) so the preprocessing does not have to be re-run.
for file_stem, values in [
    ('X_train_transf', X_train_transf),
    ('X_val_transf', X_val_transf),
    ('y_train', y_train),
    ('y_val', y_val),
]:
    np.save(file_stem, values)

In [10]:
# Reload the cached arrays. The .npy files hold only raw values, so the
# targets are wrapped back into pandas Series: later cells call Series-only
# methods on y_val (.describe(), .abs()) that a bare ndarray does not have.
# The original 'id' index is not stored in the files and is not restored.
X_train_transf = np.load('X_train_transf.npy')
X_val_transf = np.load('X_val_transf.npy')
y_train = pd.Series(np.load('y_train.npy'), name='price')
y_val = pd.Series(np.load('y_val.npy'), name='price')

## Предсказание по средней цене

In [45]:
y_val.describe()

count    7254.000000
mean      115.931624
std       175.953193
min         0.000000
25%        45.000000
50%        85.000000
75%       135.000000
max      7716.000000
Name: price, dtype: float64

In [46]:
# Baseline: MAE of a constant prediction equal to the mean validation price.
baseline_price = y_val.mean()
baseline_abs_errors = (y_val - baseline_price).abs()
baseline_abs_errors.mean()

74.18344137864436

## Градиентный бустинг

In [47]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [48]:
# Coarse hyper-parameter grid for the first search pass.
params = dict(
    max_iter=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 50, 70],
    l2_regularization=[0.0, 0.2, 0.4],
)

In [49]:
# Gradient boosting trained on the absolute-error loss (the competition
# metric is MAE), with early stopping monitored on the loss over an internal
# 10% validation fraction.
regressor = HistGradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.1,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    random_state=8,
)

# 5-fold grid search over `params`, scored by negative MAE; the best
# configuration is refitted on all the data passed to fit().
boost = GridSearchCV(
    regressor,
    params,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=4,
)

In [50]:
%%time
%%chime
boost.fit(X_train_transf, y_train)

CPU times: total: 13.4 s
Wall time: 6min 6s


In [51]:
boost.best_score_

-34.10179694990254

In [52]:
boost.best_params_

{'l2_regularization': 0.2, 'max_depth': 10, 'max_iter': 400}

Сужаю сетку поиска

In [53]:
# Narrowed hyper-parameter grid for the second search pass.
params = dict(
    max_iter=[350, 400, 450],
    max_depth=[15, 20, 30, 40],
    l2_regularization=[0.1, 0.2, 0.3],
)

In [54]:
# Same estimator settings as in the first pass; the search object is rebuilt
# so that it picks up the narrowed `params` grid.
regressor = HistGradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.1,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    random_state=8,
)

# 5-fold grid search scored by negative MAE, refitting the best configuration.
boost = GridSearchCV(
    regressor,
    params,
    scoring='neg_mean_absolute_error',
    cv=5,
    refit=True,
    n_jobs=4,
)

In [55]:
%%time
%%chime
boost.fit(X_train_transf, y_train)

CPU times: total: 14.9 s
Wall time: 4min 14s


In [56]:
boost.best_score_

-34.01911797066337

In [57]:
boost.best_params_

{'l2_regularization': 0.1, 'max_depth': 15, 'max_iter': 450}

Результаты немного хуже, чем у того же метода с предварительной подстановкой пропущенных значений: MAE 34.04 против 33.73, но стандартное отклонение кросс-валидации 1.90 говорит о том, что изменение несущественное. Тем не менее буду использовать этот вариант, т.к. он проще.