# Соревнование для учащихся курса "Специалист по работе с данными / Data Scientist". Школа 21 СБЕР / Томский государственный университет

https://www.kaggle.com/competitions/sber-21-tgu-2022/data

Задача - предсказать стоимость аренды квартир в Лондоне. Метрика - MAE.

Предварительный анализ данных см. в ноутбуке EDA.ipynb

In [24]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import chime
from joblib import dump, load

In [2]:
%load_ext chime

In [17]:
class Preprocess:
    """Feature-preparation step for the rental-price model.

    Joins the listing table with the per-listing calendar and review
    statistics (read from ``calendar_availability.csv`` and
    ``reviews_stat.csv`` in the working directory), turns the percentage and
    date columns into numbers, then standardises the numeric columns and
    one-hot encodes the object/bool columns.

    Parameters
    ----------
    max_categories : int, default=6
        Forwarded to ``OneHotEncoder(max_categories=...)``.
    """

    def __init__(self, max_categories=6):
        self.max_categories = max_categories
        # Column groups kept as attributes; fit/transform below do not read them.
        self.uncorrelated_features = ['latitude', 'longitude',
                                      'available', 'comments',
                                      'canceled', 'notice',
                                      'host_response_rate', 'host_since']
        self.correlated_features = ['accommodates', 'bathrooms', 'beds', 'guests_included',
                                    'security_deposit', 'bedrooms', 'minimum_nights',
                                    'extra_people', 'square_feet', 'cleaning_fee']

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict.

        Provided so that scikit-learn utilities which introspect estimators
        (``clone``, ``Pipeline.get_params``) can see ``max_categories`` and
        rebuild an unfitted copy from it. ``deep`` is accepted for signature
        compatibility and ignored because there are no nested estimators
        among the parameters.
        """
        return {'max_categories': self.max_categories}

    def set_params(self, max_categories):
        """Set ``max_categories`` and return ``self`` (takes effect on the next fit)."""
        self.max_categories = max_categories
        return self

    @staticmethod
    def _percent_to_float(column):
        """Convert a column such as ``'95%'`` to float, keeping missing values as NaN."""
        # A column that pandas already parsed as numeric (e.g. entirely empty)
        # has no .str accessor, so it is only cast.
        if pd.api.types.is_numeric_dtype(column):
            return column.astype('float')
        digits = column.str.replace(r'\D+', '', regex=True)
        # errors='coerce': a value left with no digits becomes NaN instead of
        # aborting the whole conversion.
        return pd.to_numeric(digits, errors='coerce').astype('float')

    @staticmethod
    def _dates_to_float(df, date_features):
        """Replace each date column of *df* (in place) by nanoseconds since the epoch.

        The result is float so that a missing date stays NaN; an integer cast
        has no way to represent a missing date (NaT).
        """
        epoch = pd.Timestamp(0)
        one_ns = pd.Timedelta(1, unit='ns')
        for col in date_features:
            df[col] = (pd.to_datetime(df[col]) - epoch) / one_ns
        return df

    def _merge_and_clean(self, X):
        """Shared first stage of fit and transform: join side tables and clean columns."""
        # Merge with other files
        calendar = pd.read_csv('calendar_availability.csv', index_col='listing_id')
        reviews = pd.read_csv('reviews_stat.csv', index_col='listing_id')
        df = X.copy().join([calendar, reviews], how='left')
        df.drop(columns='host_id', inplace=True)

        # Convert the column with the percent sign
        df['host_response_rate'] = self._percent_to_float(df['host_response_rate'])
        return df

    def fit(self, X, y=None):
        """Learn the column layout, scaling statistics and category sets from *X*.

        Parameters
        ----------
        X : pandas.DataFrame
            Listing table; must contain ``host_id`` and ``host_response_rate``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        df = self._merge_and_clean(X)

        # Convert datetimes into numbers for further use as a numeric variable.
        # The columns are remembered so transform converts the same ones.
        self.date_features = df.select_dtypes(include='datetime').columns.to_list()
        self._dates_to_float(df, self.date_features)

        self.numeric_features = df.select_dtypes(include='number').columns.to_list()
        self.cat_features = df.select_dtypes(include=['object', 'bool']).columns.to_list()

        encoder_options = dict(drop='first',
                               handle_unknown='infrequent_if_exist',
                               min_frequency=2,
                               max_categories=self.max_categories)
        try:
            encoder = OneHotEncoder(sparse_output=False, **encoder_options)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, **encoder_options)

        self.scale_encode = ColumnTransformer(
            transformers=[
                ('scale', StandardScaler(), self.numeric_features),
                ('get_dummies', encoder, self.cat_features),
            ],
            remainder='passthrough',
            n_jobs=-1
        ).fit(df)

        return self

    def transform(self, X, y=None):
        """Apply the fitted preparation to *X* and return the encoded feature matrix.

        Must be called after ``fit``; uses the date columns and the fitted
        ``ColumnTransformer`` stored there.
        """
        df = self._merge_and_clean(X)
        self._dates_to_float(df, self.date_features)
        return self.scale_encode.transform(df)

In [4]:
# Load the training listings: 't'/'f' become booleans, the literal 'none'
# counts as missing, and host_since is parsed as a date.
train = pd.read_csv(
    'train.csv',
    index_col='id',
    parse_dates=['host_since'],
    na_values='none',
    true_values=['t'],
    false_values=['f'],
    low_memory=False,
)
train

Unnamed: 0_level_0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20145777,Double in a friendly house,A double bedroom in a cozy and modern apartm...,,A double bedroom in a cozy and modern apartm...,,,,,,,...,,,,1,0.0,1,flexible,False,False,90.0
22630537,London flat with great transport links,"Light, airy and modern one bedroom flat 2 mins...",,"Light, airy and modern one bedroom flat 2 mins...",,,,,,,...,,,25.0,2,20.0,1,moderate,False,False,75.0
27355619,"Studio Apartments, 5 mins to Kings Cross Station!","Based in high quality student accommodation, o...",What To Expect: * 16/17m2 Studio with modern f...,"Based in high quality student accommodation, o...",,As one of Central London’s most vibrant and ne...,• Do you have free WiFi? Yes we have super fas...,You will be in an incredible position for gett...,"You will enjoy your own studio, with a private...",Our reception can help you out with anything y...,...,,80.0,45.0,1,0.0,3,flexible,False,False,89.0
21011236,"Big, Beautiful, Sunny, West Kensington TWIN Room","Big Beautiful, airy, West Kensington TWIN room...",This is a pretty terraced house in a great are...,"Big Beautiful, airy, West Kensington TWIN room...",,We’re in a fantastic location - close to many ...,We lay on a good breakfast that guests serve t...,,The room is exclusively yours while you're her...,I'm Matthew and I live here with my son Alex. ...,...,,,12.0,1,11.0,1,moderate,False,False,60.0
24754494,Dashing 1BR in Bayswater by Sonder,"At this Sonder, you'll love the chic decor, st...",Every booking is instantly confirmed. Every ca...,"At this Sonder, you'll love the chic decor, st...",,"Your Sonder is in a beautiful dwelling, conver...","This Sonder does not have air conditioning, bu...",Your Sonder is located at the end of a cul-de-...,,"Our concierge is available by phone, email, or...",...,,300.0,63.0,2,5.0,2,strict_14_with_grace_period,False,False,169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18151315,Sunny studio with private kitchen and bathroom,"Quiet, sunny studio flat just minutes from the...",A small and very cosy flat with all necessary ...,"Quiet, sunny studio flat just minutes from the...",,Crystal Palace is a vibrant bohemian oasis sur...,,"2 mins to Gipsy Hill Rail, direct links to Vi...",You have shared access through main front door...,I have an apartment manager who lives close by...,...,,,25.0,1,0.0,2,moderate,False,False,65.0
28105053,A comfy double room in modern flat ★ Haggerston,It's a double room for 2 guests.,,It's a double room for 2 guests. I will be at ...,,,,There is Haggerston overground station only 3 ...,,I will be at the Airbnb to greet you and check...,...,,0.0,10.0,1,10.0,2,strict_14_with_grace_period,False,False,31.0
26342100,Goldsborough House - Apartment London SW8,"Spacious three bedroom flat, fantastic locatio...",,"Spacious three bedroom flat, fantastic locatio...",,,,,,,...,,,,1,0.0,1,flexible,False,False,150.0
13833612,Modern and Luxurious 1 Bedroom Flat,"You’ll love my place because of the ambiance, ...",This is a beautiful and comfortable one bedroo...,"You’ll love my place because of the ambiance, ...",,The are is central and well connected. For pub...,I expect people to treat the flat with respect...,"Easy access to public transport (DLR, Overgrou...",Full access to all spaces. The flat is all you...,I am available to help via email / phone and c...,...,,80.0,25.0,1,0.0,4,moderate,False,False,105.0


In [5]:
# Separate the target (price) from the predictor columns.
y = train['price']
X = train.drop(columns=['price'])

In [6]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=8
)

In [7]:
# Size of the hold-out split as (rows, columns).
X_val.shape

(7254, 41)

In [29]:
%%time
%%chime
# Fit the preprocessing step on the training split only.
preprocess = Preprocess().fit(X_train)

CPU times: total: 953 ms
Wall time: 2min 19s


In [30]:
%%time
%%chime
# Encode the training split with the fitted preprocessing step.
X_train_transf = preprocess.transform(X_train)

CPU times: total: 1.17 s
Wall time: 1min 10s


In [31]:
# Shape of the encoded training matrix as (rows, features).
X_train_transf.shape

(29016, 118)

In [11]:
%%time
%%chime
# Encode the validation split with the preprocessing fitted on the training split.
X_val_transf = preprocess.transform(X_val)

CPU times: total: 703 ms
Wall time: 5.59 s


In [32]:
# Shape of the encoded validation matrix; X_val_transf must already exist in
# the current session (created by preprocess.transform or loaded from disk).
X_val_transf.shape

NameError: name 'X_val_transf' is not defined

In [13]:
# Cache the encoded matrices and the targets on disk so the slow
# preprocessing does not have to be repeated.
np.save('y_train', y_train)
np.save('y_val', y_val)
np.save('X_train_transf', X_train_transf)
np.save('X_val_transf', X_val_transf)

In [10]:
# Restore the cached matrices and targets; np.load returns plain NumPy
# arrays, so the targets come back without their pandas index.
y_train = np.load('y_train.npy')
y_val = np.load('y_val.npy')
X_train_transf = np.load('X_train_transf.npy')
X_val_transf = np.load('X_val_transf.npy')

## Предсказание по средней цене

In [16]:
# Summary statistics of the validation target.
y_val.describe()

count    7254.000000
mean      115.931624
std       175.953193
min         0.000000
25%        45.000000
50%        85.000000
75%       135.000000
max      7716.000000
Name: price, dtype: float64

In [17]:
# Baseline: MAE obtained by predicting one constant price for every listing.
# NOTE(review): the constant is the mean of y_val itself, i.e. it is computed
# from the very labels being scored; a mean (or, for MAE, a median) taken from
# y_train would give a leak-free baseline — confirm which one is intended.
(y_val - y_val.mean()).abs().mean()

74.18344137864436

## Градиентный бустинг

In [8]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [17]:
# Coarse search grid: boosting-iteration cap, tree depth and L2 penalty.
params = dict(
    max_iter=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 50, 70],
    l2_regularization=[0.0, 0.2, 0.4],
)

In [18]:
# Base model: histogram gradient boosting optimising absolute error directly.
# When early stopping is active it monitors the loss on an internal 10%
# validation split and stops after 10 iterations without improvement.
regressor = HistGradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.1,
    random_state=8,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
)
# Exhaustive 5-fold search over `params`, ranked by negated MAE; the best
# candidate is refitted on everything passed to fit().
boost = GridSearchCV(
    regressor,
    params,
    cv=5,
    scoring='neg_mean_absolute_error',
    refit=True,
    n_jobs=4,
)

In [19]:
%%time
%%chime
# Run the grid search on the already-encoded training matrix.
boost.fit(X_train_transf, y_train)

CPU times: total: 15.4 s
Wall time: 12min 13s


In [20]:
# Best mean cross-validated score (negated MAE, so closer to zero is better).
boost.best_score_

-33.73235909480901

In [21]:
# Hyper-parameter combination that achieved the best score.
boost.best_params_

{'l2_regularization': 0.2, 'max_depth': 20, 'max_iter': 400}

Сужаю сетку параметров

In [22]:
# Finer grid centred on the best point of the coarse search.
params = dict(
    max_iter=[350, 400, 450],
    max_depth=[15, 20, 30, 40],
    l2_regularization=[0.1, 0.2, 0.3],
)

In [23]:
# Same base regressor as in the coarse search (absolute-error loss, fixed
# seed, loss-based early-stopping settings); only the grid in `params` differs.
regressor = HistGradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.1,
    random_state=8,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
)
# 5-fold grid search scored by negated MAE, refitting the winner at the end.
boost = GridSearchCV(
    regressor,
    params,
    cv=5,
    scoring='neg_mean_absolute_error',
    refit=True,
    n_jobs=4,
)

In [24]:
%%time
%%chime
# Run the narrowed grid search on the encoded training matrix.
boost.fit(X_train_transf, y_train)

CPU times: total: 15 s
Wall time: 8min 27s


In [25]:
# Best mean cross-validated score of the narrowed search (negated MAE).
boost.best_score_

-33.73235909480901

In [26]:
# Best hyper-parameters found by the narrowed search.
boost.best_params_

{'l2_regularization': 0.2, 'max_depth': 20, 'max_iter': 350}

In [9]:
# Regressor configured with the hyper-parameters selected by the grid search
# (l2_regularization=0.2, max_depth=20, max_iter=350); other settings unchanged.
best_estimator = HistGradientBoostingRegressor(
    loss='absolute_error',
    learning_rate=0.1,
    max_iter=350,
    max_depth=20,
    l2_regularization=0.2,
    random_state=8,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
)

Окончательный конвейер, включающий подготовку данных и регрессионную модель

In [18]:
# End-to-end model: raw listing frame -> Preprocess -> boosted regressor.
pipe = Pipeline(steps=[('prep', Preprocess()), ('regress', best_estimator)])

### Пробую менять параметр max_categories в обработке данных.

In [19]:
# Candidate category caps for the one-hot encoder inside the 'prep' step.
params = dict(prep__max_categories=[6, 20, 40])

In [20]:
# 5-fold grid search over the whole pipeline, scored by negated MAE; the
# preprocessing step is therefore refitted for every candidate and fold.
pipe_grid = GridSearchCV(
    pipe,
    params,
    cv=5,
    scoring='neg_mean_absolute_error',
    refit=True,
    n_jobs=4,
)

In [21]:
%%time
%%chime
# Search on the raw training frame; the pipeline does its own preprocessing.
pipe_grid.fit(X_train, y_train)

CPU times: total: 22.2 s
Wall time: 16min 4s


In [22]:
# Best mean cross-validated score of the pipeline search (negated MAE).
pipe_grid.best_score_

-33.81402176681601

In [23]:
# Value of prep__max_categories that scored best.
pipe_grid.best_params_

{'prep__max_categories': 6}

Улучшения нет

# Окончательная модель и прогноз

In [34]:
%%time
%%chime
# Fit the full pipeline (preprocessing + regressor) on the training split.
pipe.fit(X_train, y_train)

CPU times: total: 15.5 s
Wall time: 3min 31s


In [35]:
# Predict prices for the hold-out validation split.
y_pred_val = pipe.predict(X_val)

In [36]:
# Hold-out MAE of the fitted pipeline (the competition metric).
mean_absolute_error(y_val, y_pred_val)

33.678846077603616

In [43]:
# Persist the fitted pipeline to disk with joblib.
dump(pipe, 'best_model.joblib')

['best_model.joblib']

### Прогноз на тестовой выборке

In [44]:
# Load the test listings with the same parsing options as the training file:
# 't'/'f' become booleans, 'none' counts as missing, host_since is a date.
X_test = pd.read_csv(
    'test.csv',
    index_col='id',
    parse_dates=['host_since'],
    na_values='none',
    true_values=['t'],
    false_values=['f'],
    low_memory=False,
)
X_test

Unnamed: 0_level_0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,amenities,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24366684,"Luxurious 2 Bed by Kings Road, Chelsea with Ga...",This stunning newly-refurbished 2 bedroom Chel...,Amenities: - 24/7 guest support - Professiona...,This stunning newly-refurbished 2 bedroom Chel...,,"Two long high streets, King's Road and Fulham ...",,The flat is just a 5 minute walk from Sloane S...,Guests will have access to the entire property...,My support team or I will be contactable by em...,...,"{TV,Wifi,Kitchen,Heating,""Family/kid friendly""...",,85.0,80.0,2,15.0,2,strict_14_with_grace_period,False,False
20329256,Fulham Chelsea Great Studio! HR1a,My place is close to Chelsea Football Ground. ...,cosy self contained studio - close to public t...,My place is close to Chelsea Football Ground. ...,,"Fulham is a cool, vibrant area with great shop...",,very close to Fulham Broadway tube & buses,,happy to help,...,"{TV,Wifi,Kitchen,""Paid parking off premises"",H...",,,,1,0.0,2,flexible,False,False
27087563,Smart Spacious Double Room with Kitchenette & ...,A spacious double room with your own private b...,A bright sunny double room with a private bath...,A spacious double room with your own private b...,,"The house located in zone 2, in a quiet reside...",I have other listings in this house and you ha...,You can walk to Canary Wharf - 15 minutes. Pub...,"Apart from your own room, there is also a lar...",We work and live locally and are always availa...,...,"{TV,Internet,Wifi,Kitchen,""Buzzer/wireless int...",,100.0,25.0,1,8.0,4,moderate,False,False
21995708,Beautiful 1 bedroom garden flat with lovely cat,Beautifully decorated and peaceful 1 bedroom f...,,Beautifully decorated and peaceful 1 bedroom f...,,The house is at the heart of London's next up ...,You will be sharing the space with our cat Kal...,15 minute walk to the Central Line and 20 minu...,,,...,"{Wifi,Kitchen,Heating,""Family/kid friendly"",""S...",,0.0,30.0,1,0.0,3,moderate,False,False
19372467,"The Bolt Hole [440a, Fulham Road]",The Bolt Hole is a delightful 2 bedroom house ...,"An unusual house, built in the 1960's, The Bol...",The Bolt Hole is a delightful 2 bedroom house ...,,The Bolt Hole is located just off Fulham Broad...,,,Guests are free to use the washing machine and...,,...,"{TV,Wifi,Kitchen,""Free parking on premises"",He...",,,,1,0.0,2,flexible,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19819730,"Discover Shoreditch & Hackney ""Pear""",**** Instagram @host.tay **** The Old Nags Hea...,The apartment has recently been renovated so e...,**** Instagram @host.tay **** The Old Nags Hea...,,The apartment is located in Bethnal Green a vi...,,The room is in the perfect location for public...,Guests can check in anytime after 2pm. At 2pm...,I'll be at the apartment most days from 11am t...,...,"{Internet,Wifi,Kitchen,Heating,""Family/kid fri...",,0.0,27.0,2,15.0,2,strict_14_with_grace_period,False,False
27683820,56 Haldane Rd,Hello! I am the owner Erin J. Buchholz. Welcom...,,Hello! I am the owner Erin J. Buchholz. Welcom...,,,,,,,...,"{TV,Wifi,""Air conditioning"",Kitchen,""Free park...",,,,1,0.0,1,flexible,False,False
21011115,A bright 2 bedroom flat in Islington,"A sunny family home with a garden, 2 double be...","Our flat is homey, quiet and has a lovely priv...","A sunny family home with a garden, 2 double be...",,"The flat is on a residential street, off the m...",,We are very well served for public transport t...,You will have full use of the entire flat incl...,We will contact you before your arrival to arr...,...,"{TV,Wifi,Kitchen,Heating,""Family/kid friendly""...",,198.0,45.0,2,10.0,2,moderate,False,False
29033913,Amazing Apartment Near Hyde Park & Oxford Street,,,,,,,,,,...,"{TV,""Cable TV"",Wifi,Kitchen,""Pets allowed"",Ele...",,200.0,50.0,1,0.0,2,strict_14_with_grace_period,False,False


In [47]:
%%time
%%chime
# Predict prices for the test listings with the fitted pipeline.
y_pred_test = pipe.predict(X_test)

CPU times: total: 1.47 s
Wall time: 21.1 s


In [48]:
# Wrap the raw predictions in a one-column frame keyed by the listing id.
y_pred_test = pd.DataFrame({'price': y_pred_test}, index=X_test.index)
y_pred_test

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
24366684,161.703510
20329256,76.553143
27087563,45.162014
21995708,60.841133
19372467,125.670140
...,...
19819730,41.977211
27683820,120.984147
21011115,100.427615
29033913,173.214637


In [49]:
# Write the predictions, together with their index, to the submission file.
y_pred_test.to_csv('submission.csv')