### Work in progress
Вариант с более тщательным отбором признаков для преобразования в унитарный код.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
import chime

In [2]:
%load_ext chime

In [3]:
class Concatenator:
    # Adding calendar and review data
    def __init__(self):
        self.calendar = pd.read_csv('calendar_availability.csv',
                                   index_col='listing_id')
        self.reviews = pd.read_csv('reviews_stat.csv',
                                  index_col='listing_id')
        
    def fit(self, df, y=None):
        result = df.copy().join([self.calendar, self.reviews], how='left')
        result.drop(columns='host_id', inplace=True)
        return result
    
    def transform(self, df, y=None):
        result = df.copy().join([self.calendar, self.reviews], how='left')
        result.drop(columns='host_id', inplace=True)
        return result

In [4]:
class PercentConvert:
    #Convert column with the percent sign
    def __init__(self):
        pass
    
    def fit(self, df, y=None):
        return self
    
    def transform(self, df, y=None):
        result = df.copy()
        result['host_response_rate'] = result['host_response_rate']\
            .str.replace(r'\D+', '', regex=True)\
            .astype('float')
        return result

In [5]:
class FeatureTransform:
    # Transform features depending on type
    def __init__(self, n):
        self.n = n
        self.uncorrelated_features = ['latitude', 'longitude',
                                      'available', 'comments',
                                      'canceled', 'notice',
                                      'host_response_rate']
        self.correlated_features = ['accommodates', 'bathrooms', 'beds','guests_included',
                                    'security_deposit','bedrooms','minimum_nights',
                                    'extra_people','square_feet','cleaning_fee']
        self.features = self.correlated_features + ['latitude', 'longitude']

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        df = X.copy()

        # Convert datetimes into numbers for further use as a numeric variable
        # This will apply to the host_since column
        date_features = df.select_dtypes(include='datetime').columns
        df[date_features] = df[date_features].astype(np.int64)
        
        # Identify and scale numeric features
        numeric_features = df.select_dtypes(include='number').columns
        df[numeric_features] = pd.DataFrame(
            StandardScaler().fit_transform(df[numeric_features]),
            columns=numeric_features,
            index=df.index
        )
        
        # Identify categorical features with more than n unique values
        # and convert each into a binary feature: data provided or not
        cat_features = df.select_dtypes(include=['object', 'bool']).nunique()
        long = cat_features[cat_features > self.n].index
        df[long] = df[long].isna()
        
        # Filling NaNs in uncorrelated features
        df[self.uncorrelated_features] = df[self.uncorrelated_features].fillna(0, axis=0)

        # Fill NaN in correlated features 
        # and one hot encode categorical features with unique values <= n
        short = cat_features[cat_features <= self.n].index
        
        # Добавил долготу и широту, чтобы не пропадали колонки
        impute_encode = ColumnTransformer(
            transformers = [
                ('corr', KNNImputer(n_neighbors=50), self.features),
                ('get_dummies', OneHotEncoder(sparse=False, drop=None, handle_unknown='ignore'), short)
            ],
            remainder='passthrough',
            n_jobs=-1
        )
         
        pipe_impute_encode = Pipeline([('impute_encode', impute_encode)])
        result = pipe_impute_encode.fit_transform(df)
        return result

In [6]:
train = pd.read_csv('train.csv',
                    index_col='id',
                    true_values=['t'],
                    false_values=['f'],
                    na_values='none',
                    parse_dates=['host_since'],
                   low_memory=False)
train

Unnamed: 0_level_0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20145777,Double in a friendly house,A double bedroom in a cozy and modern apartm...,,A double bedroom in a cozy and modern apartm...,,,,,,,...,,,,1,0.0,1,flexible,False,False,90.0
22630537,London flat with great transport links,"Light, airy and modern one bedroom flat 2 mins...",,"Light, airy and modern one bedroom flat 2 mins...",,,,,,,...,,,25.0,2,20.0,1,moderate,False,False,75.0
27355619,"Studio Apartments, 5 mins to Kings Cross Station!","Based in high quality student accommodation, o...",What To Expect: * 16/17m2 Studio with modern f...,"Based in high quality student accommodation, o...",,As one of Central London’s most vibrant and ne...,• Do you have free WiFi? Yes we have super fas...,You will be in an incredible position for gett...,"You will enjoy your own studio, with a private...",Our reception can help you out with anything y...,...,,80.0,45.0,1,0.0,3,flexible,False,False,89.0
21011236,"Big, Beautiful, Sunny, West Kensington TWIN Room","Big Beautiful, airy, West Kensington TWIN room...",This is a pretty terraced house in a great are...,"Big Beautiful, airy, West Kensington TWIN room...",,We’re in a fantastic location - close to many ...,We lay on a good breakfast that guests serve t...,,The room is exclusively yours while you're her...,I'm Matthew and I live here with my son Alex. ...,...,,,12.0,1,11.0,1,moderate,False,False,60.0
24754494,Dashing 1BR in Bayswater by Sonder,"At this Sonder, you'll love the chic decor, st...",Every booking is instantly confirmed. Every ca...,"At this Sonder, you'll love the chic decor, st...",,"Your Sonder is in a beautiful dwelling, conver...","This Sonder does not have air conditioning, bu...",Your Sonder is located at the end of a cul-de-...,,"Our concierge is available by phone, email, or...",...,,300.0,63.0,2,5.0,2,strict_14_with_grace_period,False,False,169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18151315,Sunny studio with private kitchen and bathroom,"Quiet, sunny studio flat just minutes from the...",A small and very cosy flat with all necessary ...,"Quiet, sunny studio flat just minutes from the...",,Crystal Palace is a vibrant bohemian oasis sur...,,"2 mins to Gipsy Hill Rail, direct links to Vi...",You have shared access through main front door...,I have an apartment manager who lives close by...,...,,,25.0,1,0.0,2,moderate,False,False,65.0
28105053,A comfy double room in modern flat ★ Haggerston,It's a double room for 2 guests.,,It's a double room for 2 guests. I will be at ...,,,,There is Haggerston overground station only 3 ...,,I will be at the Airbnb to greet you and check...,...,,0.0,10.0,1,10.0,2,strict_14_with_grace_period,False,False,31.0
26342100,Goldsborough House - Apartment London SW8,"Spacious three bedroom flat, fantastic locatio...",,"Spacious three bedroom flat, fantastic locatio...",,,,,,,...,,,,1,0.0,1,flexible,False,False,150.0
13833612,Modern and Luxurious 1 Bedroom Flat,"You’ll love my place because of the ambiance, ...",This is a beautiful and comfortable one bedroo...,"You’ll love my place because of the ambiance, ...",,The are is central and well connected. For pub...,I expect people to treat the flat with respect...,"Easy access to public transport (DLR, Overgrou...",Full access to all spaces. The flat is all you...,I am available to help via email / phone and c...,...,,80.0,25.0,1,0.0,4,moderate,False,False,105.0


In [7]:
X = train.drop(columns='price')
y = train.price

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size=0.2,
                                                  random_state=8,
                                                 shuffle=True)

In [9]:
X_val.shape

(7254, 41)

### Пропущенные значения

Для признаков, между которыми есть корреляция, использовал KNNImputer.
У независимых друг от друга признаков использовал среднее значение, равное 0, т.к. признаки стандартизованы. Сделал это зараннее в классе FeatureTransform, чтобы дать KNNImputer за что "зацепиться": KNN Imputer выкидывает колонки, если не может подобрать соседей, поэтому включил в список признаков долготу и широту (у них пропущенные значения уже заполнены). (Проблема с потерянными колонками возникает при преобразовании малых выборок на этапе тестирования.)

correlated_features = ['accommodates', 'bathrooms', 'beds','guests_included','security_deposit',
                       'bedrooms','minimum_nights','extra_people','square_feet','cleaning_fee']
features = correlated_features + ['latitude', 'longitude']

# Добавил долготу и широту, чтобы не пропадали колонки
impute_encode = ColumnTransformer(
    transformers = [
        ('corr', KNNImputer(n_neighbors=50), features)
    ],
    remainder=OneHotEncoder(sparse=False, handle_unknown='ignore'),
    n_jobs=-1
)


correlated_features = ['accommodates', 'bathrooms', 'beds','guests_included','security_deposit',
                       'bedrooms','minimum_nights','extra_people','square_feet','cleaning_fee']
features = correlated_features + ['latitude', 'longitude']

# Добавил долготу и широту, чтобы не пропадали колонки
impute_encode = ColumnTransformer(
    transformers = [
        ('get_dummies', OneHotEncoder(sparse=False, handle_unknown='ignore'), cols)
    ],
    
    n_jobs=-1
)


In [10]:
preprocess_steps = [('percent_convert', PercentConvert()),
                   ('join_data', Concatenator()),
                   ('transform', FeatureTransform(10))
                   ]
pipe_preprocess = Pipeline(preprocess_steps, verbose=True)

In [11]:
%%time
%%chime
X_train_transf = pipe_preprocess.fit_transform(X_train); X_train_transf

[Pipeline] ... (step 1 of 3) Processing percent_convert, total=   0.0s
[Pipeline] ......... (step 2 of 3) Processing join_data, total=   0.3s
[Pipeline] ......... (step 3 of 3) Processing transform, total= 1.1min
CPU times: total: 938 ms
Wall time: 1min 4s


In [12]:
X_train_transf.shape

(29016, 70)

In [13]:
X_train_transf

array([[0.3846094631742136, 1.1390736547809788, 0.1793403520770507, ...,
        -0.20051885555717758, -0.3364019047375185, 0.0],
       [0.3846094631742136, -0.47794391425224697, 0.1793403520770507,
        ..., -0.10061329694835491, 0.13439225549506964,
        -0.3359467322925266],
       [0.8842883466461836, 1.1390736547809788, 0.9930838600295439, ...,
        -0.050660517643943574, -0.3364019047375185, 0.0],
       ...,
       [-0.6147483037697266, -0.47794391425224697, -0.6344031558754425,
        ..., -0.3503771934704116, -0.3364019047375185, 0.0],
       [0.3846094631742136, -0.47794391425224697, 0.1793403520770507,
        ..., 2.7466951234030907, -0.3364019047375185, 0.0],
       [0.3846094631742136, -0.47794391425224697, 0.1793403520770507,
        ..., -0.4003299727748229, -0.3364019047375185, 0.0]], dtype=object)

In [14]:
%%time
%%chime
X_val_transf = pipe_preprocess.transform(X_val); X_val_transf

CPU times: total: 359 ms
Wall time: 4.6 s


In [15]:
X_val_transf.shape

(7254, 67)

# Выбор модели

## Предсказание по средней цене

In [16]:
y_val.describe()

count    7254.000000
mean      115.931624
std       175.953193
min         0.000000
25%        45.000000
50%        85.000000
75%       135.000000
max      7716.000000
Name: price, dtype: float64

In [17]:
(y_val - y_val.mean()).abs().mean()

74.18344137864436

## Линейная регрессия

In [18]:
params = {'alpha': np.arange(0.2, 1.0, 0.1).tolist() + np.arange(1.0, 10, 1).tolist(),
         'l1_ratio': [0.1, 0.5, 0.8, 0.95, 0.99, 1]}

In [19]:
regressor = ElasticNet(max_iter=1000,
                       random_state=8)
lin_reg = GridSearchCV(estimator = regressor,
                      param_grid = params,
                      scoring='neg_mean_absolute_error',
                      n_jobs=-1, 
                      refit=True,
                      cv=5)

In [20]:
%%time
%%chime
lin_reg.fit(X_train_transf, y_train)

CPU times: total: 1min 18s
Wall time: 1min 48s


In [21]:
lin_reg.best_score_

-48.952944071094954

In [22]:
lin_reg.best_params_

{'alpha': 2.0, 'l1_ratio': 0.95}

In [23]:
best_regressor = lin_reg.best_estimator_

In [24]:
y_pred_1 = best_regressor.predict(X_val_transf)

ValueError: X has 67 features, but ElasticNet is expecting 70 features as input.

In [18]:
tmp = OneHotEncoder(sparse=False, drop=None, handle_unknown='ignore').fit(X_train)

In [19]:
tmp2 = tmp.transform(X_val)

In [20]:
tmp2.shape

(7254, 306255)

In [12]:
uncorrelated_features = ['latitude', 'longitude']
                              
correlated_features = ['accommodates', 'bathrooms', 'beds','guests_included',
                            'security_deposit','bedrooms','minimum_nights',
                            'extra_people','square_feet','cleaning_fee']
features = correlated_features + ['latitude', 'longitude']
cat_features = X_train.select_dtypes(include=['object', 'bool']).nunique()
long = cat_features[cat_features > 10].index
X_train[long] = X_train[long].isna()

# Filling NaNs in uncorrelated features
X_train[uncorrelated_features] = X_train[uncorrelated_features].fillna(0, axis=0)

# Fill NaN in correlated features 
# and one hot encode categorical features with unique values <= n
short = cat_features[cat_features <= 10].index

# Добавил долготу и широту, чтобы не пропадали колонки
impute_encode = ColumnTransformer(
    transformers = [
        ('corr', KNNImputer(n_neighbors=50), features),
        ('get_dummies', OneHotEncoder(sparse=False, drop=None, handle_unknown='ignore'), short)
    ],
    remainder='passthrough',
    n_jobs=-1
)

In [14]:
pipe_impute_encode = Pipeline([('impute_encode', impute_encode)])
df1 = pipe_impute_encode.fit_transform(X_train)

In [16]:
df1.shape

(29016, 80)

In [17]:
%%chime
df2 = pipe_impute_encode.transform(X_val)

In [18]:
df2.shape

(7254, 80)

In [19]:
f = ['latitude', 'longitude',
                                      'available', 'comments',
                                      'canceled', 'notice'
                                      ]

In [20]:
X_train

Unnamed: 0_level_0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,amenities,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25761038,False,False,False,False,,False,False,False,False,False,...,False,,0.0,60.0,2,15.0,2,strict_14_with_grace_period,False,False
18440734,False,False,False,False,,True,True,True,True,True,...,False,,300.0,55.0,1,0.0,2,strict_14_with_grace_period,False,False
15457138,False,False,True,False,,False,True,False,True,False,...,False,,350.0,0.0,1,0.0,2,strict_14_with_grace_period,False,False
17555101,False,False,False,False,,True,True,False,True,True,...,False,,,,1,0.0,2,strict_14_with_grace_period,False,False
28234588,False,False,False,False,,True,True,True,True,True,...,False,,0.0,5.0,1,10.0,2,flexible,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13749393,False,False,False,False,,False,True,False,False,False,...,False,,500.0,30.0,1,15.0,4,moderate,False,False
27457076,False,False,False,False,,False,True,False,False,False,...,False,,,,1,0.0,4,strict_14_with_grace_period,False,False
25771345,False,False,False,False,,True,True,True,True,True,...,False,,,,1,0.0,1,flexible,False,False
19995526,False,False,False,False,,False,False,False,False,False,...,False,,0.0,17.0,1,0.0,2,strict_14_with_grace_period,False,False


In [23]:
X_train['experiences_offered'] = X_train['experiences_offered'].fillna(0, axis=0)
X_train

Unnamed: 0_level_0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,amenities,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25761038,False,False,False,False,0,False,False,False,False,False,...,False,,0.0,60.0,2,15.0,2,strict_14_with_grace_period,False,False
18440734,False,False,False,False,0,True,True,True,True,True,...,False,,300.0,55.0,1,0.0,2,strict_14_with_grace_period,False,False
15457138,False,False,True,False,0,False,True,False,True,False,...,False,,350.0,0.0,1,0.0,2,strict_14_with_grace_period,False,False
17555101,False,False,False,False,0,True,True,False,True,True,...,False,,,,1,0.0,2,strict_14_with_grace_period,False,False
28234588,False,False,False,False,0,True,True,True,True,True,...,False,,0.0,5.0,1,10.0,2,flexible,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13749393,False,False,False,False,0,False,True,False,False,False,...,False,,500.0,30.0,1,15.0,4,moderate,False,False
27457076,False,False,False,False,0,False,True,False,False,False,...,False,,,,1,0.0,4,strict_14_with_grace_period,False,False
25771345,False,False,False,False,0,True,True,True,True,True,...,False,,,,1,0.0,1,flexible,False,False
19995526,False,False,False,False,0,False,False,False,False,False,...,False,,0.0,17.0,1,0.0,2,strict_14_with_grace_period,False,False
