In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR 

from sklearn.ensemble import StackingRegressor, VotingRegressor, RandomForestRegressor

In [2]:
# Raw listings: `train` carries the target column, `test` is the unlabeled set.
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')


def add_engineered_features(listings):
    """Return a copy of ``listings`` with the derived modelling columns added.

    The same derivation must be applied to every frame that is later passed to
    the fitted pipelines; otherwise the model sees differently-built features
    at prediction time than it saw during training.

    Columns written:
      * ``amenities_empty`` - whether the ``amenities`` text contains ``[]``
        (missing values stay missing), stored as ``object``.
      * ``bathrooms`` - first integer or decimal number found in
        ``bathrooms_text`` as a float; NaN when no number is present.
      * ``type`` - ``'entire'`` if ``property_type`` mentions "entire",
        else ``'room'`` if it mentions "room", else ``'other'``.

    Assumes ``listings`` has the ``amenities``, ``bathrooms_text`` and
    ``property_type`` columns.
    """
    listings = listings.copy()

    # Raw string: "\[" is not a valid escape in an ordinary string literal.
    listings['amenities_empty'] = listings["amenities"].str.contains(r"\[\]").astype('object')

    # expand=False yields a Series (one capture group) instead of a 1-column frame.
    listings['bathrooms'] = (
        listings['bathrooms_text']
        .str.extract(r'(\d+\.\d+|\d+)', expand=False)
        .astype('float')
    )

    listings['type'] = 'other'
    property_type = listings['property_type']
    # na=False keeps the masks strictly boolean so .loc does not raise on
    # missing property types. 'entire' is assigned last so it wins over 'room'.
    listings.loc[property_type.str.contains('room', case=False, na=False), 'type'] = 'room'
    listings.loc[property_type.str.contains('Entire', case=False, na=False), 'type'] = 'entire'

    return listings


train = add_engineered_features(train)
test = add_engineered_features(test)

In [3]:
# Target: selected with a list so it stays a one-column DataFrame.
y = train[['price']]

# Predictors fed to the preprocessing pipelines. Commented-out entries are
# candidates that are currently left out of the model.
feature_columns = [
    # 'host_location',            # candidate
    # 'host_response_time',       # candidate
    'host_acceptance_rate',
    'host_is_superhost',
    'neighbourhood_cleansed',
    'property_type',              # alternative: 'room_type'
    'accommodates',
    'bathrooms',
    'beds',                       # deliberately not 'bedrooms'
    'amenities',                  # ideas: has-any-amenities flag, pool / hot tub / pets
    # 'maximum_maximum_nights',
    'maximum_nights',
    'number_of_reviews',
    # 'number_of_reviews_ltm',    # still to be checked
    'instant_bookable',
]
X = train[feature_columns]

In [4]:
# Hold out 20% of the labeled rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=307
)

# Columns with the generic ``object`` dtype go to the categorical branch of
# the preprocessors, every other column goes to the numeric branch.
is_object_column = X_train.dtypes == object
cat_features = X_train.columns[is_object_column]
num_features = X_train.columns[~is_object_column]


In [5]:
# --- Support-vector regression pipeline ---------------------------------

# Numeric branch: mean-impute, expand to degree-2 polynomial terms, then scale.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
])

# Categorical branch: mode-impute, one-hot encode (categories unseen during
# fit are encoded as all zeros), then keep the top half of the encoded
# columns as ranked by the univariate F-test.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ("selector", SelectPercentile(f_regression, percentile=50)),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])

svr_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", SVR()),
])

In [None]:
# --- Decision-tree pipeline ----------------------------------------------

# Numeric branch: median-impute, degree-2 polynomial expansion, scaling, then
# keep the top 40% of the resulting columns by univariate F-test.
numeric = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('standardize', StandardScaler()),
    ('percent', SelectPercentile(f_regression, percentile=40)),
])

# Categorical branch: mode-impute, one-hot encode, keep the top 60% of the
# encoded columns by univariate F-test.
categorical = Pipeline([
    ('impute2', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('percent', SelectPercentile(f_regression, percentile=60)),
])

preprocessor = ColumnTransformer([
    ("numeric", numeric, num_features),
    ("categorical", categorical, cat_features),
])

# Seeded so the fitted tree is the same on every run.
tree_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=486)),
])

In [None]:
# --- k-nearest-neighbours pipeline ---------------------------------------

# Numeric branch: median-impute, degree-2 polynomial expansion, scaling.
# Unlike the tree pipeline, no percentile-based selection is applied here.
numeric = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('standardize', StandardScaler()),
])

# Categorical branch: mode-impute, one-hot encode, keep the top 60% of the
# encoded columns by univariate F-test.
categorical = Pipeline([
    ('impute2', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('percent', SelectPercentile(f_regression, percentile=60)),
])

preprocessor = ColumnTransformer([
    ("numeric", numeric, num_features),
    ("categorical", categorical, cat_features),
])

# Each prediction averages the targets of the 3 nearest training rows.
knn_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=3)),
])


In [None]:
# --- Random-forest pipeline ----------------------------------------------

# Numeric branch: median-impute, degree-2 polynomial expansion, scaling, then
# keep the top 40% of the resulting columns by univariate F-test.
numeric = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('standardize', StandardScaler()),
    ('percent', SelectPercentile(f_regression, percentile=40))])

# Categorical branch: mode-impute, one-hot encode, keep the top 60% of the
# encoded columns by univariate F-test.
categorical = Pipeline(steps=[
    ('impute2', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('percent', SelectPercentile(f_regression, percentile=60))])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric, num_features),
        ("categorical", categorical, cat_features)]
)

# The forest is randomized (bootstrap samples, feature sub-sampling), so the
# seed is fixed to make repeated runs produce the same model, matching the
# seeded split and decision tree elsewhere in this notebook.
rfPipe = Pipeline([('preprocessor', preprocessor),
                   ('model', RandomForestRegressor(random_state=486))])

In [None]:
# Ensemble whose prediction is the average of the four pipelines' predictions.
vr = VotingRegressor([('decision', tree_pipe), ('SVM', svr_pipe), ('KNN', knn_pipe), ('rfPipe', rfPipe)])

# The target is selected as a single-column frame; flatten it to the 1-D
# shape regressors expect.
vr.fit(X_train, np.ravel(y_train))

# Hold-out predictions: row-aligned with y_test, so they can be scored.
pred = vr.predict(X_test)

# Predictions for the unlabeled test.csv rows. These cannot be compared with
# y_test; `test` must carry the same feature columns the model was fit on.
test_pred = vr.predict(test)

In [None]:
# Score the ensemble on the hold-out split. Predictions are computed from
# X_test here so they are guaranteed to correspond row-for-row to y_test.
mean_absolute_error(y_test, vr.predict(X_test))