## Airbnb Group Project

In [1]:
import numpy as np
import pandas as pd
import re
import warnings
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Read in the training file

In [5]:
train = pd.read_csv('train.csv')


## Feature Selection & Engineering (from Caroline)

In [6]:
def extract_number(text):
    match = re.search(r'(\d+\.?\d*)', str(text))
    if match:
        return float(match.group())
    else:
        return 0.5  # if it doesn't have a number, it is 'half-bath'

train['baths'] = train['bathrooms_text'].apply(extract_number)
train['host_acceptance_rate'] = train['host_acceptance_rate'].str.rstrip('%').astype(float) / 100

train['host_is_superhost'] = train['host_is_superhost'].astype('category')
train['host_location'] = train['host_location'].astype('category')
train['neighbourhood_cleansed'] = train['neighbourhood_cleansed'].astype('category')
train['property_type'] = train['property_type'].astype('category')
train['instant_bookable'] = train['instant_bookable'].astype('category')
train['amenities'] = train['amenities'].astype('category')
train['host_response_time'] = train['host_response_time'].astype('category')

### Set Features and Target

In [7]:
y = train.loc[:, ['price']]
X = train.loc[:, ['host_location', #maybe
                  'host_response_time', #maybe
                  'host_acceptance_rate', 
                  'host_is_superhost', 
                  'neighbourhood_cleansed', 
                  'property_type', #'room_type'
                  'accommodates', 
                  'baths', 
                  'beds',  #not bedrooms
                  'amenities', # or if it even has amenities, or pools vs hottub or pets
                  'maximum_maximum_nights', 
                  'maximum_nights',
                  'number_of_reviews', 
                  'number_of_reviews_ltm',  #check this
                  'instant_bookable']]

num_features = ['host_acceptance_rate', 'accommodates', 'baths', 'beds', 
                'maximum_maximum_nights', 'maximum_nights', 'number_of_reviews', 'number_of_reviews_ltm']

cat_features = ['host_response_time','host_location', 'host_is_superhost', 'neighbourhood_cleansed', 'property_type',
                'amenities', 'instant_bookable']

## Creating the Pipe


In [9]:
numeric = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree = 2, include_bias=False)),
    ('standardize', StandardScaler()),
    ('percent', SelectPercentile(f_regression, percentile=40))])

categorical = Pipeline(steps=[('impute2', SimpleImputer(strategy='most_frequent')),
                     ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
                     ('percent', SelectPercentile(f_regression, percentile=60))
                     ])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric, num_features),
        ("categorical", categorical, cat_features)]
)
rfPipe = Pipeline([('preprocessor', preprocessor),
           ('model', RandomForestRegressor())])

## Fitting Random Forest

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

rfPipe = rfPipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


In [11]:
y_pred = rfPipe.predict(X_test)  # Make predictions
mae = mean_absolute_error(y_test, y_pred)  # Calculate MAE

mae

116.09030732428853

## Attmpeting CV random search

In [20]:
# Define hyperparameters for Random Search
from scipy.stats import randint

params = {
    'model__n_estimators': randint(50, 200),
    'model__max_features': ['auto', 'sqrt', 'log2'],
    'model__max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# Use RandomizedSearchCV
rnd_search = RandomizedSearchCV(
    pipe, param_distributions=params, cv=10,
    scoring='neg_mean_squared_error', random_state=42)

# Assuming X_train and y_train are your training data
rnd_search.fit(X_train, y_train)

# Print the best hyperparameter combinations and best MSE
print("The best hyperparameter combinations are:")
print(rnd_search.best_params_)

print("\nBest MSE:")
print(rnd_search.best_score_)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs

The best hyperparameter combinations are:
{'model__bootstrap': True, 'model__max_depth': 90, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 98}

Best MSE:
-1278053.9685107516


### updating RF Pipe


In [30]:
# Use the best hyperparameters from random search
best_params = {
    'model__bootstrap': True,
    'model__max_depth': 90,
    'model__max_features': 'sqrt',
    'model__min_samples_leaf': 1,
    'model__min_samples_split': 2,
    'model__n_estimators': 98
}

# Update the pipeline with the best hyperparameters
pipe.set_params(**best_params)

rfPipe = pipe.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


In [31]:
y_pred = rfPipe.predict(X_test)  # Make predictions
mae = mean_absolute_error(y_test, y_pred)  # Calculate MAE

mae

179.26195384681944

camilla is gonna do bagging and random forests

## BAGGING

In [35]:
from sklearn.ensemble import BaggingClassifier
bgPipe = Pipeline([('preprocessor', preprocessor),
           ('model', BaggingClassifier())])

In [36]:
bgPipe = bgPipe.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
y_predBG = bgPipe.predict(X_test)  # Make predictions
maeBG = mean_absolute_error(y_test, y_predBG)  # Calculate MAE

maeBG

182.8036135113904

## Prepare for Kaggle Submission

In [13]:
df_test = pd.read_csv("test.csv")

def extract_number(text):
    match = re.search(r'(\d+\.?\d*)', str(text))
    if match:
        return float(match.group())
    else:
        return 0.5  # if it doesn't have a number, it is 'half-bath'

df_test['baths'] = df_test['bathrooms_text'].apply(extract_number)
df_test['host_acceptance_rate'] = df_test['host_acceptance_rate'].str.rstrip('%').astype(float) / 100
df_test['host_is_superhost'] = df_test['host_is_superhost'].astype('category')
df_test['host_location'] = df_test['host_location'].astype('category')
df_test['neighbourhood_cleansed'] = df_test['neighbourhood_cleansed'].astype('category')
df_test['property_type'] = df_test['property_type'].astype('category')
df_test['instant_bookable'] = df_test['instant_bookable'].astype('category')
df_test['amenities'] = df_test['amenities'].astype('category')
df_test['host_response_time'] = df_test['host_response_time'].astype('category')

In [14]:
official_preds = rfPipe.predict(df_test)


In [15]:
my_preds = pd.DataFrame(official_preds)
my_preds = my_preds.rename(columns={0: 'price'})
ids = pd.DataFrame(df_test['Id'])
my_sub = ids.join(my_preds)
my_sub.to_csv('submission.csv', index=False)