## Airbnb Group Project

In [23]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Read in the training file

In [5]:
train = pd.read_csv('train.csv')


<property object at 0x1266add00>


## Feature Selection & Engineering (from Caroline)

In [6]:
#sort bathrooms_text to get the count of bathrooms
train['baths'] = train['bathrooms_text'].str.findall(r'(\d+.\d+|\d+)')
train['baths'] = train['baths'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else '')
#get acceptance rate as a float
train['host_acceptance_rate'] = train['host_acceptance_rate'].str.rstrip('%').astype(float) / 100

train['host_is_superhost'] = train['host_is_superhost'].astype('category')
train['neighbourhood_cleansed'] = train['neighbourhood_cleansed'].astype('category')
train['property_type'] = train['property_type'].astype('category')
train['instant_bookable'] = train['instant_bookable'].astype('category')
train['amenities'] = train['amenities'].astype('category')

### Set Features and Target

In [14]:
y = train.loc[:, ['price']]
X = train.loc[:, ['host_location', #maybe
                  'host_response_time', #maybe
                  'host_acceptance_rate', 
                  'host_is_superhost', 
                  'neighbourhood_cleansed', 
                  'property_type', #'room_type'
                  'accommodates', 
                  'baths', 
                  'beds',  #not bedrooms
                  'amenities', # or if it even has amenities, or pools vs hottub or pets
                  'maximum_maximum_nights', 
                  'maximum_nights',
                  'number_of_reviews', 
                  'number_of_reviews_ltm',  #check this
                  'instant_bookable']]

num_features = ['host_acceptance_rate', 'accommodates', 'baths', 'beds', 
                'maximum_maximum_nights', 'maximum_nights', 'number_of_reviews', 'number_of_reviews_ltm']

cat_features = ['host_response_time','host_location', 'host_is_superhost', 'neighbourhood_cleansed', 'property_type',
                'amenities', 'instant_bookable']

In [22]:
X

Unnamed: 0,host_location,host_response_time,host_acceptance_rate,host_is_superhost,neighbourhood_cleansed,property_type,accommodates,baths,beds,amenities,maximum_maximum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,instant_bookable
0,"Oakland, CA",within a few hours,0.89,f,Lakeshore,Entire guesthouse,4,1,1.0,[],365,365,14,14,f
1,"San Francisco, CA",within an hour,0.81,t,Western Addition,Entire rental unit,2,1,1.0,"[""Hair dryer"", ""TV"", ""Dishwasher"", ""Elevator"",...",365,365,2,2,f
2,"San Francisco, CA",within an hour,0.85,t,Outer Sunset,Private room in home,2,1.5,1.0,"[""Dining table"", ""Hot water kettle"", ""Paid was...",92,92,4,0,f
3,"Palo Alto, CA",within a day,0.56,t,Palo Alto,Private room in home,1,1.5,1.0,[],1125,1125,7,1,f
4,,within an hour,0.88,f,Sunnyvale,Room in hotel,4,1,2.0,[],1125,1125,8,8,t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15269,"San Francisco, CA",within an hour,1.00,t,Mission,Entire condo,6,1,3.0,"[""Clothing storage: closet, wardrobe, and dres...",1125,365,53,2,f
15270,"Monterey Park, CA",within an hour,1.00,f,Daly City,Entire home,4,1,2.0,[],365,365,11,11,t
15271,"Saratoga, CA",within an hour,1.00,t,Unincorporated Areas,Entire villa,16,5,13.0,[],365,365,23,16,t
15272,,within an hour,1.00,t,City of Capitola,Entire home,5,1,2.0,[],1125,28,137,49,f


## Creating the Pipe (Daisy)


In [24]:
numeric = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('to_numeric', FunctionTransformer(func=lambda X: X.apply(pd.to_numeric, errors='coerce'))),  # Convert to numeric
    ('polynomial', PolynomialFeatures(degree = 2, include_bias=False)),
    ('standardize', StandardScaler()),
    ('percent', SelectPercentile(f_regression, percentile=40))])

categorical = Pipeline(steps=[('impute2', SimpleImputer(strategy='most_frequent')),
                     ('one_hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
                     ('percent', SelectPercentile(f_regression, percentile=60))
                     ])

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric, num_features),
        ("categorical", categorical, cat_features)]
)
pipe = Pipeline(
    steps=[('preprocessor', preprocessor),
           ('model', RandomForestRegressor())]
)

## Fitting Models

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

pipe.fit(X_train, y_train)

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: ''

In [19]:
# Used a pipeline to transform the model a little bit, now try and fit a much of models with these features 
# and see the MSE at the end

model = LinearRegression()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print(f'Train MSE for LinReg is: {mean_squared_error(y_train, y_train_pred)}')
print(f'Test MSE for LinReg is: {mean_squared_error(y_test, y_test_pred)}')
print('')


alphas = np.logspace(-6,6,30)
model = RidgeCV(alphas=alphas, cv=10)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print(f'Train MSE for Ridge is: {mean_squared_error(y_train, y_train_pred)}')
print(f'Test MSE for Ridge is: {mean_squared_error(y_test, y_test_pred)}')
print('')



model = LassoCV(cv=10)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print(f'Train MSE for Lasso is: {mean_squared_error(y_train, y_train_pred)}')
print(f'Test MSE for Lasso is: {mean_squared_error(y_test, y_test_pred)}')
print('')

ValueError: Input contains NaN

In [None]:
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': RidgeCV(alphas=np.logspace(-6,6,30), cv=10),
    'Lasso Regression': LassoCV(cv=10),
    'Random Forest': RandomForestRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor()
    # ensemble 
    
}

# Create a pipeline for each model
pipelines = {}
for model_name, model in models.items():
    pipelines[model_name] = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

# Fit and evaluate each pipeline
results = {}
for model_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)  # Fit the pipeline
    y_pred = pipeline.predict(X_test)  # Make predictions
    mae = mean_absolute_error(y_test, y_pred)  # Calculate MAE
    results[model_name] = mae

# Print results
for model_name, mae in results.items():
    print(f"{model_name}: MAE = {mae}")

camilla is gonna do bagging and random forests