# Price Prediction Model

This file creates a model using machine learning to predict listing prices in each neighborhood and tract.

[some inspiration](https://towardsdatascience.com/airbnb-price-prediction-using-linear-regression-scikit-learn-and-statsmodels-6e1fc2bd51a6)

[more inspiration and template](https://github.com/mohamedirfansh/Airbnb-Data-Science-Project/blob/master/Machine%20Learning%20Models.ipynb)

### **This file is not cleaned.** We used the models it produces to build the Streamlit website.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.calibration import CalibrationDisplay
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector,
    make_column_transformer,
)
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestRegressor
from sklearn.feature_selection import (
    RFECV,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LinearRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    DetCurveDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    classification_report,
    make_scorer,
    r2_score,
    mean_squared_error,
    median_absolute_error,
    explained_variance_score,
    mean_gamma_deviance,
    mean_squared_log_error
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    StandardScaler,
    RobustScaler,
    FunctionTransformer,
    LabelEncoder,
)
from sklearn.svm import LinearSVC

# Show scikit-learn estimators as diagrams when they are displayed.
set_config(display="diagram")

# Pandas display limits, one option per call: column width up to 1000
# characters, at most 50 rows, and no cap on the number of columns.
pd.set_option("display.max_colwidth", 1000)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", None)

import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Cap displayed DataFrame output at 50 rows.
pd.options.display.max_rows = 50

In [4]:
# Load the cleaned listings data from ../outputs/listings_clean.csv
listings = pd.read_csv('../outputs/listings_clean.csv')

In [5]:
# load calendar data and parse the date column
calendar = pd.read_csv('../inputs/calendar.csv.gz', compression='gzip')
calendar["date"] = pd.to_datetime(calendar["date"])

# Convert the price strings to integers: slice off the first character and
# the last three, remove thousands separators, then cast.
# NOTE(review): presumably prices look like "$1,234.00" - confirm against the raw file.
price_digits = calendar["price"].str[1:-3].str.replace(",", "")
calendar["price"] = price_digits.astype('int64')

In [6]:
# Columns carried forward into the modelling table.
model_columns = [
    'id',
    'room_type',
    'property_type',
    'bedrooms',
    'amenities',
    'accommodates',
    'census_tract',
    'census_NBH',
    'price',
]

# Work on an independent copy so later edits do not touch `listings`.
newListingsDF = listings[model_columns].copy()
newListingsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3136 entries, 0 to 3135
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             3136 non-null   int64  
 1   room_type      3136 non-null   object 
 2   property_type  3136 non-null   object 
 3   bedrooms       3136 non-null   float64
 4   amenities      3136 non-null   object 
 5   accommodates   3136 non-null   int64  
 6   census_tract   3136 non-null   float64
 7   census_NBH     3136 non-null   object 
 8   price          3136 non-null   float64
dtypes: float64(3), int64(2), object(4)
memory usage: 220.6+ KB


In [7]:
# Exploratory check: flatten every listing's amenities string into one list of
# amenity names so the available values can be inspected.
amenities_list = newListingsDF.amenities.tolist()

# Join all rows, drop the opening braces and quotes, and turn each closing
# brace into a comma so the rows can be split uniformly.
amenities_list_string = (
    " ".join(amenities_list)
    .replace('{', '')
    .replace('}', ',')
    .replace('"', '')
)

# Despite its name this is a list (duplicates are kept), one stripped token per entry.
amenities_set = [token.strip() for token in amenities_list_string.split(',')]
# amenities_set
# amenities_set

In [8]:
# Build one indicator column per amenity group. Each entry maps the new column
# name to the pattern searched for in the raw `amenities` string; rows that
# match get 1, all other rows are left missing (filled in a later step).
amenity_patterns = {
    'air_conditioning': 'Air conditioning|Central air conditioning',
    'high_end_electronics': 'Amazon Echo|Apple TV|Game console|Netflix|Projector and screen|Smart TV',
    'bbq': 'BBQ grill|Fire pit|Propane barbeque',
    'balcony': 'Balcony|Patio',
    'nature_and_views': 'Beach view|Beachfront|Lake access|Mountain view|Ski-in/Ski-out|Waterfront|Ocean view',
    'bed_linen': 'Bed linens',
    'breakfast': 'Breakfast',
    'tv': 'TV',
    'coffee_machine': 'Coffee maker|Espresso machine',
    'cooking_basics': 'Cooking basics',
    'white_goods': 'Dishwasher|Dryer|Washer',
    'elevator': 'Elevator',
    'gym': 'Exercise equipment|Gym|gym',
    'child_friendly': 'Family/kid friendly|Children|children',
    'parking': 'parking',
    'outdoor_space': 'Garden|Outdoor|Sun loungers|Terrace',
    'host_greeting': 'Host greets you',
    'hot_tub_sauna_or_pool': 'Hot tub|Jetted tub|hot tub|Sauna|Pool|pool',
    'internet': 'Internet|Pocket wifi|Wifi',
    'long_term_stays': 'Long term stays allowed',
    'pets_allowed': 'Pets|pet|Cat|Dog',
    'private_entrance': 'Private entrance',
    'secure': 'Safe|Security system',
    'self_check_in': 'Self check-in',
    'smoking_allowed': 'Smoking allowed',
}

# Dict order is preserved, so the columns are created in the order listed above.
for flag_column, pattern in amenity_patterns.items():
    has_amenity = newListingsDF['amenities'].str.contains(pattern)
    newListingsDF.loc[has_amenity, flag_column] = 1

In [9]:
# Replace missing values with 0 (rows that did not match an amenity pattern
# become 0) and drop the raw amenities text column.
newListingsDF = newListingsDF.fillna(0).drop(columns='amenities')

In [10]:
# Property types kept as their own category; everything else is relabelled "Other".
kept_property_types = [
    'Entire rental unit',
    'Private room in rental unit',
    'Entire condo',
    'Private room in home',
    'Entire serviced apartment',
    "Entire home",
    'Private room in condo',
    'Private room in townhouse',
    'Entire townhouse',
    'Entire guest suite',
    'Private room in bed and breakfast',
    'Room in boutique hotel',
    'Room in hotel',
]

is_rare_type = ~newListingsDF['property_type'].isin(kept_property_types)
newListingsDF.loc[is_rare_type, 'property_type'] = 'Other'
newListingsDF['property_type'].value_counts()

Entire rental unit                   1317
Private room in rental unit           477
Entire condo                          320
Private room in home                  306
Entire serviced apartment             181
Entire home                           145
Other                                  94
Private room in condo                  71
Private room in townhouse              45
Entire guest suite                     42
Entire townhouse                       42
Private room in bed and breakfast      40
Room in boutique hotel                 34
Room in hotel                          22
Name: property_type, dtype: int64

In [11]:
# Build the modelling table: calendar rows (keyed by listing id) joined to the
# per-listing features.

# Calendar side: align the key name with the listings table and drop unused columns.
calendar_rows = calendar.rename(columns={"listing_id": "id"}).drop(
    columns=['available', 'adjusted_price', 'minimum_nights', 'maximum_nights']
)

# Listings side: the listing-level price is dropped, so the calendar `price`
# is the only price column in the result.
listing_features = newListingsDF.drop(columns='price')

# Left join on id, then discard every row that still has a missing value
# (this includes calendar rows with no matching listing).
listings_model = calendar_rows.merge(listing_features, on='id', how='left').dropna()

In [16]:
# Display the merged modelling table.
listings_model

Unnamed: 0,id,date,price,room_type,property_type,bedrooms,accommodates,census_tract,census_NBH,air_conditioning,high_end_electronics,bbq,balcony,nature_and_views,bed_linen,breakfast,tv,coffee_machine,cooking_basics,white_goods,elevator,gym,child_friendly,parking,outdoor_space,host_greeting,hot_tub_sauna_or_pool,internet,long_term_stays,pets_allowed,private_entrance,secure,self_check_in,smoking_allowed
0,3781,2023-03-19,125,Entire home/apt,Entire rental unit,1.0,2.0,512.0,East Boston,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3781,2023-03-20,125,Entire home/apt,Entire rental unit,1.0,2.0,512.0,East Boston,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3781,2023-03-21,125,Entire home/apt,Entire rental unit,1.0,2.0,512.0,East Boston,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3781,2023-03-22,125,Entire home/apt,Entire rental unit,1.0,2.0,512.0,East Boston,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3781,2023-03-23,125,Entire home/apt,Entire rental unit,1.0,2.0,512.0,East Boston,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1410018,2021483,2024-03-13,500,Entire home/apt,Entire condo,2.0,6.0,608.0,South Boston,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1410019,2021483,2024-03-14,500,Entire home/apt,Entire condo,2.0,6.0,608.0,South Boston,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1410020,2021483,2024-03-15,500,Entire home/apt,Entire condo,2.0,6.0,608.0,South Boston,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1410021,2021483,2024-03-16,500,Entire home/apt,Entire condo,2.0,6.0,608.0,South Boston,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [12]:
# Create separate data sets: one keeps the neighborhood column (census_NBH),
# the other keeps the census tract column.
listings_model_NBH = listings_model.drop(columns='census_tract')
listings_model_tract = listings_model.drop(columns='census_NBH')

In [13]:
# create NBH pipeline
# Feature groups: one-hot encoded, standardized, and passed through unchanged.
cat_pipe_features = ['room_type', 'property_type', 'bedrooms', 'census_NBH']
num_pipe_features = ['accommodates']
bin_pipe_features = [
    'air_conditioning', 'high_end_electronics', 'bbq', 'balcony',
    'nature_and_views', 'bed_linen', 'breakfast', 'tv', 'coffee_machine',
    'cooking_basics', 'white_goods', 'elevator', 'gym', 'child_friendly',
    'parking', 'outdoor_space', 'host_greeting', 'hot_tub_sauna_or_pool',
    'internet', 'long_term_stays', 'pets_allowed', 'private_entrance',
    'secure', 'self_check_in', 'smoking_allowed',
]

# One small pipeline per feature group.
cat_pipe = make_pipeline(OneHotEncoder(sparse_output=False))
num_pipe = make_pipeline(StandardScaler())
binary_pipe = make_pipeline('passthrough')

# Pair each pipeline with the columns it handles.
column_specs = [
    (cat_pipe, cat_pipe_features),
    (num_pipe, num_pipe_features),
    (binary_pipe, bin_pipe_features),
]
preproc_pipe = make_column_transformer(*column_specs)

# Preprocessing followed by a random forest with default settings.
pipe_NBH = Pipeline(steps=[
    ('columntransformer', preproc_pipe),
    ('regression', RandomForestRegressor()),
])
pipe_NBH

In [14]:
# Rolling-window training for the neighborhood (NBH) models.
#
# For each i in 0..9 a fresh pipeline is trained on the two months starting i
# months after the first calendar date and scored on the month that follows.
# The fitted pipelines are stored in `models_NBH` under the keys "model_0" .. "model_9".

# Local import: clone() gives every stored model its own preprocessor.
from sklearn.base import clone

# Sort dataframe by date
listings_model_NBH = listings_model_NBH.sort_values('date')

# dictionary for storing models (10 models)
models_NBH = {}

# Earliest date in the data; every window boundary is an offset from it.
first_date = listings_model_NBH['date'].min()

# Iterate over range of months
for i in range(0, 10):

    # Build a new pipeline around a *clone* of the preprocessor. Pipeline.fit
    # fits its steps in place, so reusing the single `preproc_pipe` object would
    # make all stored models share one transformer that is re-fitted on every
    # iteration, leaving earlier models with encoders from a later window.
    pipe_NBH = Pipeline([('columntransformer', clone(preproc_pipe)),
                         ('regression', RandomForestRegressor())])

    # Window boundaries: train on [train_start, train_end), score on [train_end, pred_end)
    train_start = first_date + pd.DateOffset(months=i)
    train_end = first_date + pd.DateOffset(months=i + 2)
    pred_end = first_date + pd.DateOffset(months=i + 3)

    # Select rows for training set and prediction set
    dates = listings_model_NBH['date']
    train_set = listings_model_NBH.loc[(dates < train_end) & (dates >= train_start)]
    pred_set = listings_model_NBH.loc[(dates < pred_end) & (dates >= train_end)]

    # create train and test sets (id and date are not features; price is the target)
    X_train = train_set.drop(['id', 'date', 'price'], axis=1)
    y_train = train_set['price']
    X_pred = pred_set.drop(['id', 'date', 'price'], axis=1)
    y_pred = pred_set['price']  # actual prices of the scoring month

    # Train pipe and store model
    pipe_NBH.fit(X_train, y_train)
    models_NBH[f"model_{i}"] = pipe_NBH

    # Make predictions and evaluate performance.
    # NOTE(review): the printed label says "In-Sample", but these scores are
    # computed on the month after the training window. The label is left
    # unchanged so the output matches earlier runs.
    y_pred_hat = pipe_NBH.predict(X_pred)
    mse = mean_squared_error(y_pred, y_pred_hat)
    r2 = r2_score(y_pred, y_pred_hat)
    print('In-Sample Scores:')
    print(f'Model {i} MSE: {mse}')
    print(f'Model {i} r2: {r2}')
    print("===================================")

In-Sample Scores:
Model 0 MSE: 17510.775210664833
Model 0 r2: 0.7799831747544209
In-Sample Scores:
Model 1 MSE: 9095.450011552779
Model 1 r2: 0.8839883469511212
In-Sample Scores:
Model 2 MSE: 11407.98440650583
Model 2 r2: 0.8528534323823938
In-Sample Scores:
Model 3 MSE: 43699.708777905515
Model 3 r2: 0.5935838527804385
In-Sample Scores:
Model 4 MSE: 40552.40145512895
Model 4 r2: 0.7012352045437359
In-Sample Scores:
Model 5 MSE: 17127.810769054093
Model 5 r2: 0.862954156910228
In-Sample Scores:
Model 6 MSE: 15409.845669775144
Model 6 r2: 0.8564496698486397
In-Sample Scores:
Model 7 MSE: 133313.58513651646
Model 7 r2: 0.42842907257720575
In-Sample Scores:
Model 8 MSE: 115561.96680888478
Model 8 r2: 0.6421645968459752
In-Sample Scores:
Model 9 MSE: 14597.689646298875
Model 9 r2: 0.955106547641848


- Some months might not have enough listings with certain attributes to produce good R² scores.

- If in-sample test scores are high, and out of sample remains low:
    - Alt idea:
        - create many models, determining which is the best at predicting each month (e.g. prev 3 months train)
        - have month_select be the selector for which model you use
        - this would give an Airbnb host the ability to get a suggested price in 2024
        - I could do a rolling sample (prev 3 predict future 1 month, loop +1 and predict each month)
    - If I am **not** building a *master model* meant to predict all future Airbnb prices, but am instead only suggesting what to charge for a hypothetical listing based on its features:
        - Then I would not need a holdout set, because I am not using these models on data it has never seen
        - Issue with this approach is that I may as well show the mean price based on those attributes with no regression at all.
        - **But a mean price for the exact combination of attributes you input may not exist, so a model may still be needed.**
        
    

In [15]:
# create tract pipeline
# Feature groups: one-hot encoded, standardized, and passed through unchanged.
cat_pipe_features = ['room_type', 'property_type', 'bedrooms', 'census_tract']
num_pipe_features = ['accommodates']
bin_pipe_features = [
    'air_conditioning', 'high_end_electronics', 'bbq', 'balcony',
    'nature_and_views', 'bed_linen', 'breakfast', 'tv', 'coffee_machine',
    'cooking_basics', 'white_goods', 'elevator', 'gym', 'child_friendly',
    'parking', 'outdoor_space', 'host_greeting', 'hot_tub_sauna_or_pool',
    'internet', 'long_term_stays', 'pets_allowed', 'private_entrance',
    'secure', 'self_check_in', 'smoking_allowed',
]

# One small pipeline per feature group.
cat_pipe = make_pipeline(OneHotEncoder(sparse_output=False))
num_pipe = make_pipeline(StandardScaler())
binary_pipe = make_pipeline('passthrough')

# Pair each pipeline with the columns it handles.
column_specs = [
    (cat_pipe, cat_pipe_features),
    (num_pipe, num_pipe_features),
    (binary_pipe, bin_pipe_features),
]
preproc_pipe = make_column_transformer(*column_specs)

# Preprocessing followed by a random forest with default settings.
pipe_tract = Pipeline(steps=[
    ('columntransformer', preproc_pipe),
    ('regression', RandomForestRegressor()),
])
pipe_tract

In [16]:
# model rolling OOS testing with tracts
#
# For each i in 0..9 a fresh pipeline is trained on the two months starting i
# months after the first calendar date and scored on the month that follows.
# The fitted pipelines are stored in `models_tract` under the keys "model_0" .. "model_9".

# Local import: clone() gives every stored model its own preprocessor.
from sklearn.base import clone

# Sort dataframe by date
listings_model_tract = listings_model_tract.sort_values('date')

# dictionary for storing models
models_tract = {}

# Earliest date in the data; every window boundary is an offset from it.
first_date = listings_model_tract['date'].min()

# Iterate over range of months
for i in range(0, 10):

    # Build a new pipeline around a *clone* of the preprocessor. Pipeline.fit
    # fits its steps in place, so reusing the single `preproc_pipe` object would
    # make all stored models share one transformer that is re-fitted on every
    # iteration, leaving earlier models with encoders from a later window.
    pipe_tract = Pipeline([('columntransformer', clone(preproc_pipe)),
                           ('regression', RandomForestRegressor())])

    # Window boundaries: train on [train_start, train_end), score on [train_end, pred_end)
    train_start = first_date + pd.DateOffset(months=i)
    train_end = first_date + pd.DateOffset(months=i + 2)
    pred_end = first_date + pd.DateOffset(months=i + 3)

    # Select rows for training set and prediction set
    dates = listings_model_tract['date']
    train_set = listings_model_tract.loc[(dates < train_end) & (dates >= train_start)]
    pred_set = listings_model_tract.loc[(dates < pred_end) & (dates >= train_end)]

    # create train and test sets (id and date are not features; price is the target)
    X_train = train_set.drop(['id', 'date', 'price'], axis=1)
    y_train = train_set['price']
    X_pred = pred_set.drop(['id', 'date', 'price'], axis=1)
    y_pred = pred_set['price']  # actual prices of the scoring month

    # Train pipe and store model
    pipe_tract.fit(X_train, y_train)
    models_tract[f"model_{i}"] = pipe_tract

    # Make predictions and evaluate performance.
    # NOTE(review): the printed label says "In-Sample", but these scores are
    # computed on the month after the training window. The label is left
    # unchanged so the output matches earlier runs.
    y_pred_hat = pipe_tract.predict(X_pred)
    mse = mean_squared_error(y_pred, y_pred_hat)
    r2 = r2_score(y_pred, y_pred_hat)
    print('In-Sample Scores:')
    print(f'Model {i} MSE: {mse}')
    print(f'Model {i} r2: {r2}')
    print("===================================")

In-Sample Scores:
Model 0 MSE: 17330.437271668623
Model 0 r2: 0.7822490584935435
In-Sample Scores:
Model 1 MSE: 8893.988589748376
Model 1 r2: 0.8865579694040421
In-Sample Scores:
Model 2 MSE: 11120.373656713069
Model 2 r2: 0.8565631968012333
In-Sample Scores:
Model 3 MSE: 42832.51221455998
Model 3 r2: 0.6016489565537417
In-Sample Scores:
Model 4 MSE: 38753.51815467916
Model 4 r2: 0.71448825447478
In-Sample Scores:
Model 5 MSE: 15110.669890808967
Model 5 r2: 0.8790940346807957
In-Sample Scores:
Model 6 MSE: 14634.856271807683
Model 6 r2: 0.8636690792006905
In-Sample Scores:
Model 7 MSE: 133724.8545082076
Model 7 r2: 0.4266657892932302
In-Sample Scores:
Model 8 MSE: 115287.30381402225
Model 8 r2: 0.6430150855163581
In-Sample Scores:
Model 9 MSE: 14605.772878864931
Model 9 r2: 0.9550816886247777


### IDEA: Have the model accept input parameters to be a host list price calculator.
- Benefits of using small ending sample:
    - best in-sample scores
    - more forward looking
    - speculative intrinsic value

In [17]:
# Build a one-row feature frame to try a prediction with: remove the
# identifier, date and target columns, then take the row at position 4 and
# turn it back into a single-row DataFrame.
nbh_features = listings_model_NBH.drop(['id', 'date', 'price'], axis=1)
test = nbh_features.iloc[4].to_frame().T
test.columns

Index(['room_type', 'property_type', 'bedrooms', 'accommodates', 'census_NBH',
       'air_conditioning', 'high_end_electronics', 'bbq', 'balcony',
       'nature_and_views', 'bed_linen', 'breakfast', 'tv', 'coffee_machine',
       'cooking_basics', 'white_goods', 'elevator', 'gym', 'child_friendly',
       'parking', 'outdoor_space', 'host_greeting', 'hot_tub_sauna_or_pool',
       'internet', 'long_term_stays', 'pets_allowed', 'private_entrance',
       'secure', 'self_check_in', 'smoking_allowed'],
      dtype='object')

In [19]:
# Number of feature columns in the sample row.
len(test.columns)

30

In [19]:
# Predict a price for the sample row with whichever pipeline `pipe_NBH`
# currently refers to, and show the single predicted value.
y_test = pipe_NBH.predict(test)
y_test[0]

425.0

In [17]:
# save models using pickle

In [21]:
import pickle

# Save the models
# Serialize the whole models_NBH dictionary into a single pickle file.
with open('../outputs/models_NBH.pkl', 'wb') as f:
    pickle.dump(models_NBH, f)

In [22]:
# with open('../outputs/models_tract.pkl', 'wb') as f:
#     pickle.dump(models_tract, f)

In [None]:
# zipped pickle
import zipfile
import os

In [None]:
# pickle file to compress
filename = '../outputs/models_NBH.pkl'

# name for the output zip file
zip_filename = '../outputs/zip_models_NBH.zip'

# Store the pickle under its bare file name (no directory part), DEFLATE-compressed.
member_name = os.path.basename(filename)
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
    zip_file.write(filename, arcname=member_name)

In [40]:
# pickle file to compress
filename = '../outputs/models_tract.pkl'

# name for the output zip file
zip_filename = '../outputs/zip_models_tract.zip'

# Store the pickle under its bare file name (no directory part), DEFLATE-compressed.
member_name = os.path.basename(filename)
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
    zip_file.write(filename, arcname=member_name)

In [33]:
# to load zip
# Extract both model archives and load the pickled model dictionaries from the
# directory they were extracted into.

# names of the zipped files to extract
zip_filename = '../outputs/zip_models_NBH.zip'
tract_zip_filename = '../outputs/zip_models_tract.zip'

# path where the contents of the zip files are extracted
extract_path = '../inputs/'

# Extract each archive. The archives store the pickles under their bare file
# names, so they land directly inside `extract_path`.
for archive in (zip_filename, tract_zip_filename):
    with zipfile.ZipFile(archive, 'r') as zip_file:
        zip_file.extractall(extract_path)

# load models from where they were just extracted (previously read from
# 'inputs/models/', which is not where the archives are unpacked, and the
# tract archive was never extracted)
# NOTE(review): pickle.load runs code embedded in the file; only load pickles
# produced by this project.
with open(os.path.join(extract_path, 'models_NBH.pkl'), 'rb') as f:
    models_NBH = pickle.load(f)
with open(os.path.join(extract_path, 'models_tract.pkl'), 'rb') as f:
    models_tract = pickle.load(f)

In [48]:
# archive holding the pickled NBH models
zip_filename = '../outputs/zip_models_NBH.zip'

# Open the archive and unpickle its 'models_NBH.pkl' member directly, without
# extracting it to disk first.
with zipfile.ZipFile(zip_filename, mode='r') as zip_file, zip_file.open('models_NBH.pkl') as f:
    models_NBH = pickle.load(f)

['models_NBH.pkl']


In [50]:
# archive holding the pickled tract models
zip_filename = '../outputs/zip_models_tract.zip'

# Open the archive and unpickle its 'models_tract.pkl' member directly, without
# extracting it to disk first.
with zipfile.ZipFile(zip_filename, mode='r') as zip_file, zip_file.open('models_tract.pkl') as f:
    models_tract = pickle.load(f)

In [34]:
import pickle
# Reload the NBH model dictionary from the pickle file in ../inputs.
# NOTE(review): pickle.load runs code embedded in the file; only load pickles
# produced by this project.
with open('../inputs/models_NBH.pkl', 'rb') as f:
    models_NBH = pickle.load(f)

In [23]:
# # Load the dictionary from the file
# with open('../outputs/models_NBH.pkl', 'rb') as f:
#     models_NBH = pickle.load(f)

In [38]:
# Pick the stored NBH model that corresponds to the selected month.
date_select = 'April 2023'

# Month label -> key in `models_NBH`. Several months share one model, and
# model_7 / model_8 are not mapped to any month.
model_key_by_month = {
    'April 2023': 'model_0',
    'May 2023': 'model_1',
    'June 2023': 'model_1',
    'July 2023': 'model_2',
    'August 2023': 'model_3',
    'September 2023': 'model_4',
    'October 2023': 'model_5',
    'November 2023': 'model_6',
    'December 2023': 'model_6',
    'January 2024': 'model_9',
    'February 2024': 'model_9',
    # Original (misspelled) label, kept so existing callers keep working.
    'Febuary 2024': 'model_9',
    'March 2024': 'model_9',
}

# Fail loudly on an unknown month instead of silently leaving `model` unset
# or pointing at a previously selected model.
if date_select not in model_key_by_month:
    raise ValueError(f"No model is mapped to date_select={date_select!r}")

model = models_NBH[model_key_by_month[date_select]]
model

In [39]:
# Predict a price for the sample row with the model selected for `date_select`.
model.predict(test)

array([390.40758159])

In [None]:
import seaborn as sns
# Scatter plot of listing price against number of bedrooms on a 12x12 figure.
plt.figure(figsize=(12, 12))
bedroom_prices = listings[['bedrooms', 'price']]
sns.scatterplot(data=bedroom_prices, x='bedrooms', y='price')

In [None]:
plt.figure(figsize=(12, 12))

# Mean listing price for every (property_type, bedrooms) pair, with bedroom
# counts spread across the columns.
mean_price_grid = listings.groupby(['property_type', 'bedrooms'])['price'].mean().unstack()

# Annotated heatmap of the grid; values are shown without decimals.
sns.heatmap(
    mean_price_grid,
    annot=True,
    fmt=".0f",
    cmap=sns.cm.rocket_r,
    cbar_kws={'label': 'mean_price'},
)
