In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import pickle

In [2]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Load a CSV dataset into a pandas DataFrame.

    Args:
        path: Directory that contains the CSV file.
        filename: Name of the CSV file inside ``path``.

    Returns:
        The parsed contents of ``path / filename``.
    """
    csv_file = path / filename
    frame = pd.read_csv(csv_file)
    return frame

# Read the training CSV from the "data" folder under the current working directory.
dataset_path = Path.cwd() / "data"
filename = "X_y_train.csv"
X_y_train = load_ds(path=dataset_path, filename=filename)

# Report (rows, columns) as a quick sanity check of the load.
print(f"Shape: {X_y_train.shape}")

Shape: (11243, 39)


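Create an empty dictionary that will hold, for each column, the summary values (min, max, mode and/or categories) extracted from the training data.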

In [3]:
# Maps a column name to a dict of its summary values; filled in by the cells below.
col_values = {}

# Year

In [4]:
# Parse the "Host Since" strings as dates; only the year component is used.
date = pd.to_datetime(X_y_train["Host Since"], format="%Y-%m-%d")
host_since_year = date.dt.year
print(f"min year: {int(host_since_year.min())}")
print(f"max year: {int(host_since_year.max())}")
print(f"year mode: {int(host_since_year.mode()[0])}")

min year: 2009
max year: 2019
year mode: 2015


In [5]:
# Store the range and the most frequent year of the parsed "Host Since" dates.
host_since_year = date.dt.year
col_values["year"] = {
    "min": int(host_since_year.min()),
    "max": int(host_since_year.max()),
    "mode": int(host_since_year.mode().iloc[0]),
}

In [6]:
col_values

{'year': {'min': 2009, 'max': 2019, 'mode': 2015}}

In [7]:
col_values["year"]

{'min': 2009, 'max': 2019, 'mode': 2015}

# Is Superhost

In [8]:
X_y_train["Is Superhost"].unique()

array(['f', 't', nan], dtype=object)

In [9]:
# Record the most frequent "Is Superhost" value (mode() ignores NaN by default).
col_values["Is Superhost"] = dict(mode=X_y_train["Is Superhost"].mode().iloc[0])

# Property Type

In [10]:
X_y_train["Property Type"].unique()

array(['Apartment', 'House', 'Serviced apartment', 'Other', 'Condominium',
       '*', 'Townhouse', 'Bed and breakfast', 'Loft', 'Guest suite',
       'Hostel', 'Guesthouse', 'Boutique hotel', 'Boat', 'Bungalow',
       'Hotel'], dtype=object)

In [11]:
# Work on a copy of the column. The "*" entries (presumably a missing-value
# placeholder -- confirm against the data source) are filtered out for display
# rather than overwritten in place.
property_type = X_y_train["Property Type"].copy()
property_type[property_type != "*"].unique().tolist()

['Apartment',
 'House',
 'Serviced apartment',
 'Other',
 'Condominium',
 'Townhouse',
 'Bed and breakfast',
 'Loft',
 'Guest suite',
 'Hostel',
 'Guesthouse',
 'Boutique hotel',
 'Boat',
 'Bungalow',
 'Hotel']

In [12]:
# Summarise "Property Type" from placeholder-free values only, so the "*"
# marker can never be stored as a category or as the mode.
valid_property_type = property_type[~property_type.isin(["*"])]
col_values["Property Type"] = {
    "categories": valid_property_type.unique().tolist(),
    "mode": valid_property_type.mode()[0],
}

In [13]:
col_values["Property Type"]

{'categories': ['Apartment',
  'House',
  'Serviced apartment',
  'Other',
  'Condominium',
  'Townhouse',
  'Bed and breakfast',
  'Loft',
  'Guest suite',
  'Hostel',
  'Guesthouse',
  'Boutique hotel',
  'Boat',
  'Bungalow',
  'Hotel'],
 'mode': 'Apartment'}

# Room Type

In [14]:
X_y_train["Room Type"].unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [15]:
# Record the distinct room types and the most frequent one.
col_values["Room Type"] = dict(
    categories=X_y_train["Room Type"].unique().tolist(),
    mode=X_y_train["Room Type"].mode().iloc[0],
)

In [16]:
col_values["Room Type"]

{'categories': ['Private room', 'Entire home/apt', 'Shared room'],
 'mode': 'Entire home/apt'}

# Accomodates

In [17]:
X_y_train["Accomodates"].unique()

array(['1', '4', '2', '5', '3', '6', '10', '8', '9', '7', '12', '*'],
      dtype=object)

In [18]:
# Collect the distinct guest capacities as integers, skipping the "*" placeholder.
accomodates = [
    int(value)
    for value in X_y_train.loc[X_y_train["Accomodates"] != "*", "Accomodates"].unique()
]
accomodates

[1, 4, 2, 5, 3, 6, 10, 8, 9, 7, 12]

In [19]:
# Take the mode over the same placeholder-free values that feed min/max.
# The raw column contains "*", and int("*") would raise if the placeholder
# were ever the most frequent entry.
accomodates_valid = X_y_train["Accomodates"]
accomodates_valid = accomodates_valid[~accomodates_valid.isin(["*"])].astype(int)
col_values["Accomodates"] = {
    "min": min(accomodates),
    "max": max(accomodates),
    "mode": int(accomodates_valid.mode()[0]),
}

In [20]:
col_values["Accomodates"]

{'min': 1, 'max': 12, 'mode': 2}

# Bathrooms

In [21]:
X_y_train["Bathrooms"].unique()

array(['1.0', '2.0', '1.5', '2.5', '3.0', '0.0', '0.5', '*', nan],
      dtype=object)

In [22]:
# Keep only real readings: drop missing entries and the "*" placeholder,
# then convert the remaining distinct strings to floats.
bathrooms = X_y_train["Bathrooms"]
bathrooms = bathrooms[bathrooms.notna() & bathrooms.ne("*")]
bathrooms = list(map(float, bathrooms.unique()))
bathrooms

[1.0, 2.0, 1.5, 2.5, 3.0, 0.0, 0.5]

In [23]:
# Take the mode over the same cleaned values that feed min/max. The raw
# column contains "*", and float("*") would raise if the placeholder were
# ever the most frequent entry.
bathrooms_valid = X_y_train["Bathrooms"]
bathrooms_valid = bathrooms_valid[~bathrooms_valid.isin(["*", np.nan])].astype(float)
col_values["Bathrooms"] = {
    "min": min(bathrooms),
    "max": max(bathrooms),
    "mode": float(bathrooms_valid.mode()[0]),
}

In [24]:
col_values["Bathrooms"]

{'min': 0.0, 'max': 3.0, 'mode': 1.0}

# Bedrooms

In [25]:
X_y_train["Bedrooms"].unique()

array(['1.0', '2.0', '0.0', '4.0', '3.0', '5.0', nan, '*'], dtype=object)

In [26]:
# Keep only real readings: drop missing entries and the "*" placeholder,
# then convert the remaining distinct strings to floats.
bedrooms = X_y_train["Bedrooms"]
bedrooms = bedrooms[bedrooms.notna() & bedrooms.ne("*")]
bedrooms = list(map(float, bedrooms.unique()))
bedrooms

[1.0, 2.0, 0.0, 4.0, 3.0, 5.0]

In [27]:
# Take the mode over the same cleaned values that feed min/max. The raw
# column contains "*", and float("*") would raise if the placeholder were
# ever the most frequent entry.
bedrooms_valid = X_y_train["Bedrooms"]
bedrooms_valid = bedrooms_valid[~bedrooms_valid.isin(["*", np.nan])].astype(float)
col_values["Bedrooms"] = {
    "min": min(bedrooms),
    "max": max(bedrooms),
    "mode": float(bedrooms_valid.mode()[0]),
}

In [28]:
col_values["Bedrooms"]

{'min': 0.0, 'max': 5.0, 'mode': 1.0}

# Beds

In [29]:
X_y_train["Beds"].unique()

array(['1.0', '2.0', '3.0', '7.0', '4.0', '6.0', '5.0', '*', '0.0', '8.0',
       '10.0', nan, '9.0'], dtype=object)

In [30]:
# Keep only real readings: drop missing entries and the "*" placeholder,
# then convert the remaining distinct strings to floats.
bed = X_y_train["Beds"]
bed = bed[bed.notna() & bed.ne("*")]
bed = list(map(float, bed.unique()))
bed

[1.0, 2.0, 3.0, 7.0, 4.0, 6.0, 5.0, 0.0, 8.0, 10.0, 9.0]

In [31]:
# Take the mode over the same cleaned values that feed min/max. The raw
# column contains "*", and float("*") would raise if the placeholder were
# ever the most frequent entry.
beds_valid = X_y_train["Beds"]
beds_valid = beds_valid[~beds_valid.isin(["*", np.nan])].astype(float)
col_values["Beds"] = {
    "min": min(bed),
    "max": max(bed),
    "mode": float(beds_valid.mode()[0]),
}

In [32]:
col_values["Beds"]

{'min': 0.0, 'max': 10.0, 'mode': 1.0}

# Min Nights

In [33]:
X_y_train["Min Nights"].unique()

array(['*', '3', '2', '6', '1', '60', '7', '5', '4', '28', '90', '14',
       '15', '25', '21', '30', '61', '20', '9', '62', '8', '10', '13',
       '12', '180'], dtype=object)

In [34]:
# Keep only real readings: drop missing entries and the "*" placeholder,
# then convert the remaining distinct strings to floats.
min_nights = X_y_train["Min Nights"]
min_nights = min_nights[min_nights.notna() & min_nights.ne("*")]
min_nights = list(map(float, min_nights.unique()))
min_nights

[3.0,
 2.0,
 6.0,
 1.0,
 60.0,
 7.0,
 5.0,
 4.0,
 28.0,
 90.0,
 14.0,
 15.0,
 25.0,
 21.0,
 30.0,
 61.0,
 20.0,
 9.0,
 62.0,
 8.0,
 10.0,
 13.0,
 12.0,
 180.0]

In [35]:
# Take the mode over the same cleaned values that feed min/max. The raw
# column contains "*", and float("*") would raise if the placeholder were
# ever the most frequent entry.
min_nights_valid = X_y_train["Min Nights"]
min_nights_valid = min_nights_valid[~min_nights_valid.isin(["*", np.nan])].astype(float)
col_values["Min Nights"] = {
    "min": min(min_nights),
    "max": max(min_nights),
    "mode": float(min_nights_valid.mode()[0]),
}

In [36]:
col_values["Min Nights"]

{'min': 1.0, 'max': 180.0, 'mode': 2.0}

# Instant Bookable

In [37]:
X_y_train["Instant Bookable"].unique()

array(['f', 't'], dtype=object)

In [38]:
# Record the most frequent "Instant Bookable" value.
col_values["Instant Bookable"] = dict(mode=X_y_train["Instant Bookable"].mode().iloc[0])

In [39]:
col_values

{'year': {'min': 2009, 'max': 2019, 'mode': 2015},
 'Is Superhost': {'mode': 'f'},
 'Property Type': {'categories': ['Apartment',
   'House',
   'Serviced apartment',
   'Other',
   'Condominium',
   'Townhouse',
   'Bed and breakfast',
   'Loft',
   'Guest suite',
   'Hostel',
   'Guesthouse',
   'Boutique hotel',
   'Boat',
   'Bungalow',
   'Hotel'],
  'mode': 'Apartment'},
 'Room Type': {'categories': ['Private room',
   'Entire home/apt',
   'Shared room'],
  'mode': 'Entire home/apt'},
 'Accomodates': {'min': 1, 'max': 12, 'mode': 2},
 'Bathrooms': {'min': 0.0, 'max': 3.0, 'mode': 1.0},
 'Bedrooms': {'min': 0.0, 'max': 5.0, 'mode': 1.0},
 'Beds': {'min': 0.0, 'max': 10.0, 'mode': 1.0},
 'Min Nights': {'min': 1.0, 'max': 180.0, 'mode': 2.0},
 'Instant Bookable': {'mode': 'f'}}

# Save dictionary

In [40]:
# Persist the collected column statistics to models/col_values.pkl.
output_path = Path("models") / "col_values.pkl"
# Create the target directory if it is missing instead of failing inside open().
output_path.parent.mkdir(parents=True, exist_ok=True)

# The context manager closes the file even if pickling raises.
with open(output_path, "wb") as file:
    pickle.dump(col_values, file)