In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv -O AB_NYC_2019.csv

## Data preparation

In [3]:
# Only these columns are read from the CSV; everything else is skipped.
usecols = [
    'room_type',
    'neighbourhood_group',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# The dataset is read straight from its raw GitHub URL.
data = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv"

df = pd.read_csv(data, usecols=usecols)

In [4]:
# Replace missing reviews_per_month values with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [5]:
# Binarize the target: True when price is at least 152, False otherwise.
# The dtype guard keeps this cell idempotent. Without it, re-running the cell
# would compare the already-boolean column to 152 and silently turn every
# label into False.
if not pd.api.types.is_bool_dtype(df['price']):
    df['price'] = df['price'] >= 152

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Two-stage split: hold out 20% for test, then take 25% of the remaining 80%
# for validation, which gives 60% / 20% / 20% train / val / test.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Drop the shuffled original index so each frame is numbered from 0.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Pull the target out of each frame; pop() returns the column and removes it,
# so the label cannot leak into the feature matrices.
y_train = df_train.pop('price').values
y_val = df_val.pop('price').values
y_test = df_test.pop('price').values

In [8]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Categorical feature columns.
cat = [
    'neighbourhood_group',
    'room_type',
]

# Numerical feature columns. Their order fixes the column order of every
# numeric matrix built from `df[num]`.
num = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

## Training the model

Training on the raw, unscaled features may produce a `ConvergenceWarning`:

In [12]:
# One {column: value} dict per training row, the input format DictVectorizer expects.
train_dict = df_train[cat + num].to_dict(orient='records')

# Fit the vectorizer and encode the training rows in one step (dense output).
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Logistic regression on the unscaled features, with a raised iteration cap.
model = LogisticRegression(solver='lbfgs', C=1.0, max_iter=15000)
model.fit(X_train, y_train)

We can fix this model by using a scaler. You can read more about scalers
[here](https://scikit-learn.org/stable/modules/preprocessing.html).

Also, we'll show you how to use `OneHotEncoder` instead of `DictVectorizer`.

## Feature scaling + OHE

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

First, we prepare the numerical variables. We'll use the scaler for that
and write the results to `X_train_num`:

In [14]:
# Standardize the numeric columns. The scaler is fitted on the training data
# only and reused later for the validation data.
scaler = StandardScaler()
# Alternative scaler: MinMaxScaler()

X_train_num = scaler.fit_transform(df_train[num].values)

The scaler scales the numerical features. Compare the un-scaled version of
latitude with the scaled one:

In [15]:
# Raw (unscaled) latitude values from the training frame.
df_train.latitude.values

array([40.7276 , 40.70847, 40.83149, ..., 40.79994, 40.69585, 40.64438])

In [16]:
# Column 0 of the scaled matrix is latitude, the first entry of `num`.
X_train_num[:, 0]

array([-0.02524398, -0.37616878,  1.88053632, ...,  1.3017764 ,
       -0.60767275, -1.5518494 ])

Now let's process the categorical features using `OneHotEncoder`.
We'll write the results to `X_train_cat`:

In [18]:
# Dense output so the result can be stacked with the numeric matrix.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [19]:
# Fit the encoder on the training categories and encode them in one step.
train_cat_values = df_train[cat].values
X_train_cat = ohe.fit_transform(train_cat_values)

In [21]:
# Names of the generated one-hot columns. The x0_ / x1_ prefixes refer to the
# input columns by position in `cat`, since the encoder was fitted on a bare
# NumPy array (`.values`) rather than a DataFrame.
ohe.get_feature_names_out()

array(['x0_Bronx', 'x0_Brooklyn', 'x0_Manhattan', 'x0_Queens',
       'x0_Staten Island', 'x1_Entire home/apt', 'x1_Private room',
       'x1_Shared room'], dtype=object)

Now we need to combine two matrices into one - `X_train`:

In [22]:
# Final training matrix: scaled numeric columns first, one-hot columns after.
# Any matrix passed to the model later must use the same column order.
X_train = np.column_stack((X_train_num, X_train_cat))

And now let's train the model:

In [23]:
# Logistic regression fitted on the combined scaled + one-hot training matrix.
model = LogisticRegression(
    solver='lbfgs',
    C=1.0,
    random_state=42,
)
model.fit(X_train, y_train)

We can check its accuracy:

In [24]:
# Reuse the transformers fitted on the training data: transform only, never
# re-fit on validation data.
X_val_num = scaler.transform(df_val[num].values)
X_val_cat = ohe.transform(df_val[cat].values)

# Same column order as the training matrix: numeric first, then one-hot.
X_val = np.column_stack([X_val_num, X_val_cat])

In [25]:
# Second column of predict_proba is the probability of the second class;
# predictions at or above 0.5 count as positive.
y_pred = model.predict_proba(X_val)[:, 1]
accuracy_score(y_true=y_val, y_pred=(y_pred >= 0.5))

0.7978320891706718

It's a little bit better than the version without scaled features.