# Tutorial 2 - Log Regression

We will predict whether the price is greater than $150 (`price_gte_150` column) of an AirBNB listing in Boston given a number of features about the listing.

**Therefore, our unit of analysis is an AIRBNB LISTING**

# Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

np.random.seed(142)


# Get the data

In [2]:
#We will predict the "price" value in the data set:

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150


# Split the data into train and test

In [3]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(airbnb, test_size=0.3)

### Be careful: we haven't seperated the target column yet

## Check the missing values

In [4]:
train_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                              8
bedrooms                               8
beds                                   5
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 569
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [5]:
test_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          0
room_type                              0
accommodates                           0
bathrooms                              6
bedrooms                               2
beds                                   4
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 231
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

# Data Prep

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Drop the variables we can't use in this tutorial

In [7]:
# We can't use the following columns in this tutorial, because they are for classification tasks

train = train_set.drop(['price', 'price_category'], axis=1)
test = test_set.drop(['price', 'price_category'], axis=1)

## Separate the target variable (we don't want to transform it)

In [8]:
train_y = train[['price_gte_150']]
test_y = test[['price_gte_150']]

train_inputs = train.drop(['price_gte_150'], axis=1)
test_inputs = test.drop(['price_gte_150'], axis=1)

##  Identify the numerical and categorical columns

### Option 1: Manually

### Option 2: Programmatically

In [9]:
train_inputs.dtypes

host_is_superhost                      int64
host_identity_verified                 int64
neighbourhood_cleansed                object
latitude                             float64
longitude                            float64
property_type                         object
room_type                             object
accommodates                           int64
bathrooms                            float64
bedrooms                             float64
beds                                 float64
bed_type                              object
Number of amenities                    int64
guests_included                        int64
price_per_extra_person                 int64
minimum_nights                         int64
number_of_reviews                      int64
number_days_btw_first_last_review      int64
review_scores_rating                 float64
cancellation_policy                   object
dtype: object

In [10]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [11]:
# Identify the binary columns so we can pass them through without transforming
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [12]:
# Be careful: numerical columns already includes the binary columns,
# So, we need to remove the binary columns from numerical columns.

for col in binary_columns:
    numeric_columns.remove(col)

In [13]:
binary_columns

['host_is_superhost', 'host_identity_verified']

In [14]:
numeric_columns

['latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'Number of amenities',
 'guests_included',
 'price_per_extra_person',
 'minimum_nights',
 'number_of_reviews',
 'number_days_btw_first_last_review',
 'review_scores_rating']

In [15]:
categorical_columns

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy']

# Pipeline

In [16]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())])

In [17]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [18]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [19]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='drop')

#passtrough is an optional step. You don't have to use it.

# Transform: fit_transform() for TRAIN

In [20]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-0.57516407, -0.16927383, -1.15578785, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.1958557 ,  0.18733179, -0.58455222, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.42207037,  0.69457073, -0.01331659, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.04321532, -0.1499537 , -1.15578785, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.7776294 , -1.47612124, -0.58455222, ...,  0.        ,
         1.        ,  0.        ],
       [-1.41278267, -0.78143866, -0.58455222, ...,  0.        ,
         0.        ,  1.        ]])

In [21]:
train_x.shape

(2488, 66)

# Tranform: transform() for TEST

In [22]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 1.39510971,  1.60175586,  1.70039032, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.77451956, -1.74858009,  1.70039032, ...,  0.        ,
         0.        ,  1.        ],
       [-0.05462059,  0.13283793,  0.55791905, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.60448836,  0.73614916,  1.12915468, ...,  0.        ,
         0.        ,  0.        ],
       [-0.99210521, -0.23519825, -1.15578785, ...,  0.        ,
         1.        ,  1.        ],
       [-1.44872042, -0.7116745 , -0.58455222, ...,  0.        ,
         0.        ,  1.        ]])

In [23]:
test_x.shape

(1067, 66)

# Calculate the baseline

In [24]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_y, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5020096463022508


In [27]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_y, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.4967197750702905


# Train a Logistic Regression model

In [28]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='none')

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Predicted vs. Actual values

In [29]:
log_reg.predict(test_x)

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [30]:
# Create a new DataFrame

predictions = pd.DataFrame(log_reg.predict(test_x), columns=['Predicted'])

predictions

Unnamed: 0,Predicted
0,1
1,1
2,1
3,1
4,0
...,...
1062,1
1063,1
1064,0
1065,0


In [31]:
# Add the actual to the same DataFrame

predictions['Actual'] = np.array(test_y)

predictions

Unnamed: 0,Predicted,Actual
0,1,1
1,1,1
2,1,0
3,1,1
4,0,0
...,...,...
1062,1,1
1063,1,0
1064,0,0
1065,0,0


# Calculate the overall accuracy

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8786173633440515

In [34]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.85941893158388

## Classification Matrix

In [35]:
from sklearn.metrics import confusion_matrix

#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[448,  89],
       [ 61, 469]], dtype=int64)

## Classification Report

In [36]:
from sklearn.metrics import classification_report

#We usually create the classification report on test set
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.86       537
           1       0.84      0.88      0.86       530

    accuracy                           0.86      1067
   macro avg       0.86      0.86      0.86      1067
weighted avg       0.86      0.86      0.86      1067



# No need for Stochastic Gradient Descent

- We can do everything such L1, L2, ElasticNet regularization within LogisticRegression(). 
- Look at the documentation for the options and solvers we can use.

**Note: Logistic Regression and Stochastic Gradient Descent use different loss functions/solvers. They might come to the same conclusion, but their approach are different.**

# Change solver to liblinear solver (better for small data sets)

In [37]:
log_reg = LogisticRegression(solver='liblinear')

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [38]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8737942122186495

In [39]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8641049671977507

# Stochastic Gradient Classifier for comparison

In [40]:
from sklearn.linear_model import SGDClassifier 

# tol = stopping criterion
# eta0 = learning rate
# penalty = regularization term
# max_iter = number of passes over training data (i.e., epochs)

sgd_logreg = SGDClassifier(max_iter=100, penalty=None, eta0=0.01) 

sgd_logreg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [41]:
#Predict the train values
train_y_pred = sgd_logreg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8705787781350482

In [42]:
#Predict the test values
test_y_pred = sgd_logreg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.851921274601687

# L2 Regularization

In [43]:
log_reg = LogisticRegression(penalty='l2')

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8741961414790996

In [45]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8641049671977507

# L1 Regularization

In [46]:
log_reg = LogisticRegression(solver='liblinear', penalty='l1')

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [47]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8741961414790996

In [48]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8641049671977507

# ElasticNet

In [49]:
log_reg = LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5)

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [50]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8697749196141479

In [51]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8650421743205249

# Polynomial Logistic Regression

This is done by creating the polynomial "variables" of the existing variables, then fitting them in a regular logistic regression model


In [52]:
from sklearn.preprocessing import PolynomialFeatures

# Create second degree terms and interaction terms
poly_features = PolynomialFeatures(degree=2).fit(train_x)

train_x_poly = poly_features.transform(train_x)

test_x_poly = poly_features.transform(test_x)

#Mind you, this will create the polynomial terms of the categorical variables too

#if degree=3, then it creates all combinations: a, a^2, a^3, b, b^2, b^3, a.b, a^2.b, a.b^2, a^2.b^2 

In [53]:
log_reg = LogisticRegression()

log_reg.fit(train_x_poly, train_y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
#Predict the train values
train_y_pred = log_reg.predict(train_x_poly)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9437299035369775

In [55]:
#Predict the test values
test_y_pred = log_reg.predict(test_x_poly)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8434864104967198

## Let's regularize the polynomial regression

In [56]:
# C is the inverse of alpha. It can only be positive
# Lower C means more regularization

log_reg = LogisticRegression(penalty='l2', 
                              C=0.01)

log_reg.fit(train_x_poly, train_y)

  y = column_or_1d(y, warn=True)


In [57]:
#Predict the train values
train_y_pred = log_reg.predict(train_x_poly)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8794212218649518

In [58]:
#Predict the test values
test_y_pred = log_reg.predict(test_x_poly)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8603561387066542