# Tutorial 1 - SVM Classification


We will perform two prediction tasks:
1) Whether the price of an AIRBNB listing is greater than or equal to $150 (`price_gte_150` column),<br>
2) What is the price category, among 4 categories, of an AIRBNB listing (`price_category` column)

**The unit of analysis is an AIRBNB LISTING**

# Setup

In [5]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so a top-to-bottom run is repeatable.
# NOTE(review): scikit-learn helpers called without random_state (e.g. the
# train/test split further down) are expected to draw from this generator — confirm.
np.random.seed(42)


# Get the data

In [6]:
# Load the listings. The binary task predicts the "price_gte_150" column;
# the multi-class task later in the notebook predicts "price_category".

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150


In [7]:
airbnb

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550,0,1,Charlestown,42.380968,-71.083795,Apartment,Private room,3,1.0,1.0,...,1,24,2,4,344,90.0,strict,69,0,lte_$75
3551,0,1,Allston,42.347503,-71.118336,Apartment,Private room,2,1.0,1.0,...,1,0,3,0,0,,strict,150,1,btw_$75-$150
3552,0,0,Charlestown,42.371771,-71.071300,Apartment,Entire home/apt,4,1.0,1.0,...,1,0,3,0,0,,flexible,198,1,btw_$151-$225
3553,0,1,Charlestown,42.385226,-71.080923,Apartment,Private room,2,1.0,1.0,...,1,30,1,2,8,90.0,strict,65,0,lte_$75


# Split the data into train and test

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings for testing.
# random_state is pinned so that re-running this cell on its own (not only a
# full Restart & Run All) always yields the same train/test rows.
train_set, test_set = train_test_split(airbnb, test_size=0.3, random_state=42)

## Drop the variables we can't use for the binary task

In [9]:
# 'price' and 'price_category' both encode the listing price, and the binary
# target 'price_gte_150' is derived from that price, so keeping them among the
# inputs would leak the answer. ('price_category' is read again later, from
# train_set/test_set, as the target of the multi-class task.)

train = train_set.drop(['price', 'price_category'], axis=1)
test = test_set.drop(['price', 'price_category'], axis=1)

### Be careful: we haven't separated the target column yet

## Check the missing values

In [10]:
train_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          2
room_type                              0
accommodates                           0
bathrooms                             10
bedrooms                               8
beds                                   6
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 556
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [11]:
test_set.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          1
room_type                              0
accommodates                           0
bathrooms                              4
bedrooms                               2
beds                                   3
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 244
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

# Data Prep

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the target variable (we don't want to transform it)

In [13]:
# Target for the binary task. Select it with single brackets so it is a 1-D
# Series: scikit-learn estimators expect y of shape (n_samples,), and a
# one-column DataFrame makes every fit() emit a column_or_1d conversion warning.
train_y = train['price_gte_150']
test_y = test['price_gte_150']

# Everything except the target is an input feature.
train_inputs = train.drop(['price_gte_150'], axis=1)
test_inputs = test.drop(['price_gte_150'], axis=1)

##  Identify the numerical and categorical columns

### Option 1: Manually

### Option 2: Programmatically

In [14]:
train_inputs.dtypes

host_is_superhost                      int64
host_identity_verified                 int64
neighbourhood_cleansed                object
latitude                             float64
longitude                            float64
property_type                         object
room_type                             object
accommodates                           int64
bathrooms                            float64
bedrooms                             float64
beds                                 float64
bed_type                              object
Number of amenities                    int64
guests_included                        int64
price_per_extra_person                 int64
minimum_nights                         int64
number_of_reviews                      int64
number_days_btw_first_last_review      int64
review_scores_rating                 float64
cancellation_policy                   object
dtype: object

In [15]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [16]:
# Binary (0/1) columns, listed by hand. They are kept apart from the other
# numeric columns so they are not standardised; the pipeline section below
# gives them their own transformer.
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [17]:
# The dtype-based selection also picked up the binary columns (they are int64),
# so take them out of the numeric list.
# A filtering comprehension is used instead of list.remove() so the cell is
# idempotent: running it a second time is a no-op rather than a ValueError.
numeric_columns = [col for col in numeric_columns if col not in binary_columns]

In [18]:
binary_columns

['host_is_superhost', 'host_identity_verified']

In [19]:
numeric_columns

['latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'Number of amenities',
 'guests_included',
 'price_per_extra_person',
 'minimum_nights',
 'number_of_reviews',
 'number_days_btw_first_last_review',
 'review_scores_rating']

In [20]:
categorical_columns

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy']

# Pipeline

In [21]:
# Numeric features: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [22]:
# Categorical features: missing values become the literal category 'unknown',
# then one-hot encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [23]:
# Binary flags: only fill missing values with the most common value
# (no scaling, no encoding)
binary_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent"))]
)

In [24]:
# Route each group of columns to its own transformer and concatenate the results.
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='drop')

# remainder='drop' discards any column not listed in one of the groups above.
# remainder='passthrough' is the optional alternative: it would keep such columns untransformed.

# Transform: fit_transform() for TRAIN

In [25]:
# Fit the preprocessor on the TRAIN inputs only and transform them in the same call.
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.82254842,  0.69215829,  0.54753414, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.55146572,  0.15729058,  0.54753414, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.07311286, -1.97951247, -0.59100739, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.61093878, -0.07631528,  3.96315871, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.17819153, -0.94575177, -1.16027815, ...,  0.        ,
         0.        ,  0.        ],
       [-0.33618088,  1.03587419, -0.59100739, ...,  0.        ,
         0.        ,  1.        ]])

In [26]:
train_x.shape

(2488, 66)

# Transform: transform() for TEST

In [27]:
# Apply the already-fitted preprocessor to the TEST inputs: transform() only, no re-fitting.
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.21269719, -1.20324989,  0.54753414, ...,  0.        ,
         0.        ,  1.        ],
       [-2.86419979, -2.67831359, -0.59100739, ...,  0.        ,
         0.        ,  0.        ],
       [-0.11443035,  1.26295963, -0.59100739, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.47803436, -1.63486781, -0.59100739, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.59928397,  0.34795157,  2.82461719, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.19953968,  0.22845713, -0.59100739, ...,  0.        ,
         0.        ,  1.        ]])

In [28]:
test_x.shape

(1067, 66)

# SVM - Binary classification

## Baseline Accuracy

In [29]:
from sklearn.dummy import DummyClassifier

# Baseline: strategy="most_frequent" predicts the most frequent train label for every row.
dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y)

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
# Baseline train accuracy: score the dummy predictions against the train labels
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_y, y_pred=dummy_train_pred)

print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

Baseline Train Accuracy: 0.5084405144694534


In [32]:
# Baseline test accuracy: same dummy model, scored on the held-out labels
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_y, y_pred=dummy_test_pred)

print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

Baseline Test Accuracy: 0.4817244611059044


## SVC(kernel='linear')


In [33]:
from sklearn.svm import SVC

# Linear-kernel SVM; C is not overridden, so scikit-learn's default applies.
lin_svm = SVC(kernel="linear")

# fit() expects a 1-D label array. np.ravel flattens the target so that a
# one-column DataFrame does not trigger the column_or_1d conversion warning
# (it is a no-op on a target that is already 1-D).
lin_svm.fit(train_x, np.ravel(train_y))

  y = column_or_1d(y, warn=True)


## Accuracy

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
# Score the linear SVM on the rows it was fitted on
train_y_pred = lin_svm.predict(train_x)

# Fraction of train labels predicted correctly
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.867363344051447

In [36]:
# Score the linear SVM on the held-out rows
test_y_pred = lin_svm.predict(test_x)

# Fraction of test labels predicted correctly
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.8762886597938144

## Confusion Matrix

In [37]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test set: rows are the actual classes, columns the predicted ones
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

array([[463,  90],
       [ 42, 472]], dtype=int64)

## Classification Report

In [38]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set
report_text = classification_report(y_true=test_y, y_pred=test_y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       553
           1       0.84      0.92      0.88       514

    accuracy                           0.88      1067
   macro avg       0.88      0.88      0.88      1067
weighted avg       0.88      0.88      0.88      1067



## SVC(kernel='poly') 



In [43]:
from sklearn.svm import SVC

# Polynomial kernel of degree 3. coef0 is the kernel's independent term and
# C the regularisation parameter. gamma is not passed here, so scikit-learn's
# default is used (gamma='scale' in current releases; older releases used
# 'auto' — check the installed version).
pol_svm = SVC(kernel="poly", degree=3, coef0=0.1, C=1)

# Flatten the target to 1-D so fit() does not emit the column_or_1d warning.
pol_svm.fit(train_x, np.ravel(train_y))

  y = column_or_1d(y, warn=True)


In [44]:
# Score the polynomial SVM on the rows it was fitted on
train_y_pred = pol_svm.predict(train_x)

# Fraction of train labels predicted correctly
accuracy_score(y_true=train_y, y_pred=train_y_pred)

0.8870578778135049

In [45]:
# Score the polynomial SVM on the held-out rows
test_y_pred = pol_svm.predict(test_x)

# Fraction of test labels predicted correctly
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.8772258669165885

## SVC(kernel='rbf')



In [1]:
# RBF-kernel SVM for the binary task. gamma='scale' lets scikit-learn derive
# gamma from the data; gamma='auto' is the other built-in choice.
rbf_svm = SVC(kernel="rbf", C=5, gamma='scale')

# Flatten the target to 1-D so fit() does not emit the column_or_1d warning.
rbf_svm.fit(train_x, np.ravel(train_y))

NameError: name 'SVC' is not defined

In [2]:
# Score the RBF SVM on the rows it was fitted on
train_y_pred = rbf_svm.predict(train_x)

# Fraction of train labels predicted correctly
accuracy_score(y_true=train_y, y_pred=train_y_pred)

NameError: name 'rbf_svm' is not defined

In [60]:
# Score the RBF SVM on the held-out rows
test_y_pred = rbf_svm.predict(test_x)

# Fraction of test labels predicted correctly
accuracy_score(y_true=test_y, y_pred=test_y_pred)

0.8781630740393627

# Multi Class Classification


In [61]:
train_set[['price_category']].head(10)

Unnamed: 0,price_category
1965,gte_226
1450,btw_$151-$225
2503,lte_$75
944,lte_$75
199,btw_$75-$150
2167,gte_226
2022,gte_226
801,btw_$75-$150
2140,gte_226
2289,btw_$75-$150


In [64]:
# New target for the multi-class task: the price_category label.
# It is read from train_set/test_set because it was dropped from train/test.
# Single brackets give a 1-D Series, which is the shape scikit-learn expects
# for y (a one-column DataFrame triggers a column_or_1d warning on every fit).
train_y_multiclass = train_set['price_category']
test_y_multiclass = test_set['price_category']

## Baseline

In [65]:
from sklearn.dummy import DummyClassifier

# Multi-class baseline: predict the most frequent price_category for every row.
# NOTE: this rebinds dummy_clf, replacing the binary baseline fitted earlier.
dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y_multiclass)

In [66]:
from sklearn.metrics import accuracy_score

In [67]:
# Multi-class baseline, train accuracy
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_y_multiclass, y_pred=dummy_train_pred)

print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

Baseline Train Accuracy: 0.3311897106109325


In [68]:
# Multi-class baseline, test accuracy
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_y_multiclass, y_pred=dummy_test_pred)

print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

Baseline Test Accuracy: 0.3402061855670103


## SVC(kernel='linear')

In [68]:
# decision_function_shape='ovr' makes decision_function() return one-vs-rest
# style scores, one column per class. It does not change how the model is
# trained: per the scikit-learn documentation, SVC always trains one-vs-one
# classifiers internally for multi-class targets.
svm_clf = SVC(kernel="linear", C=10, decision_function_shape='ovr')

# Flatten the target to 1-D so fit() does not emit the column_or_1d warning.
svm_clf.fit(train_x, np.ravel(train_y_multiclass))

  y = column_or_1d(y, warn=True)


In [69]:
# Score the multi-class linear SVM on the rows it was fitted on
train_y_pred = svm_clf.predict(train_x)

# Fraction of train categories predicted correctly
accuracy_score(y_true=train_y_multiclass, y_pred=train_y_pred)

0.6229903536977492

In [70]:
# Score the multi-class linear SVM on the held-out rows
test_y_pred = svm_clf.predict(test_x)

# Fraction of test categories predicted correctly
accuracy_score(y_true=test_y_multiclass, y_pred=test_y_pred)

0.6494845360824743

In [71]:
# Test-set confusion matrix (rows: actual category, columns: predicted category)
confusion_matrix(y_true=test_y_multiclass, y_pred=test_y_pred)

array([[148,  35,  59,   3],
       [ 53, 201,  13,  96],
       [ 54,  13, 154,   0],
       [  3,  43,   2, 190]], dtype=int64)

## SVC(kernel='poly')

In [104]:
# Degree-3 polynomial SVM for the multi-class target, with the same kernel
# settings as the binary polynomial model.
pol_svm2 = SVC(kernel="poly", degree=3, coef0=0.1, C=1, decision_function_shape='ovr')

# Flatten the target to 1-D so fit() does not emit the column_or_1d warning.
pol_svm2.fit(train_x, np.ravel(train_y_multiclass))

  y = column_or_1d(y, warn=True)


In [105]:
# Score the multi-class polynomial SVM on the rows it was fitted on
train_y_pred = pol_svm2.predict(train_x)

# Fraction of train categories predicted correctly
accuracy_score(y_true=train_y_multiclass, y_pred=train_y_pred)

0.7206591639871383

In [106]:
# Score the multi-class polynomial SVM on the held-out rows
test_y_pred = pol_svm2.predict(test_x)

# Fraction of test categories predicted correctly
accuracy_score(y_true=test_y_multiclass, y_pred=test_y_pred)

0.6532333645735707

In [107]:
# Test-set confusion matrix (rows: actual category, columns: predicted category)
confusion_matrix(y_true=test_y_multiclass, y_pred=test_y_pred)

array([[136,  44,  63,   2],
       [ 64, 239,   9,  51],
       [ 66,   9, 146,   0],
       [  4,  58,   0, 176]], dtype=int64)

## SVC(kernel='rbf')

In [117]:
# RBF SVM for the multi-class target with hand-picked C and gamma.
# NOTE: this rebinds rbf_svm, replacing the binary RBF model fitted earlier.
rbf_svm = SVC(kernel="rbf", C=10, gamma=0.1, decision_function_shape='ovr')

# Flatten the target to 1-D so fit() does not emit the column_or_1d warning.
rbf_svm.fit(train_x, np.ravel(train_y_multiclass))

  y = column_or_1d(y, warn=True)


In [118]:
# Score the multi-class RBF SVM on the rows it was fitted on
train_y_pred = rbf_svm.predict(train_x)

# Fraction of train categories predicted correctly
accuracy_score(y_true=train_y_multiclass, y_pred=train_y_pred)

0.9397106109324759

In [119]:
# Score the multi-class RBF SVM on the held-out rows
test_y_pred = rbf_svm.predict(test_x)

# Fraction of test categories predicted correctly
accuracy_score(y_true=test_y_multiclass, y_pred=test_y_pred)

0.6307403936269915

In [120]:
# Test-set confusion matrix (rows: actual category, columns: predicted category)
confusion_matrix(y_true=test_y_multiclass, y_pred=test_y_pred)

array([[115,  53,  74,   3],
       [ 58, 225,  15,  65],
       [ 50,  16, 155,   0],
       [  5,  55,   0, 178]], dtype=int64)

# Grid Search

In [121]:
from sklearn.model_selection import GridSearchCV

# 2 values of C x 2 values of gamma = 4 candidate settings
param_grid = [
    {'C': [5, 15],
     'gamma': [0.1, 0.2]}
  ]

rbf_svm = SVC(kernel="rbf", decision_function_shape='ovr')

# 4 candidates x 5 folds = 20 cross-validation fits, plus one final refit of
# the best candidate on all of train_x (GridSearchCV's default refit behaviour).
grid_search = GridSearchCV(rbf_svm, param_grid, cv=5,
                           scoring='accuracy', return_train_score=True)

# Flatten the target to 1-D; otherwise every one of those fits emits the
# column_or_1d conversion warning.
grid_search.fit(train_x, np.ravel(train_y_multiclass))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


The best hyperparameter combination found:

In [122]:
grid_search.best_params_

{'C': 5, 'gamma': 0.1}

In [123]:
grid_search.best_estimator_

Let's look at the score of each hyperparameter combination tested during the grid search:

In [124]:
# Mean cross-validated accuracy of each candidate, in the order they were tried
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

0.5811834864609343 {'C': 5, 'gamma': 0.1}
0.5763625932300631 {'C': 5, 'gamma': 0.2}
0.5711409016347079 {'C': 15, 'gamma': 0.1}
0.5703312242935525 {'C': 15, 'gamma': 0.2}


In [125]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.579297,0.039616,0.281025,0.022028,5,0.1,"{'C': 5, 'gamma': 0.1}",0.598394,0.584337,0.576305,...,0.581183,0.009821,1,0.917588,0.917588,0.918593,0.914114,0.918132,0.917203,0.00159
1,0.828623,0.014586,0.318754,0.024515,5,0.2,"{'C': 5, 'gamma': 0.2}",0.566265,0.598394,0.574297,...,0.576363,0.017029,2,0.968844,0.966834,0.971859,0.966349,0.965846,0.967947,0.002205
2,0.660875,0.026258,0.26873,0.024485,15,0.1,"{'C': 15, 'gamma': 0.1}",0.576305,0.580321,0.558233,...,0.571141,0.011142,3,0.960302,0.958794,0.959799,0.96233,0.960321,0.960309,0.001153
3,0.784936,0.068181,0.284267,0.018079,15,0.2,"{'C': 15, 'gamma': 0.2}",0.580321,0.578313,0.568273,...,0.570331,0.012246,4,0.98995,0.989447,0.98794,0.989453,0.987443,0.988847,0.000973


# Grid Search: randomized

In [126]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import random

# Distributions the search samples from:
#   randint(low, high) draws integers in [low, high), i.e. C in 5..49
#   uniform(loc, scale) covers [loc, loc + scale], i.e. gamma in [0.1, 0.6]
#   (the second argument is a width, not an upper bound)
param_distribs = {
        'C': randint(low=5, high=50),
        'gamma': uniform(0.1, 0.5),
    }

rbf_svm = SVC(kernel="rbf", decision_function_shape='ovr')

# 5 sampled candidates x 5 folds = 25 cross-validation fits, plus one final
# refit of the best candidate; random_state pins which candidates are sampled.
rbf_search = RandomizedSearchCV(rbf_svm, param_distributions=param_distribs,
                                n_iter=5, cv=5, scoring='accuracy', random_state=42)

# Flatten the target to 1-D; otherwise every one of those fits emits the
# column_or_1d conversion warning.
rbf_search.fit(train_x, np.ravel(train_y_multiclass))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [127]:
# Mean cross-validated accuracy of each sampled candidate
cvres = rbf_search.cv_results_

for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

0.5273278223558217 {'C': 43, 'gamma': 0.49827149343011645}
0.5341615960825192 {'C': 19, 'gamma': 0.4659969709057026}
0.5687247985907413 {'C': 25, 'gamma': 0.17800932022121826}
0.5679240099229917 {'C': 23, 'gamma': 0.14998745790900145}
0.52050293730253 {'C': 15, 'gamma': 0.5330880728874676}


## Run the final model on the Test Set

In [128]:
# Take the winner of the grid search (the randomized search result is not used here)
final_model = grid_search.best_estimator_

test_predictions = final_model.predict(test_x)

# Held-out accuracy of the selected model on the multi-class target
accuracy_score(y_true=test_y_multiclass, y_pred=test_predictions)

0.6504217432052484