# STEP -01: Import all the standard libraries¶

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy import stats
np.random.seed(1)

# Step 02  Load the data

In [2]:
X_train = pd.read_csv('./airbnb_train_X_price_gte_150.csv') 
y_train = pd.read_csv('./airbnb_train_y_price_gte_150.csv') 
X_test = pd.read_csv('./airbnb_test_X_price_gte_150.csv') 
y_test = pd.read_csv('./airbnb_test_y_price_gte_150.csv') 

# Step 3 Fit a SVM classification model using polynomial kernal with precision metric by using Grid search cv

In [17]:
score_measure = "precision"
kfolds = 5

param_grid = {
     'C': [0.1,1, 10, 100], 
     'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}

svm = SVC()
grid_search = GridSearchCV(estimator = svm, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


  y = column_or_1d(y, warn=True)


The best precision score is 0.9370404920282969
... with parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'poly'}


In [18]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.6073102 Precision=0.9117647 Recall=0.2335217 F1=0.3718141


# Fit a SVM classification model using polynomial kernal with precision metric by using Randomized search cv

In [19]:
score_measure = "precision"
kfolds = 5

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}

svc_rs = SVC()
rand_search = RandomizedSearchCV(estimator = svc_rs, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_



Fitting 5 folds for each of 16 candidates, totalling 80 fits


  y = column_or_1d(y, warn=True)


The best precision score is 0.9370404920282969
... with parameters: {'kernel': 'poly', 'gamma': 0.01, 'C': 0.1}


In [20]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.6073102 Precision=0.9117647 Recall=0.2335217 F1=0.3718141


# Fit a Decision tree classifier model by optimising  precision metric by using Grid search cv

In [21]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(30,36),  
    'min_samples_leaf': np.arange(6,12),
    'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    'max_leaf_nodes': np.arange(162,168), 
    'max_depth': np.arange(15,21), 
    'criterion': ['entropy'],
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 5 folds for each of 9072 candidates, totalling 45360 fits
The best precision score is 0.8470330066484271
... with parameters: {'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': 162, 'min_impurity_decrease': 0.0048, 'min_samples_leaf': 10, 'min_samples_split': 30}


In [22]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8462980 Precision=0.8379374 Recall=0.8568738 F1=0.8472998


# Fit a Decision tree classifier model by optimising precision metric by using Random search cv

In [23]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best precision score is 0.8570034316615264
... with parameters: {'min_samples_split': 2, 'min_samples_leaf': 20, 'min_impurity_decrease': 0.0021, 'max_leaf_nodes': 56, 'max_depth': 31, 'criterion': 'gini'}


In [24]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8333333 Recall=0.8662900 F1=0.8494922


# Analysis:
1. Firstly, as mentioned in question I have loaded the data from airbnb with a target of price_gte_150 t.
2. Then I used the Grid search cv and also the Random search cv for fitting my two models svm and decition trees by optimising the precision metric.
3. From the results if i observe for SVM classifier with polynomial kernel both grid search and random search are performing best and having the best precision score of about 93.7.
4. If we consider the results for decision tree the best precision score using grid search cv is 84.7.
5. By using random search cv is 85.7.
6. so in decision trees random search cv is performing best.
7. overall, if we consider best precision score for svm model is high when compared to decision trees.