## Setup

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

np.random.seed(1)

## Load data

In [2]:
X_train = pd.read_csv('./airbnb_train_X_price_gte_150.csv') 
y_train = pd.read_csv('./airbnb_train_y_price_gte_150.csv') 
X_test = pd.read_csv('./airbnb_test_X_price_gte_150.csv') 
y_test = pd.read_csv('./airbnb_test_y_price_gte_150.csv') 

##  Model the data

In [3]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

## SVM classification model using polynomial kernal

In [9]:
from sklearn.svm import SVC

In [11]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [12]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"svm with polynomial kernel", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [13]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with polynomial kernel,0.867854,0.855839,0.883239,0.869323


## DTree Classifer

In [None]:
Conduct an initial random search across a wide range of possible parameters.

# Randomized Search

In [33]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,50),  
    'min_samples_leaf': np.arange(1,50),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 200), 
    'max_depth': np.arange(1,60), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best precision score is 0.8543925220428145
... with parameters: {'min_samples_split': 24, 'min_samples_leaf': 11, 'min_impurity_decrease': 0.0031, 'max_leaf_nodes': 56, 'max_depth': 37, 'criterion': 'entropy'}


50 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\akhil\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\akhil\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\akhil\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

 0.82488631 0.84158705 0.82488631 0.82488631 0.83346137 0.8317569
 0.8251445

In [22]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8462980 Precision=0.8379374 Recall=0.8568738 F1=0.8472998


In [None]:
Conduct an exhaustive search across a smaller range of parameters around the parameters found in the initial random search.

## Grid Search

In [34]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(22,26),  
    'min_samples_leaf': np.arange(9,13),
    'min_impurity_decrease': np.arange(0.0029, 0.0033, 0.0001),
    'max_leaf_nodes': np.arange(54,58), 
    'max_depth': np.arange(35,40), 
    'criterion': ['entropy'],
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits
The best precision score is 0.8557167358082418
... with parameters: {'criterion': 'entropy', 'max_depth': 35, 'max_leaf_nodes': 54, 'min_impurity_decrease': 0.0031999999999999993, 'min_samples_leaf': 10, 'min_samples_split': 22}


In [35]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8552124 Recall=0.8342750 F1=0.8446139


## discussion section

In [None]:
#Here we did Random search with the number of iterations of 500 and then doing Grid search around the values which we found on random serach.
#With these values we found around 8000 fits in Grid search
#By Comparing with these models, We found SVM (with poly kernel) has the better precision of 0.855839