In [17]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#time series
import datetime
import time
from statsmodels.tsa.seasonal import seasonal_decompose
import requests

import sklearn
from sklearn.decomposition import FactorAnalysis
import sklearn.datasets
from factor_analyzer import FactorAnalyzer

#decision tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

#visualizing tree
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydot

complete_df = pd.read_csv("DataSets/Complete_Data.csv")
%matplotlib inline

# Decision Tree
<br>

This section will discuss the working of a decision tree in generar.<br><br>
The following explanation about the structure of a decision tree is given by Hunt (1962) in one of the earliest papers about decision trees:<br>
* Let Dt be the set of training records that reaches a node t. 
* If Dt contains records that belong to the same class, let that class be called Yt, then t is a leaf node labelled as Yt. 
* If Dt is an empty set or the attribute values are the same, then t is a leaf node labelled by the default class, Yd (the majority class for the parent Dt). 
* If Dt contains records that belong to more than one class, use an attribute test (this differs for different data types) to split the data into smaller subsets. 
* Recursively apply this procedure to each subset.
<br><br>
Following this algorithm produces a decision tree. <br>
A decision tree is a tree-like graph system that is used to learn a classification function which concludes the value of a dependent attribute given the values of the independent input variables

# J48 Decision Tree
A major task of a decision tree, is determining on which attribute to split next. <br>
J48 uses entropy (entropy measures the homogeneity of a node) to decide on which attribute to split next. 

Formula 1 shows how to calculate the entropy on the dataset.
<br><br>
1.  Entropy(t)= $\sum_{j}$ 𝑝(𝑗|𝑡) 𝑙𝑜𝑔2 𝑝(𝑗|𝑡)                 
<br><br>
Here p(j|t) is the relative frequency of class j at node t. Entropy ranges from 0.0 (where all tuples belong to one class, which implies the most information gain), to 1.0 (the tuples are split evenly between the classes). 
<br> Once the Entropy for every leaf, coming from a parent node is calculated, the average Entropy gain (information gain) for splitting a certain parent attribute into x amount of leaves, can be calculated with formula 2:
<br><br>
2. 𝐺𝐴𝐼𝑁𝑠𝑝𝑙𝑖𝑡=Entropy (p) − ( $\sum_{i=1}^k$ 𝑛𝑖/𝑛 * 𝐸𝑛𝑡𝑟𝑜𝑝𝑦(𝑖) )      
<br><br>
Here p is a parent node (that is split into k partitions). <br>
𝑛𝑖/n is the number of tuples in partition i, divided by the total amount of tuples that reached the parent node.
<br>However, instead of information gain (Gain split) the current j48 decision tree uses gain ratio. 
<br>This is a slightly more optimised measure to decide on which attribute to split compared to the standard information gain. 
<br>This is because it gives a higher entropy to decision trees with a lot of small partitions. 
<br>In data mining complex trees are often times avoided because attribute splits into a lot of different leaves tend to produce models that overfit. 
<br>This is because the more complex a model becomes, the smaller the amount of tuples become that reach a specific leaf node. 
<br>A small amount of tuples per leaf node means that there is a high chance that noise, or just simply variance dominates the classifier.
<br>The following formula attempt to solve this problem by calculating the gain ratio instead.
<br><br>
3. SplitINFO=−  $\sum_{i=1}^k$ * 𝑛𝑖/𝑛 * 𝑙𝑜𝑔2 * 𝑛i/𝑛



In [18]:
def Price_TO_Category(x): #decision tree can only use categoric value
    """
    transforms dependent variable price/day into 4 equal groups:
    1 cheap       price < €96
    2 mid-range   96  < price <= 125
    3 expensive   125 < price <= 175
    4 overpriced  price > 175
    """
    if  x <=96:
        return 1                   #cheap
    if  x > 96 and x <= 125:
         return 2                  #mid-range
    if  x > 125 and x <= 175:
         return 3                  #expensive
    if x >175:
        return 4                   #overpriced
    
complete_df["price"] = complete_df["price"].apply(Price_TO_Category)


def transform_neighbourhood(x):
    """transforms neighbourhood into 4 categories
        Noord = 1
        East  = 2
        West  = 3
        Zuid  = 4
     """
    if  x.__contains__("Noord"):
        return 1
    if  x.__contains__("Oost"):
        return 2
    if  x.__contains__("Zuid"):
        return 3
    if  x.__contains__("West"):
        return 4
    else:
        return 5
    
complete_df["neighbourhood"] = complete_df["neighbourhood"].apply(transform_neighbourhood)


def transform_room_type(x):
    """transforms neighbourhood into 4 categories
        Noord = 1
        East  = 2
        West  = 3
        Zuid  = 4
     """
    if  x.__contains__("Private room"):
        return 1
    if  x.__contains__("Entire home/apt"):
        return 2
    if  x.__contains__("Shared room"):
        return 3

complete_df["room_type"] = complete_df["room_type"].apply(transform_room_type)


def property_type(x):
    """transforms neighbourhood into 4 categories
        Noord = 1
        East  = 2
        West  = 3
        Zuid  = 4
     """
    
    #house
    if  x.__contains__("Townhouse" or "houseboat" or "house" or "Tiny house" or "Earth house" or "Villa" or "Cottage"):
        return 1 
    #hotel                
    if  x.__contains__("Bed and breakfast" or "Hostel" or "Hotel" or "Aparthotel" or "Guest suite" or "Loft" or "Cabin"):
        return 2
    #apartment
    if  x.__contains__("Apartment"):
        return 3
    #something else
    else: 
        return 4
   
complete_df["property_type"] = complete_df["property_type"].apply(property_type)
                                                          
    
def bed_type(x):
    """transforms neighbourhood into 4 categories
        Noord = 1
        East  = 2
        West  = 3
        Zuid  = 4
     """
    if  x.__contains__("Real Bed"):
        return 1
    if  x.__contains__("Futon"):
        return 2
    if  x.__contains__("Pull-out Sofa"):
        return 3
    if  x.__contains__("Airbed"):
        return 4
    if  x.__contains__("Couch"):
        return 5

complete_df["bed_type"] = complete_df["bed_type"].apply(bed_type)                                                      
                                                          

In [19]:
#we split into even frequencies so zeroR accuracy would be 25%
complete_df["price"].value_counts()

2    5086
3    5081
1    5019
4    4844
Name: price, dtype: int64

# Training the model

In [20]:
#independent var
X = complete_df.drop("price",axis=1)

#dependent var
y = complete_df["price"]

#splitting on train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#instantiating decision tree
dtree = DecisionTreeClassifier()

#fitting tree
dtree.fit(X_train,y_train)

predictions = dtree.predict(X_test)


# Results

In [21]:
print(confusion_matrix(y_test,predictions))
print("\n")
print(classification_report(y_test,predictions))

[[757 376 259 120]
 [412 506 405 223]
 [243 438 478 353]
 [125 242 355 717]]


              precision    recall  f1-score   support

           1       0.49      0.50      0.50      1512
           2       0.32      0.33      0.33      1546
           3       0.32      0.32      0.32      1512
           4       0.51      0.50      0.50      1439

   micro avg       0.41      0.41      0.41      6009
   macro avg       0.41      0.41      0.41      6009
weighted avg       0.41      0.41      0.41      6009



In [22]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)

In [23]:
print(confusion_matrix(y_test,rfc_pred))
print("\n")
print(classification_report(y_test,rfc_pred))


[[915 331 205  61]
 [381 575 396 194]
 [181 451 499 381]
 [ 59 145 312 923]]


              precision    recall  f1-score   support

           1       0.60      0.61      0.60      1512
           2       0.38      0.37      0.38      1546
           3       0.35      0.33      0.34      1512
           4       0.59      0.64      0.62      1439

   micro avg       0.48      0.48      0.48      6009
   macro avg       0.48      0.49      0.48      6009
weighted avg       0.48      0.48      0.48      6009



# Visualiazing regular Decision tree

In [11]:
features = list(complete_df.columns[1:])
features

['room_type',
 'minimum_nights',
 'number_of_reviews',
 'calculated_host_listings_count',
 'availability_365',
 'host_since',
 'property_type',
 'accommodates',
 'bedrooms',
 'bed_type',
 'amenities',
 'guests_included',
 'maximum_nights',
 'review_scores_rating',
 'cancellation_policy',
 'average_review_other',
 'price']

In [27]:
dot_data = StringIO()
export_graphviz(dtree,out_file=dot_data,feature_names=features,filled=True,round)

graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())

SyntaxError: positional argument follows keyword argument (<ipython-input-27-3cc3d2c3cb6b>, line 2)

In [26]:
from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import pydot 

features = list(complete_df.columns[1:])
features

['room_type',
 'minimum_nights',
 'number_of_reviews',
 'calculated_host_listings_count',
 'availability_365',
 'host_since',
 'property_type',
 'accommodates',
 'bedrooms',
 'bed_type',
 'amenities',
 'guests_included',
 'maximum_nights',
 'review_scores_rating',
 'cancellation_policy',
 'average_review_other',
 'price']

In [29]:
dot_data = StringIO()  
export_graphviz(dtree, out_file=dot_data,feature_names=features,filled=True,rounded=True)

graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph[0].create_png())  

FileNotFoundError: [WinError 2] "dot" not found in path.