# Decision Tree

In [11]:
%pylab inline
%matplotlib inline
from matplotlib import pyplot as plt

from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark import *
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from collections import defaultdict
from functools import reduce

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate()

To select the dataset to use, remove the comment mark (#) from its line.
Example: to select the Bike Sharing dataset, remove the # from file = 'hour.csv' and from the other places where the matching code is commented out.

The code creates a header-less ("noheader") file for the selected dataset, so the dataset name only needs to be changed in the Decision Tree code;
in the Gradient Boosted and Linear Regression code, just remove the # from the required lines.

In [27]:
# Input CSV for this run; swap the active line with the commented one to switch datasets.
file = 'TourneyDetailedResults.csv'
#file = '2017.csv'

In [28]:
# Create a copy of the selected file without its header row (written to noheader_Tourney.csv)

In [29]:
# Copy the selected CSV to 'noheader_Tourney.csv', dropping its first (header) line.
with open(file, 'rb') as inp, open('noheader_Tourney.csv', 'wb') as out:
    inp.readline()        # consume the header so it is not copied
    out.writelines(inp)   # stream every remaining line through unchanged

In [6]:
# Load the header-less CSV as an RDD of text lines and split every line into string fields.
# NOTE(review): the cell above writes 'noheader_Tourney.csv' while this cell reads
# 'noheader.csv' — confirm which file is intended before a fresh Restart & Run All.
path = "noheader.csv"
raw_data = sc.textFile(path)
num_data = raw_data.count()
records = raw_data.map(lambda x: x.split(","))  # plain split: quoted commas are not handled
first = records.first()
print('First record: ', first)
print('Total number of records: ', num_data)

First record:  ['1', '2011-01-01', '1', '0', '1', '0', '0', '6', '0', '1', '0.24', '0.2879', '0.81', '0', '3', '13', '16']
Total number of records:  17379


In [7]:
# Mark the parsed records for caching; later cells run several Spark actions on them.
records.cache()

PythonRDD[4] at RDD at PythonRDD.scala:49

In [8]:
def get_mapping(rdd, idx):
    """Map each distinct value in column ``idx`` of ``rdd`` to an integer index.

    The (value, index) pairs are collected to the driver and returned as a dict.
    """
    column_values = rdd.map(lambda fields: fields[idx])
    indexed_values = column_values.distinct().zipWithIndex()
    return indexed_values.collectAsMap()

## Categorical features

In [9]:
# Show the value -> index mapping built for column 2 as a sample.
print("Mapping of first categorical feature column: %s" % get_mapping(records, 2))

Mapping of first categorical feature column: {'1': 0, '4': 1, '2': 2, '3': 3}


In [13]:
# One value -> index mapping per categorical column (columns 2..9).
mappings = [get_mapping(records, i) for i in range(2,10)]
#mappings = [get_mapping(records, i) for i in range(2,3)]
# Total number of distinct categorical values across all mapped columns.
cat_len = sum(len(mapping) for mapping in mappings)
# Numeric columns are 10..13, i.e. the tail of the record[2:14] slice used by
# extract_features_dt; the earlier [11:15] slice was shifted by one column
# (same length on this data, but it counted the wrong fields).
num_len = len(records.first()[10:14])
#num_len = len(records.first()[3:11])
total_len = num_len + cat_len
# Merge all per-column mappings into a single dict.
# NOTE(review): equal keys coming from different columns overwrite each other
# here, so this dict can be smaller than cat_len — confirm this is intended.
cat_fea = reduce(lambda a, b: dict(a,**b), mappings)

In [14]:
# Report the sizes computed in the previous cell.
print("Feature vector length for categorical features: %d" % cat_len)
print("Feature vector length for numerical features: %d" % num_len)
print("Total feature vector length: %d" % total_len)

Feature vector length for categorical features: 57
Feature vector length for numerical features: 4
Total feature vector length: 61


In [15]:
def extract_features_dt(record):
    """Return fields 2..13 of ``record`` as a float feature vector.

    ``record`` is a sequence of string fields (one split CSV line). The fields
    are converted to floats explicitly so the result is a numeric array rather
    than an array of strings.

    Raises:
        ValueError: if any selected field cannot be parsed as a float.
    """
    # Alternative column slice kept from the original notebook: record[3:11]
    return np.array(record[2:14], dtype=float)

In [16]:
def extract_label(record):
    """Return the last field of ``record`` converted to float (used as the label)."""
    target_field = record[-1]
    return float(target_field)

In [17]:
# Turn every parsed record into a LabeledPoint(label, features) for MLlib.
data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))

In [18]:
# Inspect the first LabeledPoint as a sanity check of label and feature extraction.
first_point = data_dt.first()
print("Decision Tree Label: " + str(first_point.label))
print("Decision Tree feature vector: " + str(first_point.features))
print("Decision Tree feature vector length: " + str(len(first_point.features)))

Decision Tree Label: 16.0
Decision Tree feature vector: [1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0]
Decision Tree feature vector length: 12


In [19]:
# Train a regression tree on the full data set; the empty categoricalFeaturesInfo
# means no feature is declared categorical to the trainer.
dt_model = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo={})
# Predictions are made on the same data the model was trained on.
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data_dt.map(lambda p: p.label)
# Pair each true label with its prediction: (actual, predicted).
true_vs_predicted_dt = actual.zip(preds)
print("Decision tree prediction:" +str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))

Decision tree prediction:[(16.0, 54.913223140495866), (40.0, 54.913223140495866), (32.0, 53.171052631578945), (13.0, 14.284023668639053), (1.0, 14.284023668639053)]
Decision Tree depth: 5
Decision Tree number of nodes: 63


In [20]:
# Squared log error (per-sample term of the RMSLE)
def squared_log_error(pred, actual):
    """Return the squared difference between log(pred + 1) and log(actual + 1)."""
    log_pred = np.log(pred + 1)
    log_actual = np.log(actual + 1)
    return (log_pred - log_actual)**2

# Absolute error
def abs_error(actual, pred):
    """Return the absolute difference between ``pred`` and ``actual``."""
    difference = pred - actual
    return np.abs(difference)
# Squared error (per-sample term of the mean squared error)
def squared_error(actual, pred):
    """Return the squared difference between ``pred`` and ``actual``."""
    difference = pred - actual
    return difference**2

In [21]:
# Aggregate the per-sample errors over all (actual, predicted) pairs.
# lp[0] is the true label and lp[1] the prediction.
rmsle_dt = np.sqrt(true_vs_predicted_dt.map(lambda lp: squared_log_error(lp[0], lp[1])).mean())
mse = true_vs_predicted_dt.map(lambda lp: squared_error(lp[0], lp[1])).mean()
mae = true_vs_predicted_dt.map(lambda lp: abs_error(lp[0], lp[1])).mean()


In [22]:
# Report the three error metrics of the baseline decision tree.
print("Decision Tree Model - Root Mean Squared Log Error: %2.4f" % rmsle_dt)
print("Decision Tree Model - Mean Squared Error: %2.4f" % mse)
print("Decision Tree Model - Mean Absolute Error: %2.4f" % mae)

Decision Tree Model - Root Mean Squared Log Error: 0.6251
Decision Tree Model - Mean Squared Error: 11611.4860
Decision Tree Model - Mean Absolute Error: 71.1502


In [26]:
def extract_features_dt_c(record):
    """Return fields 2..9 of ``record`` as a float feature vector.

    ``record`` is a sequence of string fields; the selected fields are
    converted to floats explicitly so the result is a numeric array rather
    than an array of strings.

    Raises:
        ValueError: if any selected field cannot be parsed as a float.
    """
    return np.array(record[2:10], dtype=float)
    
def extract_label_c(record):
    """Return the last field of ``record`` converted to float (used as the label)."""
    target_field = record[-1]
    return float(target_field)

# Repeat the baseline experiment using only columns 2..9 as features.
data_dt_c = records.map(lambda r: LabeledPoint(extract_label_c(r), extract_features_dt_c(r)))
dt_model_c = DecisionTree.trainRegressor(data_dt_c, categoricalFeaturesInfo={})
# As before, predictions are made on the training data itself.
preds_c = dt_model_c.predict(data_dt_c.map(lambda p: p.features))
actual_c = data_dt_c.map(lambda p: p.label)
true_vs_predicted_dt_c = actual_c.zip(preds_c)  # (actual, predicted) pairs
mse_c = true_vs_predicted_dt_c.map(lambda lp: squared_error(lp[0], lp[1])).mean()
mae_c = true_vs_predicted_dt_c.map(lambda lp: abs_error(lp[0], lp[1])).mean()
rmsle_dt_c = np.sqrt(true_vs_predicted_dt_c.map(lambda lp: squared_log_error(lp[0], lp[1])).mean())


print("Decision Tree Categorical Features - Mean Squared Error: %2.4f" % mse_c)
print("Decision Tree Categorical Features - Mean Absolute Error: %2.4f" % mae_c)
print("Decision Tree Categorical Features - Root Mean Squared Log Error: %2.4f" % rmsle_dt_c)

Decision Tree Model - Root Mean Squared Log Error: 0.6343
Decision Tree Model - Mean Squared Error: 12705.6879
Decision Tree Model - Mean Absolute Error: 73.3905


In [None]:
# Distribution of the raw target values (last field of every record).
targets = records.map(lambda r: float(r[-1])).collect()
# `density=True` replaces the `normed=True` argument, which newer matplotlib
# releases no longer accept.
plt.hist(targets, bins=40, color='lightblue', density=True)
fig = plt.gcf()
fig.set_size_inches(16,10)

In [None]:
# Distribution of the log-transformed target values.
# NOTE(review): np.log yields -inf for a target of 0 — confirm the targets are strictly positive.
log_targets = records.map(lambda r: np.log(float(r[-1]))).collect()
# `density=True` replaces the `normed=True` argument, which newer matplotlib
# releases no longer accept.
plt.hist(log_targets, bins=40, color='lightblue', density=True)
fig = plt.gcf()
fig.set_size_inches(16,10)

In [None]:
# Distribution of the square-root-transformed target values.
sqrt_targets = records.map(lambda r: np.sqrt(float(r[-1]))).collect()
# `density=True` replaces the `normed=True` argument, which newer matplotlib
# releases no longer accept.
plt.hist(sqrt_targets, bins=40, color='lightblue', density=True)
fig = plt.gcf()
fig.set_size_inches(16,10)

## Decision Tree Log

In [None]:
# Train the same tree on log-transformed labels.
# NOTE(review): np.log(label) is -inf for a label of 0 — confirm labels are strictly positive.
data_dt_log = data_dt.map(lambda lp:LabeledPoint(np.log(lp.label), lp.features))
dt_model_log = DecisionTree.trainRegressor(data_dt_log,categoricalFeaturesInfo={})
preds_log = dt_model_log.predict(data_dt_log.map(lambda p:p.features))
actual_log = data_dt_log.map(lambda p: p.label)
# np.exp undoes the log on both label and prediction, so the errors below are
# computed on the original target scale.
true_vs_predicted_dt_log = actual_log.zip(preds_log).map(lambda p: (np.exp(p[0]), np.exp(p[1])))
mse_log_dt = true_vs_predicted_dt_log.map(lambda p: squared_error(p[0], p[1])).mean()
mae_log_dt = true_vs_predicted_dt_log.map(lambda p: abs_error(p[0], p[1])).mean()
rmsle_log_dt = np.sqrt(true_vs_predicted_dt_log.map(lambda p:squared_log_error(p[0], p[1])).mean())



In [None]:
# Report the metrics of the log-target model and compare a few predictions with
# those of the baseline model. The metric label previously read "Absolue".
print("Decision Tree Log - Mean Squared Error: %2.4f" % mse_log_dt)
print("Decision Tree Log - Mean Absolute Error: %2.4f" % mae_log_dt)
print("Decision Tree Log - Root Mean Squared Log Error: %2.4f" % rmsle_log_dt)
print("Decision Tree Log - Non log-transformed predictions:" +str(true_vs_predicted_dt.take(3)))
print("Decision Tree Log - Log-transformed predictions:" +str(true_vs_predicted_dt_log.take(3)))

In [None]:
# Splitting the data into train and test sets

In [None]:
# Key every LabeledPoint by its position so sampled rows can be removed by key.
data_with_idx_dt = data_dt.zipWithIndex().map(lambda p: (p[1],p[0]))
# Test set: sampled without replacement, fraction 0.2, fixed seed 42.
test_dt = data_with_idx_dt.sample(False, 0.2, 42)
# Training set: every keyed point whose key was not sampled into the test set.
train_dt = data_with_idx_dt.subtractByKey(test_dt)
# Drop the index keys again, keeping only the LabeledPoints.
train_data_dt = train_dt.map(lambda p: p[1])
test_data_dt = test_dt.map(lambda p: p[1])

In [None]:
def evaluate_dt(train, test, maxDepth, maxBins):
    """Train a regression tree on ``train`` and return its RMSLE on ``test``.

    ``maxDepth`` and ``maxBins`` are passed straight to
    ``DecisionTree.trainRegressor``; impurity is fixed to 'variance'.
    """
    tree = DecisionTree.trainRegressor(
        train,
        categoricalFeaturesInfo={},
        impurity='variance',
        maxDepth=maxDepth,
        maxBins=maxBins,
    )
    predictions = tree.predict(test.map(lambda point: point.features))
    labels = test.map(lambda point: point.label)
    # Pair each true label with its prediction: (actual, predicted).
    label_and_prediction = labels.zip(predictions)
    mean_sq_log_err = label_and_prediction.map(
        lambda pair: squared_log_error(pair[0], pair[1])
    ).mean()
    return np.sqrt(mean_sq_log_err)


## Decision Tree Max Bins

In [None]:
# Sweep maxBins at a fixed maxDepth of 5 and plot the test RMSLE for each value.
params = [2, 4, 8, 16, 32, 64, 100]
metrics = [evaluate_dt(train_data_dt, test_data_dt, 5, param) for param in params]
print (params)
print (metrics)
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
plt.xlabel('Max Bins')
plt.ylabel('RMSLE')
plt.title('Decision Trees - Max Bins')

## Decision Tree Max Depth

In [None]:
# Sweep maxDepth at a fixed maxBins of 32 and plot the test RMSLE for each value.
params = [1, 2, 3, 4, 5, 10, 20]
metrics = [evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]
print (params)
print (metrics)
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
plt.xlabel('Max Depth')
plt.ylabel('RMSLE')
plt.title('Decision Trees - Max Depth')