In [40]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
import sklearn
import numpy as np
from scipy.io import arff
import pandas as pd
import math
import os.path

In [None]:
"""decision tree regression"""

In [41]:
def decision_tree_regressor(X, y, criterion='squared_error', max_depth=None, training_percentage=0.85, testing_percentage=0.15):
 
    num_training_examples = math.floor(X.shape[0]*training_percentage) # number of rows in a single column * our training %
    num_testing_examples = math.ceil(X.shape[0]*testing_percentage)

    X_train = X.iloc[:num_training_examples, :] # isolating 85% of input data for our training
    X_test = X.iloc[:num_testing_examples, :]   # isolating 15% of input data for out testing

    y_train = y.iloc[:num_training_examples].to_numpy().reshape(-1)    # isolating 85% of our target to match with our training input
    y_test = y.iloc[:num_testing_examples].to_numpy().reshape(-1)      # isolating 15% of our target to match with our testing input

    """fitting decision tree regression model"""
    # C is a regularization parameter 
    decision_tree = sklearn.tree.DecisionTreeRegressor(criterion=criterion, max_depth=max_depth).fit(X_train, y_train)
    
    # predicting
    y_pred = decision_tree.predict(X_test)
    
    accuracy1 = sklearn.metrics.mean_squared_error(y_test, y_pred)
    accuracy2 = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    accuracy3 = sklearn.metrics.max_error(y_test, y_pred)
    accuracy4 = sklearn.metrics.r2_score(y_test, y_pred)
    accuracy6 = sklearn.metrics.explained_variance_score(y_test, y_pred)
    
    print('                 MINIMIZE: ')
    print("Mean squared error\t", accuracy1)
    print("Mean absolute error\t", accuracy2)
    print("Max error\t\t", accuracy3)
    print('                 MAXIMIZE: ')
    print("r2 Score\t\t", accuracy4)
    print("Explained Variance Score", accuracy6)
    print('---------------------------------------------')

In [None]:
"""importing datasets""";

In [28]:
"""1. Wine Quality"""
# Data Details: 11 features, 1 target (quality), 1599 samples for red    
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/wine_quality/winequality-red.csv'
# path = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# dirname = os.path.dirname(path)
# directory = 'winequality-red.csv'

data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Red wine quality""")
decision_tree_regressor(X, y)

# # Data Details: 11 features, 1 target (quality), 4898 samples for white    
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/wine_quality/winequality-white.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""White wine quality""")
decision_tree_regressor(X, y)

Red wine quality
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0.0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------
White wine quality
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0.0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [29]:
"""2. Communities and Crime"""
# Data Details: 123 samples, 126 features , 1 target
# directory = './datasets/communities/communities.data'
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/communities/communities.data'
data = pd.read_csv(directory, delimiter=',', header=None) # breaking up our inputs from our target values
data = data.drop(3, axis=1)
data = data[(data != '?').all(axis=1)]
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Communities and Crime""")
decision_tree_regressor(X, y)

Communities and Crime
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0.0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [30]:
"""3. QSAR Aquatic Toxicity"""
# Data Details: 9 features, 1 target, 545 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/qsar_aquatic_toxicity/qsar_aquatic_toxicity.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""qsar aquatic toxicity""")  
decision_tree_regressor(X, y)

qsar aquatic toxicity
                 MINIMIZE: 
Mean squared error	 0.0018211463414634155
Mean absolute error	 0.007951219512195124
Max error		 0.2530000000000001
                 MAXIMIZE: 
r2 Score		 0.9992402800476856
Explained Variance Score 0.999241602520318
---------------------------------------------


In [31]:
"""4. Facebook metrics"""
# Data Details: 18 features/inputs, 1 target, 495 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/dataset_Facebook/dataset_Facebook.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

string_values = data["Type"]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(string_values)
data["Type"] = integer_encoded
data = data.dropna()

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Facebook metrics""")  
decision_tree_regressor(X, y)

Facebook metrics
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0.0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [32]:
"""5. Bike Sharing"""
# Data Details: 
# directory = './datasets/bike_sharing/hour.csv'
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/bike_sharing/hour.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
data = data.drop('dteday', axis=1)

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Bike Sharing""")
decision_tree_regressor(X, y)

Bike Sharing
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0.0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [33]:
"""6. Student Performance""" # lots of string data
# Data Details: 
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/student-por/student-por.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Student Performance""")  
decision_tree_regressor(X, y)

Student Performance
                 MINIMIZE: 
Mean squared error	 0.0
Mean absolute error	 0.0
Max error		 0.0
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [39]:
"""7. Concrete Compressive Strength"""
# binary file format .xls
# Data Details: 1030 samples, 8 features columns, 1 target column
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/Concrete_Data/Concrete_Data.xls'
data = pd.read_excel(directory)  

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Concrete Compressive Strength""")  
decision_tree_regressor(X, y)

Concrete Compressive Strength
                 MINIMIZE: 
Mean squared error	 5.268818770537899
Mean absolute error	 0.31933859380645174
Max error		 24.748741019999994
                 MAXIMIZE: 
r2 Score		 0.9762373661203197
Explained Variance Score 0.9762373661203197
---------------------------------------------


In [35]:
"""8. SGEMM GPU kernel performance (predict how fast two matrices can be multiplied by a GPU)"""
# Data Details: 17 featrues, 1 target, 241600 samples

directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/sgemm_product/sgemm_product.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""SGEMM GPU kernel performance""")  
decision_tree_regressor(X, y)

SGEMM GPU kernel performance
                 MINIMIZE: 
Mean squared error	 5.154587813364669e-32
Mean absolute error	 7.254437423599257e-18
Max error		 1.4210854715202004e-14
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------
