In [2]:
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import sklearn
import numpy as np
from scipy.io import arff
import pandas as pd
import math
import os.path

In [None]:
"""radnom forest regression"""

In [15]:
def random_forest_regressor(X, y, criterion='squared_error', max_depth=None, n_estimators=100, training_percentage=0.85, testing_percentage=0.15):
 
    num_training_examples = math.floor(X.shape[0]*training_percentage) # number of rows in a single column * our training %
    num_testing_examples = math.ceil(X.shape[0]*testing_percentage)

    X_train = X.iloc[:num_training_examples, :] # isolating 85% of input data for our training
    X_test = X.iloc[:num_testing_examples, :]   # isolating 15% of input data for out testing

    y_train = y.iloc[:num_training_examples].to_numpy().reshape(-1)    # isolating 85% of our target to match with our training input
    y_test = y.iloc[:num_testing_examples].to_numpy().reshape(-1)      # isolating 15% of our target to match with our testing input
    
    """fitting radnom forest regression model"""
    # C is a regularization parameter 
    random_forest = sklearn.ensemble.RandomForestRegressor(criterion=criterion, max_depth=max_depth, n_estimators= n_estimators, random_state=0).fit(X_train, y_train)

    # predicting
    y_pred = random_forest.predict(X_test)
    
    accuracy1 = sklearn.metrics.mean_squared_error(y_test, y_pred)
    accuracy2 = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    accuracy3 = sklearn.metrics.max_error(y_test, y_pred)
    accuracy4 = sklearn.metrics.r2_score(y_test, y_pred)
    accuracy6 = sklearn.metrics.explained_variance_score(y_test, y_pred)
    
    print('                 MINIMIZE: ')
    print("Mean squared error\t", accuracy1)
    print("Mean absolute error\t", accuracy2)
    print("Max error\t\t", accuracy3)
    print('                 MAXIMIZE: ')
    print("r2 Score\t\t", accuracy4)
    print("Explained Variance Score", accuracy6)
    print('---------------------------------------------')

In [None]:
"""importing datasets""";

In [79]:
"""1. Wine Quality"""
# Data Details: 11 features, 1 target (quality), 1599 samples for red    
directory = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# path = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# dirname = os.path.dirname(path)
# directory = 'winequality-red.csv'

data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Red wine quality""")
random_forest_regressor(X, y)

# # Data Details: 11 features, 1 target (quality), 4898 samples for white    
directory = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-white.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""White wine quality""")
random_forest_regressor(X, y)

Red wine quality
                 MINIMIZE: 
Mean squared error	 0.03916833333333334
Mean absolute error	 0.14141666666666666
Max error		 0.7699999999999996
                 MAXIMIZE: 
r2 Score		 0.9087302884420891
Explained Variance Score 0.9087479105141794
---------------------------------------------
White wine quality
                 MINIMIZE: 
Mean squared error	 0.0505039455782313
Mean absolute error	 0.16282993197278914
Max error		 1.0999999999999996
                 MAXIMIZE: 
r2 Score		 0.9349671444984197
Explained Variance Score 0.934978472304986
---------------------------------------------


In [16]:
"""2. Communities and Crime"""
# Data Details: 123 samples, 126 features , 1 target
# directory = './datasets/communities/communities.data'
directory = '/Users/annikatimermanis/Desktop/project/datasets/communities/communities.data'
data = pd.read_csv(directory, delimiter=',', header=None) # breaking up our inputs from our target values
data = data.drop(3, axis=1)
data = data[(data != '?').all(axis=1)]
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Communities and Crime""")
random_forest_regressor(X, y)

Communities and Crime
                 MINIMIZE: 
Mean squared error	 0.0033874663157894805
Mean absolute error	 0.048431578947368464
Max error		 0.12309999999999943
                 MAXIMIZE: 
r2 Score		 0.9450274513153398
Explained Variance Score 0.9459262501573372
---------------------------------------------


In [87]:
"""3. QSAR Aquatic Toxicity"""
# Data Details: 9 features, 1 target, 545 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/qsar_aquatic_toxicity/qsar_aquatic_toxicity.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""qsar aquatic toxicity""")  
random_forest_regressor(X, y)

qsar aquatic toxicity
                 MINIMIZE: 
Mean squared error	 0.1526455683697759
Mean absolute error	 0.29836702990708497
Max error		 1.1616625000000047
                 MAXIMIZE: 
r2 Score		 0.936321490874974
Explained Variance Score 0.9364686689716515
---------------------------------------------


In [86]:
"""4. Facebook metrics"""
# Data Details: 18 features/inputs, 1 target, 495 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/dataset_Facebook/dataset_Facebook.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

string_values = data["Type"]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(string_values)
data["Type"] = integer_encoded
data = data.dropna()

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Facebook metrics""")  
random_forest_regressor(X, y)

Facebook metrics
                 MINIMIZE: 
Mean squared error	 40.658345333333344
Mean absolute error	 3.1033333333333335
Max error		 36.960000000000036
                 MAXIMIZE: 
r2 Score		 0.9993090372118117
Explained Variance Score 0.999309313909995
---------------------------------------------


In [85]:
"""5. Bike Sharing"""
# Data Details: 
# directory = './datasets/bike_sharing/hour.csv'
directory = '/Users/annikatimermanis/Desktop/project/datasets/bike_sharing/hour.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
data = data.drop('dteday', axis=1)

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Bike Sharing""")
random_forest_regressor(X, y)

Bike Sharing
                 MINIMIZE: 
Mean squared error	 0.4142609512850023
Mean absolute error	 0.1814000767165327
Max error		 23.74000000000001
                 MAXIMIZE: 
r2 Score		 0.9999321819544078
Explained Variance Score 0.9999321890225281
---------------------------------------------


In [83]:
"""6. Student Performance""" # lots of string data
# Data Details: 
directory = '/Users/annikatimermanis/Desktop/project/datasets/student-por/student-por.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Student Performance""")  
random_forest_regressor(X, y)

Student Performance
                 MINIMIZE: 
Mean squared error	 0.2642479591836734
Mean absolute error	 0.25132653061224486
Max error		 4.35
                 MAXIMIZE: 
r2 Score		 0.9293416098226467
Explained Variance Score 0.9293418353425955
---------------------------------------------


In [84]:
"""7. Concrete Compressive Strength"""
# binary file format .xls
# Data Details: 1030 samples, 8 features columns, 1 target column
directory = '/Users/annikatimermanis/Desktop/project/datasets/Concrete_Data/Concrete_Data.xls'
data = pd.read_excel(directory)  

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Concrete Compressive Strength""")  
random_forest_regressor(X, y)

Concrete Compressive Strength
                 MINIMIZE: 
Mean squared error	 8.451965258511702
Mean absolute error	 1.375338560077343
Max error		 24.494232500587007
                 MAXIMIZE: 
r2 Score		 0.9618812176412577
Explained Variance Score 0.9622061137918856
---------------------------------------------


In [89]:
"""8. SGEMM GPU kernel performance (predict how fast two matrices can be multiplied by a GPU)"""
# Data Details: 17 featrues, 1 target, 241600 samples

directory = '/Users/annikatimermanis/Desktop/project/datasets/sgemm_product/sgemm_product.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""SGEMM GPU kernel performance""")  
random_forest_regressor(X, y)

SGEMM GPU kernel performance
                 MINIMIZE: 
Mean squared error	 0.1284350012660045
Mean absolute error	 0.18972170529801372
Max error		 3.689799999999906
                 MAXIMIZE: 
r2 Score		 0.9998329261489852
Explained Variance Score 0.9998329614025344
---------------------------------------------
