In [7]:
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
import sklearn
import numpy as np
from scipy.io import arff
import pandas as pd
import math
import os.path
import scipy
import scipy.stats

In [None]:
"""support vector regression"""

In [8]:
def support_vector_regression(X, y, kernel='rbf', num_iter=20, training_percentage=0.85, testing_percentage=0.15):
    
    # kernel types:
    # 'linear'
    # 'rbf'
    # 'Polynomial' 'poly'
    # 'sigmoid'
    
    num_training_examples = math.floor(X.shape[0]*training_percentage) # number of rows in a single column * our training %
    num_testing_examples = math.ceil(X.shape[0]*testing_percentage)

    X_train = X.iloc[:num_training_examples, :] # isolating 85% of input data for our training
    X_test = X.iloc[:num_testing_examples, :]   # isolating 15% of input data for out testing

    y_train = y.iloc[:num_training_examples].to_numpy().reshape(-1)    # isolating 85% of our target to match with our training input
    y_test = y.iloc[:num_testing_examples].to_numpy().reshape(-1)      # isolating 15% of our target to match with our testing input
    
    # preprocessing training x
    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    """fitting support vector regression model"""
    # C is a regularization parameter 
    randomized_search_model = hyperparameterSearch(X_train, y_train, num_iter)

    # preprocessing testing x
    X_test = scaler.transform(X_test)
    
    # predicting
    y_pred = randomized_search_model.predict(X_test)
    
    accuracy1 = sklearn.metrics.mean_squared_error(y_test, y_pred)
    accuracy2 = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    accuracy3 = sklearn.metrics.max_error(y_test, y_pred)
    accuracy4 = sklearn.metrics.r2_score(y_test, y_pred)
    accuracy6 = sklearn.metrics.explained_variance_score(y_test, y_pred)
    
    print('                 MINIMIZE: ')
    print("Mean squared error\t", accuracy1)
    print("Mean absolute error\t", accuracy2)
    print("Max error\t\t", accuracy3)
    print('                 MAXIMIZE: ')
    print("r2 Score\t\t", accuracy4)
    print("Explained Variance Score", accuracy6)
    print('---------------------------------------------')
    

In [11]:
def hyperparameterSearch(X_train, y_train, num_iter):
    svr = sklearn.svm.SVR(kernel='rbf').fit(X_train, y_train)
    param_distribution = {'C': scipy.stats.uniform(1, 1000), 'gamma': scipy.stats.uniform(0.01, 1000)}
    randomized_search_ = sklearn.model_selection.RandomizedSearchCV(svr, param_distribution, n_iter=num_iter, verbose=1, random_state=0).fit(X_train,y_train)
    print('Best Hyperparameters = ' + str(randomized_search_.best_params_))
    return randomized_search_

## Importing Datasets

In [6]:
"""1. Wine Quality"""
# Data Details: 11 features, 1 target (quality), 1599 samples for red    
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/wine_quality/winequality-red.csv'
# path = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# dirname = os.path.dirname(path)
# directory = 'winequality-red.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Red wine quality""")
support_vector_regression(X, y, 'rbf')

# # Data Details: 11 features, 1 target (quality), 4898 samples for white    
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/wine_quality/winequality-white.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""White wine quality""")
support_vector_regression(X, y, 'rbf')

Red wine quality
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 144.3532874090464, 'gamma': 944.6789170495839}
                 MINIMIZE: 
Mean squared error	 0.010000191747397332
Mean absolute error	 0.10000033368654375
Max error		 0.10051867515016788
                 MAXIMIZE: 
r2 Score		 0.9766976396840452
Explained Variance Score 0.9792979272169552
---------------------------------------------
White wine quality
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 72.03605819788694, 'gamma': 87.13929970154071}
                 MINIMIZE: 
Mean squared error	 0.009982352424182987
Mean absolute error	 0.09991151245817012
Max error		 0.10057740713243835
                 MAXIMIZE: 
r2 Score		 0.9871459373057863
Explained Variance Score 0.9879062116633788
---------------------------------------------


In [75]:
"""2. Communities and Crime"""
# Data Details: 123 samples, 126 features , 1 target
# directory = './datasets/communities/communities.data'
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/communities/communities.data'
data = pd.read_csv(directory, delimiter=',', header=None) # breaking up our inputs from our target values
data = data.drop(3, axis=1)
data = data[(data != '?').all(axis=1)]
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Communities and Crime""")
support_vector_regression(X, y, 'rbf')

Communities and Crime
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 549.8135039273247, 'gamma': 715.1993663724195}
                 MINIMIZE: 
Mean squared error	 0.008438158968651702
Mean absolute error	 0.0889673550966023
Max error		 0.10037974683544365
                 MAXIMIZE: 
r2 Score		 0.8630636996887749
Explained Variance Score 0.8631140439847698
---------------------------------------------


In [76]:
"""3. QSAR Aquatic Toxicity"""
# Data Details: 9 features, 1 target, 545 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/qsar_aquatic_toxicity/qsar_aquatic_toxicity.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""qsar aquatic toxicity""")  
support_vector_regression(X, y, 'rbf')

qsar aquatic toxicity
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 72.03605819788694, 'gamma': 87.13929970154071}
                 MINIMIZE: 
Mean squared error	 0.0116374806692426
Mean absolute error	 0.10193024566824557
Max error		 0.4058693164617031
                 MAXIMIZE: 
r2 Score		 0.9951452411825444
Explained Variance Score 0.9951915809370392
---------------------------------------------


In [12]:
"""4. Facebook metrics"""
# Data Details: 18 features/inputs, 1 target, 495 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/dataset_Facebook/dataset_Facebook.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

string_values = data["Type"]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(string_values)
data["Type"] = integer_encoded
data = data.dropna()

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Facebook metrics""")  
support_vector_regression(X, y, 'rbf') # C=2000

Facebook metrics
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 779.1567509498504, 'gamma': 870.0221482468191}
                 MINIMIZE: 
Mean squared error	 8582.530276379686
Mean absolute error	 10.7960189938872
Max error		 802.3023313595007
                 MAXIMIZE: 
r2 Score		 0.8541453420974399
Explained Variance Score 0.856076528434795
---------------------------------------------


In [78]:
# """5. Bike Sharing"""
# # Data Details: 
# # directory = './datasets/bike_sharing/hour.csv'
# directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/bike_sharing/hour.csv'
# data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
# data = data.drop('dteday', axis=1)

# X = data.iloc[:, :-1] # take everything except last column, our inputs
# y = data.iloc[:, -1:] # tak only last column (our qualities), our target
# print("""Bike Sharing""")
# support_vector_regression(X, y, 'rbf') #, C=200

In [81]:
"""6. Student Performance""" # lots of string data
# Data Details: 
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/student-por/student-por.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Student Performance""")  
support_vector_regression(X, y, 'rbf')

Student Performance
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 549.8135039273247, 'gamma': 715.1993663724195}
                 MINIMIZE: 
Mean squared error	 0.009990065254195341
Mean absolute error	 0.09994969431739163
Max error		 0.10072737068955817
                 MAXIMIZE: 
r2 Score		 0.9973287137928755
Explained Variance Score 0.9973558536944499
---------------------------------------------


In [85]:
"""7. Concrete Compressive Strength"""
# binary file format .xls
# Data Details: 1030 samples, 8 features columns, 1 target column
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/Concrete_Data/Concrete_Data.xls'
data = pd.read_excel(directory)  

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Concrete Compressive Strength""")  
support_vector_regression(X, y, 'rbf') # C=1000

Concrete Compressive Strength
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters = {'C': 72.03605819788694, 'gamma': 87.13929970154071}
                 MINIMIZE: 
Mean squared error	 6.9923977070159316
Mean absolute error	 0.31118483476564723
Max error		 32.89818766349899
                 MAXIMIZE: 
r2 Score		 0.9684639396628987
Explained Variance Score 0.9685846302021524
---------------------------------------------


In [None]:
"""8. SGEMM GPU kernel performance (predict how fast two matrices can be multiplied by a GPU)"""
# Data Details: 17 featrues, 1 target, 241600 samples

directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/sgemm_product/sgemm_product.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""SGEMM GPU kernel performance""")  
support_vector_regression(X, y, 'rbf')

SGEMM GPU kernel performance
