In [1]:
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
import sklearn
import numpy as np
from scipy.io import arff
import pandas as pd
import math
import os.path

In [None]:
"""support vector regression"""

In [2]:
def support_vector_regression(X, y, kernel='rbf', degree=1, gamma=1, coef0=1, C=1, max_iter=-1, training_percentage=0.85, testing_percentage=0.15):
    
    # kernel types:
    # 'linear'
    # 'rbf'
    # 'Polynomial' 'poly'
    # 'sigmoid'
    
    num_training_examples = math.floor(X.shape[0]*training_percentage) # number of rows in a single column * our training %
    num_testing_examples = math.ceil(X.shape[0]*testing_percentage)

    X_train = X.iloc[:num_training_examples, :] # isolating 85% of input data for our training
    X_test = X.iloc[:num_testing_examples, :]   # isolating 15% of input data for out testing

    y_train = y.iloc[:num_training_examples].to_numpy().reshape(-1)    # isolating 85% of our target to match with our training input
    y_test = y.iloc[:num_testing_examples].to_numpy().reshape(-1)      # isolating 15% of our target to match with our testing input
    
    # preprocessing training x
    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    
    """fitting support vector regression model"""
    # C is a regularization parameter 
    svr = sklearn.svm.SVR(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, C=C, max_iter=max_iter).fit(X_train, y_train)

    # preprocessing testing x
    X_test = scaler.transform(X_test)
    
    # predicting
    y_pred = svr.predict(X_test)
    
    accuracy1 = sklearn.metrics.mean_squared_error(y_test, y_pred)
    accuracy2 = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    accuracy3 = sklearn.metrics.max_error(y_test, y_pred)
    accuracy4 = sklearn.metrics.r2_score(y_test, y_pred)
    accuracy6 = sklearn.metrics.explained_variance_score(y_test, y_pred)
    
    print('                 MINIMIZE: ')
    print("Mean squared error\t", accuracy1)
    print("Mean absolute error\t", accuracy2)
    print("Max error\t\t", accuracy3)
    print('                 MAXIMIZE: ')
    print("r2 Score\t\t", accuracy4)
    print("Explained Variance Score", accuracy6)
    print('---------------------------------------------')
    

In [None]:
"""importing datasets""";

In [3]:
"""1. Wine Quality"""
# Data Details: 11 features, 1 target (quality), 1599 samples for red    
directory = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# path = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# dirname = os.path.dirname(path)
# directory = 'winequality-red.csv'

data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Red wine quality""")
support_vector_regression(X, y, 'rbf')

# # Data Details: 11 features, 1 target (quality), 4898 samples for white    
directory = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-white.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""White wine quality""")
support_vector_regression(X, y, 'rbf', C=3)

Red wine quality
                 MINIMIZE: 
Mean squared error	 0.06222493168803936
Mean absolute error	 0.16062305471048527
Max error		 1.3352365555360306
                 MAXIMIZE: 
r2 Score		 0.8550040023774802
Explained Variance Score 0.8583822148763456
---------------------------------------------
White wine quality
                 MINIMIZE: 
Mean squared error	 0.021051714934611068
Mean absolute error	 0.10902797113398861
Max error		 1.4066925313039267
                 MAXIMIZE: 
r2 Score		 0.9728921548657552
Explained Variance Score 0.9728940363543791
---------------------------------------------


In [4]:
"""2. Communities and Crime"""
# Data Details: 123 samples, 126 features , 1 target
# directory = './datasets/communities/communities.data'
directory = '/Users/annikatimermanis/Desktop/project/datasets/communities/communities.data'
data = pd.read_csv(directory, delimiter=',', header=None) # breaking up our inputs from our target values
data = data.drop(3, axis=1)
data = data[(data != '?').all(axis=1)]
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Communities and Crime""")
support_vector_regression(X, y, 'rbf')

Communities and Crime
                 MINIMIZE: 
Mean squared error	 0.00844474234015403
Mean absolute error	 0.08900024983344434
Max error		 0.10037974683544307
                 MAXIMIZE: 
r2 Score		 0.8629568632875585
Explained Variance Score 0.863009105616043
---------------------------------------------


In [5]:
"""3. QSAR Aquatic Toxicity"""
# Data Details: 9 features, 1 target, 545 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/qsar_aquatic_toxicity/qsar_aquatic_toxicity.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""qsar aquatic toxicity""")  
support_vector_regression(X, y, 'rbf', C=8)

qsar aquatic toxicity
                 MINIMIZE: 
Mean squared error	 0.20125901339145189
Mean absolute error	 0.20825406656818007
Max error		 3.0228681997303335
                 MAXIMIZE: 
r2 Score		 0.9160416246759583
Explained Variance Score 0.9162681705106567
---------------------------------------------


In [6]:
"""4. Facebook metrics"""
# Data Details: 18 features/inputs, 1 target, 495 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/dataset_Facebook/dataset_Facebook.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

string_values = data["Type"]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(string_values)
data["Type"] = integer_encoded
data = data.dropna()

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Facebook metrics""")  
support_vector_regression(X, y, 'rbf', C=2000)

Facebook metrics
                 MINIMIZE: 
Mean squared error	 0.010005267257868515
Mean absolute error	 0.10002607648424827
Max error		 0.10048675422785891
                 MAXIMIZE: 
r2 Score		 0.9999998299668296
Explained Variance Score 0.9999998520082428
---------------------------------------------


In [None]:
# """5. Bike Sharing"""
# # Data Details: 
# # directory = './datasets/bike_sharing/hour.csv'
# directory = '/Users/annikatimermanis/Desktop/project/datasets/bike_sharing/hour.csv'
# data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
# data = data.drop('dteday', axis=1)

# X = data.iloc[:, :-1] # take everything except last column, our inputs
# y = data.iloc[:, -1:] # tak only last column (our qualities), our target
# print("""Bike Sharing""")
# support_vector_regression(X, y, 'rbf', C=200)

Bike Sharing


In [7]:
"""6. Student Performance""" # lots of string data
# Data Details: 
directory = '/Users/annikatimermanis/Desktop/project/datasets/student-por/student-por.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Student Performance""")  
support_vector_regression(X, y, 'rbf', C=5)

Student Performance
                 MINIMIZE: 
Mean squared error	 0.010642414879546818
Mean absolute error	 0.10172387565027731
Max error		 0.2707614668445082
                 MAXIMIZE: 
r2 Score		 0.9971542792409397
Explained Variance Score 0.9971800975124123
---------------------------------------------


In [15]:
"""7. Concrete Compressive Strength"""
# binary file format .xls
# Data Details: 1030 samples, 8 features columns, 1 target column
directory = '/Users/annikatimermanis/Desktop/project/datasets/Concrete_Data/Concrete_Data.xls'
data = pd.read_excel(directory)  

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Concrete Compressive Strength""")  
support_vector_regression(X, y, 'rbf', C=1000)

(1030, 8)
(1030, 1)
Concrete Compressive Strength
                 MINIMIZE: 
Mean squared error	 13.52421494303052
Mean absolute error	 1.5256876717212988
Max error		 20.28403350198465
                 MAXIMIZE: 
r2 Score		 0.9390051200852889
Explained Variance Score 0.9399107983810426
---------------------------------------------


In [None]:
"""8. SGEMM GPU kernel performance (predict how fast two matrices can be multiplied by a GPU)"""
# Data Details: 17 featrues, 1 target, 241600 samples

directory = '/Users/annikatimermanis/Desktop/project/datasets/sgemm_product/sgemm_product.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""SGEMM GPU kernel performance""")  
support_vector_regression(X, y, 'rbf', C=1)

(241600, 17)
(241600, 1)
SGEMM GPU kernel performance
