In [2]:
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
import sklearn
import numpy as np
from scipy.io import arff
import pandas as pd
import math
import os.path

In [None]:
"""linear regression"""

In [3]:
def linear_regression(X, y, polynomial_degree=1, training_percentage=0.85, testing_percentage=0.15):
    
    num_training_examples = math.floor(X.shape[0]*training_percentage) # number of rows in a single column * our training %
    num_testing_examples = math.ceil(X.shape[0]*testing_percentage)

    X_train = X.iloc[:num_training_examples, :] # isolating 85% of input data for our training
    X_test = X.iloc[:num_testing_examples, :]   # isolating 15% of input data for out testing

    y_train = y.iloc[:num_training_examples]    # isolating 85% of our target to match with our training input
    y_test = y.iloc[:num_testing_examples]      # isolating 15% of our target to match with our testing input
    
    """polynomial transformation"""
    polynomial = sklearn.preprocessing.PolynomialFeatures(polynomial_degree)  # preprocessing our data, moving it to higher polynomial dimension
    X_train_transformed = polynomial.fit_transform(X_train)
    
    """fitting linear regression model"""
    # we fit our model using 85% of preprocessed triaining input data, along with our 85% target y values
    regression_model = sklearn.linear_model.LinearRegression().fit(X_train_transformed, y_train)
    
    """testing linear regression model"""
    X_test_transformed = polynomial.transform(X_test)      # we must also preprocess our testing data 
    y_pred = regression_model.predict(X_test_transformed)
    
    accuracy1 = sklearn.metrics.mean_squared_error(y_test, y_pred)
    accuracy2 = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    accuracy3 = sklearn.metrics.max_error(y_test, y_pred)
    accuracy4 = sklearn.metrics.r2_score(y_test, y_pred)
    accuracy6 = sklearn.metrics.explained_variance_score(y_test, y_pred)
    
    print('                 MINIMIZE: ')
    print("Mean squared error\t", accuracy1)
    print("Mean absolute error\t", accuracy2)
    print("Max error\t\t", accuracy3)
    print('                 MAXIMIZE: ')
    print("r2 Score\t\t", accuracy4)
    print("Explained Variance Score", accuracy6)
    print('---------------------------------------------')


In [4]:
"""importing datasets""";

In [6]:
"""1. Wine Quality"""
# Data Details: 11 features, 1 target (quality), 1599 samples for red    
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/wine_quality/winequality-red.csv'
# path = '/Users/annikatimermanis/Desktop/project/datasets/wine_quality/winequality-red.csv'
# dirname = os.path.dirname(path)
# directory = 'winequality-red.csv'

data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Red wine quality""")
linear_regression(X, y, 2)

# Data Details: 11 features, 1 target (quality), 4898 samples for white    
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/wine_quality/winequality-white.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""White wine quality""")
linear_regression(X, y, 3)

Red wine quality
                 MINIMIZE: 
Mean squared error	 0.3424554158368665
Mean absolute error	 0.4654143263432843
Max error		 1.6117487980800433
                 MAXIMIZE: 
r2 Score		 0.20201335198820713
Explained Variance Score 0.20202082028187196
---------------------------------------------
White wine quality
                 MINIMIZE: 
Mean squared error	 0.44221602516952857
Mean absolute error	 0.5150029041310202
Max error		 3.299259056104347
                 MAXIMIZE: 
r2 Score		 0.4305678390852503
Explained Variance Score 0.4357040629704033
---------------------------------------------


In [46]:
"""2. Communities and Crime"""
# Data Details: 123 samples, 126 features , 1 target
# directory = './datasets/communities/communities.data'
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/communities/communities.data'
data = pd.read_csv(directory, delimiter=',', header=None) # breaking up our inputs from our target values
data = data.drop(3, axis=1)
data = data[(data != '?').all(axis=1)]
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Communities and Crime""")
linear_regression(X, y, 2)

Communities and Crime
                 MINIMIZE: 
Mean squared error	 1.7536234597647473e-23
Mean absolute error	 3.419732333566715e-12
Max error		 9.778622356293454e-12
                 MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [42]:
"""3. QSAR Aquatic Toxicity"""
# Data Details: 9 features, 1 target, 545 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/qsar_aquatic_toxicity/qsar_aquatic_toxicity.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""qsar aquatic toxicity""")  
linear_regression(X, y, 4)

qsar aquatic toxicity
                 MINIMIZE: 
Mean squared error	 0.04105327800408638
Mean absolute error	 0.10879537837058914
Max error		 0.9852508635804282
                 MAXIMIZE: 
r2 Score		 0.9828739768477087
Explained Variance Score 0.9829108753347539
---------------------------------------------


In [15]:
"""4. Facebook metrics"""
# Data Details: 18 features/inputs, 1 target, 495 samples
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/dataset_Facebook/dataset_Facebook.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

string_values = data["Type"]
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(string_values)
data["Type"] = integer_encoded
data = data.dropna()

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Facebook metrics""")  
linear_regression(X, y, 2)

Facebook metrics
MINIMIZE: 
Mean squared error	 6.816463806080444e-13
Mean absolute error	 5.258239606335326e-07
Max error		 4.100572937204561e-06
MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [14]:
"""5. Bike Sharing"""
# Data Details: 
# directory = './datasets/bike_sharing/hour.csv'
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/bike_sharing/hour.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
data = data.drop('dteday', axis=1)

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # tak only last column (our qualities), our target
print("""Bike Sharing""")
linear_regression(X, y, 1)

Red wine quality
MINIMIZE: 
Mean squared error	 9.779227067439796e-17
Mean absolute error	 8.381313962169268e-09
Max error		 3.3313979130866755e-08
MAXIMIZE: 
r2 Score		 1.0
Explained Variance Score 1.0
---------------------------------------------


In [5]:
"""6. Student Performance""" # lots of string data
# Data Details: 
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/student-por/student-por.csv'
data = pd.read_csv(directory, delimiter=';') # breaking up our inputs from our target values

# loop to iterate through all columns in dataframe and check if the data is string type
for column in data:
    if type(data[column][0]) == str:
        # if data in column is type string, we want to convert it to equivalent numerical labels
        label_encoder = LabelEncoder()
        integer_encoded = label_encoder.fit_transform(data[column])
        data[column] = integer_encoded

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print("""Student Performance""")  
linear_regression(X, y, 2)

Student Performance
MINIMIZE: 
Mean squared error	 0.006729965842214604
Mean absolute error	 0.04925040602077705
Max error		 0.38006562733311
MAXIMIZE: 
r2 Score		 0.9982004456956698
Explained Variance Score 0.9982031080489628
---------------------------------------------


In [13]:
"""7. Concrete Compressive Strength"""
# binary file format .xls
# Data Details: 1030 samples, 8 features columns, 1 target column
directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/Concrete_Data/Concrete_Data.xls'
data = pd.read_excel(directory)  

X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print(X.shape)
print(y.shape)

print("""Concrete Compressive Strength""")  
linear_regression(X, y, 7)

(1030, 8)
(1030, 1)
Concrete Compressive Strength
                 MINIMIZE: 
Mean squared error	 5.270407640154402
Mean absolute error	 0.3412305616379374
Max error		 24.752408331246226
                 MAXIMIZE: 
r2 Score		 0.9762302002395742
Explained Variance Score 0.9762302016569128
---------------------------------------------


In [10]:
"""8. SGEMM GPU kernel performance (predict how fast two matrices can be multiplied by a GPU)"""
# Data Details: 17 featrues, 1 target, 241600 samples

directory = '/Users/annikatimermanis/Desktop/project/datasets/regression/sgemm_product/sgemm_product.csv'
data = pd.read_csv(directory, delimiter=',') # breaking up our inputs from our target values
X = data.iloc[:, :-1] # take everything except last column, our inputs
y = data.iloc[:, -1:] # take only last column (our qualities), our target

print(X.shape)
print(y.shape)

print("""SGEMM GPU kernel performance""")  
linear_regression(X, y, 3)

(241600, 17)
(241600, 1)
SGEMM GPU kernel performance
                 MINIMIZE: 
Mean squared error	 1.0393092546285647
Mean absolute error	 0.5695001279370102
Max error		 8.349321033991885
                 MAXIMIZE: 
r2 Score		 0.9986480211947331
Explained Variance Score 0.9986489335562105
---------------------------------------------
