### Imports and Spark setup

In [21]:
# Release the SparkContext left over from a previous run of this notebook so a
# new one can be created. On a fresh kernel `sc` does not exist yet, so the
# resulting NameError is ignored to keep Restart & Run All working.
try:
    sc.stop()
except NameError:
    pass

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Spark setup with a local master. getOrCreate() reuses a context that is
# already running in this kernel, so re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("BDProjectRegression").setMaster("local")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

### Linear regression

In [25]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

def TrainLR(trainingData, testData, maxIter=15, regParam=0.3, elasticNetParam=0.8):
    """Fit a linear regression on ``trainingData`` and report train/test metrics.

    No label or feature column names are passed to ``LinearRegression``, so the
    estimator's default column names apply to both inputs.

    Args:
        trainingData: Dataset the model is fitted on.
        testData: Dataset the fitted model is evaluated on.
        maxIter: Forwarded to ``LinearRegression(maxIter=...)``.
        regParam: Forwarded to ``LinearRegression(regParam=...)``.
        elasticNetParam: Forwarded to ``LinearRegression(elasticNetParam=...)``.
            The three defaults are the values this notebook previously
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        The fitted model returned by ``LinearRegression.fit``.
    """
    lr = LinearRegression(maxIter=maxIter, regParam=regParam,
                          elasticNetParam=elasticNetParam)

    # Fit the model
    lrModel = lr.fit(trainingData)

    # Metrics computed on the training set during fitting
    trainingSummary = lrModel.summary
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("r2: %f" % trainingSummary.r2)

    # Metrics on the held-out test set
    test_result = lrModel.evaluate(testData)
    print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)
    print("R2 on test data = %g" % test_result.r2)

    return lrModel

### Decision tree regression

In [27]:
from pyspark.ml.regression import DecisionTreeRegressor

def TrainDT(trainingData, testData):
    """Fit a decision-tree regressor on ``trainingData`` and report test metrics.

    Prints RMSE and R2 computed from the model's predictions on ``testData``,
    comparing the "prediction" column with the "label" column.

    Args:
        trainingData: Dataset the tree is fitted on.
        testData: Dataset the fitted tree is evaluated on.

    Returns:
        The fitted model returned by ``DecisionTreeRegressor.fit``.
    """
    # Imported here so this function does not depend on the linear-regression
    # cell (the only other place the evaluator is imported) having been run.
    from pyspark.ml.evaluation import RegressionEvaluator

    # Train a DecisionTree model with the estimator's default settings.
    dt = DecisionTreeRegressor()
    model = dt.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Compare (prediction, true label) once per metric.
    reports = (
        ("rmse", "Root Mean Squared Error (RMSE) on test data = %g"),
        ("r2", "R2 on test data = %g"),
    )
    for metric, message in reports:
        evaluator = RegressionEvaluator(
            labelCol="label", predictionCol="prediction", metricName=metric)
        print(message % evaluator.evaluate(predictions))

    return model

### Read and prepare data, run models

In [2]:
from utils import *

SGEMM dataset

In [4]:
# Read the SGEMM CSV (first row is the header) into a DataFrame, letting Spark
# infer the column types.
sgemm = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', delimiter=',', inferSchema='true')
    .load('Datasets/SGEMM/sgemm_product_dataset/sgemm_product.csv')
)

In [14]:
# Collapse the four measured runtimes into their mean, stored in a column named 'Run'.
run_total = sgemm['Run1 (ms)'] + sgemm['Run2 (ms)'] + sgemm['Run3 (ms)'] + sgemm['Run4 (ms)']
sgemm = sgemm.withColumn('Run', run_total / 4)

In [19]:
# SGEMM experiment configuration: predict the averaged runtime column.
dataset_name = 'sgemm'
dataset = sgemm
labelcol = "Run"
categoricalColumns = []
# Every column except the last five is used as a ready-made feature.
# NOTE(review): presumably the dropped five are the four 'RunN (ms)' columns
# plus the derived 'Run' column — confirm against the CSV's column order.
readycols = dataset.columns[:-5]

trainingData, testData = Prepare(dataset, labelcol, readycols, categoricalColumns)

In [26]:
# Fit and evaluate the linear model, then persist it. write().overwrite()
# replaces a model saved by an earlier run; a plain save() raises when the
# target path already exists, which breaks re-running the notebook.
model = TrainLR(trainingData, testData)
model.write().overwrite().save('Models/' + dataset_name + '_LR')

numIterations: 15
RMSE: 34455.785291
r2: 0.235681
Root Mean Squared Error (RMSE) on test data = 34378.8
R2 on test data = 0.239658


In [28]:
# Fit and evaluate the decision tree, then persist it. write().overwrite()
# replaces a model saved by an earlier run; a plain save() raises when the
# target path already exists, which breaks re-running the notebook.
model = TrainDT(trainingData, testData)
model.write().overwrite().save('Models/' + dataset_name + '_DT')

Root Mean Squared Error (RMSE) on test data = 32522.8
R2 on test data = 0.319537


Year (YearPredictionMSD) dataset

In [29]:
# Read the headerless YearPredictionMSD file as comma-separated values into a
# DataFrame, letting Spark infer the column types.
year = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='false', delimiter=',', inferSchema='true')
    .load('Datasets/Year/YearPredictionMSD.txt')
)

In [31]:
# Year experiment configuration: the label is column "_c0" and every column
# after the first is used as a ready-made feature.
dataset_name = 'year'
dataset = year
labelcol = "_c0"
categoricalColumns = []
readycols = dataset.columns[1:]

trainingData, testData = Prepare(dataset, labelcol, readycols, categoricalColumns)

In [32]:
# Fit and evaluate the linear model, then persist it. write().overwrite()
# replaces a model saved by an earlier run; a plain save() raises when the
# target path already exists, which breaks re-running the notebook.
model = TrainLR(trainingData, testData)
model.write().overwrite().save('Models/' + dataset_name + '_LR')

numIterations: 16
RMSE: 10.068330
r2: 0.190657
Root Mean Squared Error (RMSE) on test data = 10.0254
R2 on test data = 0.192908


In [33]:
# Fit and evaluate the decision tree, then persist it. write().overwrite()
# replaces a model saved by an earlier run; a plain save() raises when the
# target path already exists, which breaks re-running the notebook.
model = TrainDT(trainingData, testData)
model.write().overwrite().save('Models/' + dataset_name + '_DT')

Root Mean Squared Error (RMSE) on test data = 10.2704
R2 on test data = 0.152967
