## COM6012 - 2017:  Practical Quiz 2 

Use logistic regression to perform classification on the [spam](./files/spambase.data) dataset. To test your algorithm, split the samples into two datasets: one for training (70% of the samples) and one for testing (30% of the samples). Use a regularisation parameter of 0.01 and an elastic net parameter of 0.1.

The marks will be assigned like this:
1. Loading the data: 1 Mark
2. Performing the training stage: 2 Marks
3. Performing the testing stage: 2 Marks

Provide your solution in the Notebook.

## Solution

In [1]:
// Spark release used for every Spark artifact requested by this notebook.
val sparkVersion: String = "2.0.1"
// Version number of the Scala library this kernel is running on.
val scalaVersion: String = scala.util.Properties.versionNumberString

[36msparkVersion[0m: [32mString[0m = [32m"2.0.1"[0m
[36mscalaVersion[0m: [32mString[0m = [32m"2.11.8"[0m

In [2]:
// Add the Spark modules this notebook needs to the session classpath.
// Both coordinates reuse `sparkVersion`, so the two modules stay on the same release.
// NOTE(review): `classpath` and the `%%` / `%` coordinate syntax are provided by the
// notebook kernel, not by this file; `%%` presumably appends the Scala binary
// version to the artifact name -- confirm against the kernel's documentation.
classpath.add(
    "org.apache.spark" %% "spark-yarn" % sparkVersion,
    "org.apache.spark" %% "spark-mllib" % sparkVersion
)

146 new artifact(s)


146 new artifacts in macro
146 new artifacts in runtime
146 new artifacts in compile




In [3]:
// The usual imports
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Row
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

[32mimport [36morg.apache.spark.sql.SparkSession[0m
[32mimport [36morg.apache.spark.ml.classification.LogisticRegression[0m
[32mimport [36morg.apache.spark.ml.linalg.Vectors[0m
[32mimport [36morg.apache.spark.ml.param.ParamMap[0m
[32mimport [36morg.apache.spark.sql.Row[0m
[32mimport [36morg.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}[0m
[32mimport [36morg.apache.spark.ml.Pipeline[0m
[32mimport [36morg.apache.spark.ml.evaluation.MulticlassClassificationEvaluator[0m

In [4]:
// Entry point to Spark used by every later cell.
// "local[1]" runs Spark inside this process on a single worker thread;
// getOrCreate() reuses an already-running session when there is one.
val sparkSession = SparkSession.builder()
  .appName("Logistic Regression")
  .master("local[1]")
  .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
17/03/23 12:18:16 INFO SparkContext: Running Spark version 2.0.1
17/03/23 12:18:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/03/23 12:18:17 INFO SecurityManager: Changing view acls to: 5e62ac24ea96411ca3b6bebb2b072296
17/03/23 12:18:17 INFO SecurityManager: Changing modify acls to: 5e62ac24ea96411ca3b6bebb2b072296
17/03/23 12:18:17 INFO SecurityManager: Changing view acls groups to: 
17/03/23 12:18:17 INFO SecurityManager: Changing modify acls groups to: 
17/03/23 12:18:17 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(5e62ac24ea96411ca3b6bebb2b072296); groups with view permissions: Set(); users  with modify permissions: Set(5e62ac24ea96411ca3b6bebb2b072296); groups with modify permissions: Set()
17/03/23 12:18:19 INFO Utils: Successfully started service 

[36msparkSession[0m: [32mSparkSession[0m = org.apache.spark.sql.SparkSession@1673c6bf

In [5]:
// Read the raw dataset file as an RDD holding one String per line.
val text = {
  val sc = sparkSession.sparkContext
  sc.textFile("files/spambase.data")
}

[36mtext[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32mrdd[0m.[32mRDD[0m[[32mString[0m] = files/spambase.data MapPartitionsRDD[1] at textFile at Main.scala:25

In [6]:
// Parse every comma-separated line into an array of doubles.
// Lines are trimmed and blank ones dropped first: "".toDouble throws a
// NumberFormatException, so a stray empty line (e.g. at the end of the file)
// would otherwise fail the whole job once an action is run.
val data = text
  .map(_.trim)
  .filter(_.nonEmpty)
  .map(_.split(',').map(_.toDouble))

[36mdata[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32mrdd[0m.[32mRDD[0m[[32mArray[0m[[32mDouble[0m]] = MapPartitionsRDD[2] at map at Main.scala:25

In [7]:
// Organise each parsed row into a (label, features) pair:
// the value at index 57 is the label, the first 57 values form the feature vector.
val dataLP = data.map { row =>
  val label = row(57)
  val features = Vectors.dense(row.take(57))
  (label, features)
}

[36mdataLP[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32mrdd[0m.[32mRDD[0m[([32mDouble[0m, [32morg[0m.[32mapache[0m.[32mspark[0m.[32mml[0m.[32mlinalg[0m.[32mVector[0m)] = MapPartitionsRDD[3] at map at Main.scala:26

In [8]:
// Convert the (label, features) pairs into a DataFrame and name its two columns.
val dataDF = {
  val unnamed = sparkSession.createDataFrame(dataLP)
  unnamed.toDF("label", "features")
}

[36mdataDF[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32msql[0m.[32mpackage[0m.[32mDataFrame[0m = [label: double, features: vector]

In [9]:
// Index the labels: fit a StringIndexer on the full DataFrame that reads
// "label" and writes the corresponding index to "indexedLabel".
val labelIndexer = {
  val indexer = new StringIndexer()
  indexer.setInputCol("label")
  indexer.setOutputCol("indexedLabel")
  indexer.fit(dataDF)
}

[36mlabelIndexer[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32mml[0m.[32mfeature[0m.[32mStringIndexerModel[0m = strIdx_1f1edaa15a0a

In [10]:
// Index the feature vector: fit a VectorIndexer on the full DataFrame that reads
// "features" and writes "indexedFeatures". Features with at most 4 distinct
// values are treated as categorical; all others are left as continuous.
val featureIndexer = new VectorIndexer()
  .setMaxCategories(4)
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .fit(dataDF)

[36mfeatureIndexer[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32mml[0m.[32mfeature[0m.[32mVectorIndexerModel[0m = vecIdx_c07e8bf653ef

In [11]:
// Split into training (about 70% of the rows) and testing (about 30%) data.
// The fixed seed makes the split -- and therefore the test error computed
// from it -- reproducible from one run to the next.
val splits = dataDF.randomSplit(Array(0.7, 0.3), seed = 6012L)
val (trainingData, testData) = (splits(0), splits(1))

[36msplits[0m: [32mArray[0m[[32morg[0m.[32mapache[0m.[32mspark[0m.[32msql[0m.[32mDataset[0m[[32mRow[0m]] = [33mArray[0m([label: double, features: vector], [label: double, features: vector])
[36mtrainingData[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32msql[0m.[32mDataset[0m[[32mRow[0m] = [label: double, features: vector]
[36mtestData[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32msql[0m.[32mDataset[0m[[32mRow[0m] = [label: double, features: vector]

In [12]:
// Logistic regression estimator wired to the indexed label and feature columns.
val lr = {
  val estimator = new LogisticRegression()
  estimator.setFeaturesCol("indexedFeatures")
  estimator.setLabelCol("indexedLabel")
  estimator
}

[36mlr[0m: [32mLogisticRegression[0m = logreg_bdec8c8de0ee

In [13]:
// Apply the parameters requested in the brief: regularisation 0.01 and
// elastic net 0.1. The optimiser is additionally capped at 10 iterations.
lr.setRegParam(0.01).setElasticNetParam(0.1).setMaxIter(10)

[36mres12[0m: [32mLogisticRegression[0m = logreg_bdec8c8de0ee

In [14]:
// Map numeric predictions back to the original label strings, using the
// label ordering learned by labelIndexer.
val labelConverter = new IndexToString()
  .setLabels(labelIndexer.labels)
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")

[36mlabelConverter[0m: [32mIndexToString[0m = idxToStr_991ff2c3e856

In [15]:
// Assemble the pipeline. Stages run in this order: index the labels, index
// the features, classify, then convert predictions back to label strings.
val pipeline = {
  val stages: Array[org.apache.spark.ml.PipelineStage] =
    Array(labelIndexer, featureIndexer, lr, labelConverter)
  new Pipeline().setStages(stages)
}

[36mpipeline[0m: [32mPipeline[0m = pipeline_11e78e5674ac

In [16]:
// Fit the pipeline on the training split only.
val model: org.apache.spark.ml.PipelineModel = pipeline.fit(trainingData)

[36mmodel[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32mml[0m.[32mPipelineModel[0m = pipeline_11e78e5674ac

In [17]:
// Run the fitted pipeline over the held-out test split.
val predictions: org.apache.spark.sql.DataFrame = model.transform(testData)

[36mpredictions[0m: [32morg[0m.[32mapache[0m.[32mspark[0m.[32msql[0m.[32mpackage[0m.[32mDataFrame[0m = [label: double, features: vector ... 6 more fields]

In [18]:
// Evaluator that compares "prediction" against "indexedLabel" and reports accuracy.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("accuracy")
  .setPredictionCol("prediction")
  .setLabelCol("indexedLabel")

[36mevaluator[0m: [32mMulticlassClassificationEvaluator[0m = mcEval_edfe2f6f1038

In [19]:
// Accuracy on the test predictions; the test error printed below is its complement.
val accuracy = evaluator.evaluate(predictions)
println(s"Test Error = ${1.0 - accuracy}")

Test Error = 0.08605341246290799


[36maccuracy[0m: [32mDouble[0m = [32m0.913946587537092[0m