In [None]:
# Install spark on Google Collab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.mirror.amaze.com.au/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
# Set the environment variables that tell Spark where Java and the Spark distribution live
import os
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
})

In [None]:
# Initialize the Spark session
import findspark
# findspark.init() has to run before the pyspark import below
# (presumably it is what makes the unpacked Spark's pyspark importable - confirm against findspark docs).
findspark.init()
from pyspark.sql import SparkSession

# Local mode; "local[*]" is the master URL passed to the builder.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()
# Bare expression so the notebook renders the session summary
spark

In [None]:
# Mount the Google Drive filesystem so files stored there are readable from the notebook
from google.colab import drive
drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)

Mounted at /content/gdrive


In [None]:
# Import relevant packages
from pyspark.sql.functions import monotonically_increasing_id, desc, col
from pyspark.ml.feature import RFormula
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Load the SUSY CSV into a Spark DataFrame, name its 19 columns, and print the schema
susy_csv_path = "/content/gdrive/My Drive/Colab Notebooks/SUSY.csv"

# Column names are applied positionally by toDF(): the label first, then 18 features.
# NOTE(review): "cos_thera_r1" looks like a typo for "cos_theta_r1"; kept unchanged here
# because it is a runtime column name - confirm before renaming.
susy_columns = [
    "class",
    "lepton_1_pT", "lepton_1_eta", "lepton_1_phi",
    "lepton_2_pT", "lepton_2_eta", "lepton_2_phi",
    "missing_energy_magnitude", "missing_energy_phi",
    "MET_rel", "axial_MET",
    "M_R", "M_TR_2", "R", "MT2", "S_R", "M_DELTA_R",
    "dPhi_r_b", "cos_thera_r1",
]

df_SUSY = (
    spark.read
    .format("csv")
    .option("inferSchema", True)  # let Spark infer column types
    .load(susy_csv_path)
    .cache()                      # cache is requested on the frame before the rename
    .toDF(*susy_columns)
)

df_SUSY.printSchema()

root
 |-- class: double (nullable = true)
 |-- lepton_1_pT: double (nullable = true)
 |-- lepton_1_eta: double (nullable = true)
 |-- lepton_1_phi: double (nullable = true)
 |-- lepton_2_pT: double (nullable = true)
 |-- lepton_2_eta: double (nullable = true)
 |-- lepton_2_phi: double (nullable = true)
 |-- missing_energy_magnitude: double (nullable = true)
 |-- missing_energy_phi: double (nullable = true)
 |-- MET_rel: double (nullable = true)
 |-- axial_MET: double (nullable = true)
 |-- M_R: double (nullable = true)
 |-- M_TR_2: double (nullable = true)
 |-- R: double (nullable = true)
 |-- MT2: double (nullable = true)
 |-- S_R: double (nullable = true)
 |-- M_DELTA_R: double (nullable = true)
 |-- dPhi_r_b: double (nullable = true)
 |-- cos_thera_r1: double (nullable = true)



In [None]:
# Split the data into a training set and a test set.
# The test set consists of the last TEST_SET_SIZE rows; the training set is every row before them.
TEST_SET_SIZE = 500000

# monotonically_increasing_id() produces increasing but not consecutive ids, so they are
# only used for ordering and comparison, never as 0..N-1 row numbers.
indexed_SUSY = df_SUSY.withColumn("index", monotonically_increasing_id())
test_indexed = indexed_SUSY.orderBy(desc("index")).limit(TEST_SET_SIZE)

# Smallest id that landed in the test set. This runs a Spark job.
first_test_index = test_indexed.agg({"index": "min"}).first()[0]

# Derive the training set from the same index as the test set, instead of an unordered,
# hard-coded limit(4500000): the two sets can then never overlap, whatever the row count of the file.
train_SUSY = indexed_SUSY.filter(col("index") < first_test_index).drop("index")
test_SUSY = test_indexed.drop("index")

In [None]:
# Create the Spark training pipeline: RFormula feature assembly followed by logistic regression.
# "class ~ ." takes `class` as the label and every other column as a feature.
rForm = RFormula(formula="class ~ .")
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

# Parameter tuning: 3 elastic-net mixes x 2 regularization strengths = 6 candidate models
params = ParamGridBuilder()\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()

# Use area under ROC as the evaluation metric.
# The evaluator must read the model's continuous scores ("rawPrediction"), not the hard 0/1
# "prediction" column: with only two distinct score values the ROC curve degenerates to a
# single operating point and the reported area understates the model's ranking quality.
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("rawPrediction")\
    .setLabelCol("label")

# Split the training dataset into two different groups - training data (75%) and validation data (25%).
# A fixed seed makes the train/validation split, and so the selected model, reproducible.
tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)\
    .setSeed(42)

In [None]:
# Train one tuned logistic regression model per subset of the training data
tvsFitteds = []
# Split the dataframe into five equally weighted subsets, one per classifier.
# The seed fixes which rows fall into which subset so that reruns train on the same data.
dfs = train_SUSY.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed=42)

for df_i in dfs:
  # Fits every parameter combination in the grid and keeps the best one as bestModel
  tvsFitted = tvs.fit(df_i)
  # Print out the resulting model after training
  print(type(tvsFitted))
  print(tvsFitted.bestModel)
  print(tvsFitted.bestModel.stages)
  # Add the classifier to the list of classifiers
  tvsFitteds.append(tvsFitted)

<class 'pyspark.ml.tuning.TrainValidationSplitModel'>
PipelineModel_24370f6b1296
[RFormulaModel: uid=RFormula_59063e31b2ca, resolvedFormula=ResolvedRFormula(label=class, terms=[lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_DELTA_R,dPhi_r_b,cos_thera_r1], hasIntercept=true), LogisticRegressionModel: uid=LogisticRegression_99162fba90c1, numClasses=2, numFeatures=18]
<class 'pyspark.ml.tuning.TrainValidationSplitModel'>
PipelineModel_48f4d0b296d2
[RFormulaModel: uid=RFormula_59063e31b2ca, resolvedFormula=ResolvedRFormula(label=class, terms=[lepton_1_pT,lepton_1_eta,lepton_1_phi,lepton_2_pT,lepton_2_eta,lepton_2_phi,missing_energy_magnitude,missing_energy_phi,MET_rel,axial_MET,M_R,M_TR_2,R,MT2,S_R,M_DELTA_R,dPhi_r_b,cos_thera_r1], hasIntercept=true), LogisticRegressionModel: uid=LogisticRegression_99162fba90c1, numClasses=2, numFeatures=18]
<class 'pyspark.ml.tuning.TrainValid

In [None]:
# Score each trained model on the held-out test set and report its area under ROC
for tvsFitted in tvsFitteds:
  # Apply the best pipeline found during tuning to the test rows
  test_predictions = tvsFitted.bestModel.transform(test_SUSY)
  test_auc = evaluator.evaluate(test_predictions)
  print('The area under ROC of the model with the test set is: ', test_auc)

The area under ROC of the model with the test set is:  0.7496154274996407
The area under ROC of the model with the test set is:  0.7498346298756671
The area under ROC of the model with the test set is:  0.7496480146641429
The area under ROC of the model with the test set is:  0.7500984683473715
The area under ROC of the model with the test set is:  0.7498730746923208
