In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.4-bin-hadoop3.2"

In [3]:
import findspark
findspark.init()
findspark.find()

'/content/spark-3.2.4-bin-hadoop3.2'

In [9]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Initialize Spark Session
spark = SparkSession.builder.appName("ClassificationExample").getOrCreate()

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.csv("leaf.csv", header=False, inferSchema=True)

In [11]:
data.show(5)

+---+---+-------+------+-------+-------+-------+-------+---------+---------+---------+--------+---------+---------+---------+-------+
|_c0|_c1|    _c2|   _c3|    _c4|    _c5|    _c6|    _c7|      _c8|      _c9|     _c10|    _c11|     _c12|     _c13|     _c14|   _c15|
+---+---+-------+------+-------+-------+-------+-------+---------+---------+---------+--------+---------+---------+---------+-------+
|  1|  1|0.72694|1.4742|0.32396|0.98535|    1.0|0.83592|0.0046566|0.0039465|  0.04779| 0.12795| 0.016108|0.0052323|2.7477E-4| 1.1756|
|  1|  2|0.74173|1.5257|0.36116|0.98152|0.99825|0.79867|0.0052423|0.0050016|  0.02416|0.090476|0.0081195| 0.002708|7.4846E-5|0.69659|
|  1|  3|0.76722|1.5725|0.38998|0.97755|    1.0|0.80812|0.0074573| 0.010121| 0.011897|0.057445|0.0032891|9.2068E-4|3.7886E-5|0.44348|
|  1|  4|0.73797|1.4597|0.35376|0.97566|    1.0|0.81697|0.0068768|0.0086068|  0.01595|0.065491|0.0042707|0.0011544|6.6272E-5|0.58785|
|  1|  5|0.82301|1.7707|0.44462|0.97698|    1.0|0.75493| 0.007

In [13]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Initialize Spark Session
spark = SparkSession.builder.appName("ClassificationExample").getOrCreate()

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.csv("leaf.csv", header=False, inferSchema=True)

# Rename the columns to the appropriate names
old_columns = data.columns
new_columns = [
    "Class", "SpecimenNumber", "Eccentricity", "AspectRatio", "Elongation",
    "Solidity", "StochasticConvexity", "IsoperimetricFactor", "MaximalIndentationDepth",
    "Lobedness", "AverageIntensity", "AverageContrast", "Smoothness",
    "ThirdMoment", "Uniformity", "Entropy"
]

for old, new in zip(old_columns, new_columns):
    data = data.withColumnRenamed(old, new)

# VectorAssembler to combine feature columns into a single vector column
assembler = VectorAssembler(
    inputCols=[
        "SpecimenNumber", "Eccentricity", "AspectRatio", "Elongation",
        "Solidity", "StochasticConvexity", "IsoperimetricFactor", "MaximalIndentationDepth",
        "Lobedness", "AverageIntensity", "AverageContrast", "Smoothness",
        "ThirdMoment", "Uniformity", "Entropy"
    ],
    outputCol="features"
)

# Prepare the models with their parameters
rf = RandomForestClassifier(labelCol="Class", featuresCol="features")
dt = DecisionTreeClassifier(labelCol="Class", featuresCol="features")
lr = LogisticRegression(labelCol="Class", featuresCol="features", maxIter=10)

# ParamGrid for cross validation
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10]) \
    .build()

dt_paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(dt.minInstancesPerNode, [1, 2, 4]) \
    .build()

lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")

# CrossValidator
crossval_rf = CrossValidator(estimator=rf,
                             estimatorParamMaps=rf_paramGrid,
                             evaluator=evaluator,
                             numFolds=5)

crossval_dt = CrossValidator(estimator=dt,
                             estimatorParamMaps=dt_paramGrid,
                             evaluator=evaluator,
                             numFolds=5)

crossval_lr = CrossValidator(estimator=lr,
                             estimatorParamMaps=lr_paramGrid,
                             evaluator=evaluator,
                             numFolds=5)

# Pipelines
pipeline_rf = Pipeline(stages=[assembler, crossval_rf])
pipeline_dt = Pipeline(stages=[assembler, crossval_dt])
pipeline_lr = Pipeline(stages=[assembler, crossval_lr])

# Train the models
model_rf = pipeline_rf.fit(data)
model_dt = pipeline_dt.fit(data)
model_lr = pipeline_lr.fit(data)

# Make predictions and evaluate accuracy
accuracy_rf = evaluator.evaluate(model_rf.transform(data))
accuracy_dt = evaluator.evaluate(model_dt.transform(data))
accuracy_lr = evaluator.evaluate(model_lr.transform(data))

# Fetch the best model's parameters
best_rf_model = model_rf.stages[-1].bestModel
best_dt_model = model_dt.stages[-1].bestModel
best_lr_model = model_lr.stages[-1].bestModel

print("Random Forest Classifier Best Params:", best_rf_model.extractParamMap())
print("Decision Tree Classifier Best Params:", best_dt_model.extractParamMap())
print("Logistic Regression Classifier Best Params:", best_lr_model.extractParamMap())

# Results Table
results = [
    ("Random Forest Classifier", str(best_rf_model.extractParamMap()), accuracy_rf),
    ("Decision Tree Classifier", str(best_dt_model.extractParamMap()), accuracy_dt),
    ("Logistic Regression Classifier", str(best_lr_model.extractParamMap()), accuracy_lr)
]

results_df = spark.createDataFrame(results, ["Method", "Parameters", "Accuracy"])
results_df.show()

# Stop the Spark session
spark.stop()


Random Forest Classifier Best Params: {Param(parent='RandomForestClassifier_74051f6bbcb2', name='bootstrap', doc='Whether bootstrap samples are used when building trees.'): True, Param(parent='RandomForestClassifier_74051f6bbcb2', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False, Param(parent='RandomForestClassifier_74051f6bbcb2', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10, Param(parent='RandomForestClassifier_74051f6bbcb2', name='featureSubsetStrategy', doc="The number of features to consider

In [36]:
best_rf_params = best_rf_model.extractParamMap()
best_dt_params = best_dt_model.extractParamMap()
best_lr_params = best_lr_model.extractParamMap()


# Results Table
results = [
    ("Random Forest Classifier", {param.name: best_rf_params[param] for param in best_rf_params}, accuracy_rf),
    ("Decision Tree Classifier", {param.name: best_dt_params[param] for param in best_dt_params}, accuracy_dt),
    ("Logistic Regression Classifier", {param.name: best_lr_params[param] for param in best_lr_params}, accuracy_lr)
]

# Stop the Spark session
spark.stop()

In [37]:
results

[('Random Forest Classifier',
  {'bootstrap': True,
   'cacheNodeIds': False,
   'checkpointInterval': 10,
   'featureSubsetStrategy': 'auto',
   'featuresCol': 'features',
   'impurity': 'gini',
   'labelCol': 'Class',
   'leafCol': '',
   'maxBins': 32,
   'maxDepth': 10,
   'maxMemoryInMB': 256,
   'minInfoGain': 0.0,
   'minInstancesPerNode': 1,
   'minWeightFractionPerNode': 0.0,
   'numTrees': 30,
   'predictionCol': 'prediction',
   'probabilityCol': 'probability',
   'rawPredictionCol': 'rawPrediction',
   'seed': 35926711538001616,
   'subsamplingRate': 1.0},
  0.9941176470588236),
 ('Decision Tree Classifier',
  {'cacheNodeIds': False,
   'checkpointInterval': 10,
   'featuresCol': 'features',
   'impurity': 'gini',
   'labelCol': 'Class',
   'leafCol': '',
   'maxBins': 32,
   'maxDepth': 10,
   'maxMemoryInMB': 256,
   'minInfoGain': 0.0,
   'minInstancesPerNode': 1,
   'minWeightFractionPerNode': 0.0,
   'predictionCol': 'prediction',
   'probabilityCol': 'probability',
  