# set up environment

In [38]:
# !pip install -r requirements.txt
# !pip install --upgrade pip

In [39]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from pyspark.ml.feature import StringIndexer, VectorAssembler,OneHotEncoder
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [40]:
spark = SparkSession.builder.appName("bigdata-project_withSparkML").getOrCreate()

In [41]:
df = spark.read.csv("./data/Training.csv", header=True, inferSchema=True)
# test = spark.read.csv("./data/Testing.csv", header=True, inferSchema=True)


In [42]:
# type(train), type(test)

# EDA

drop unwanted column 

In [43]:
df = df.drop('_c133')
# test = test.drop('_c133')

In [44]:
null_counts = df.select(
    [sf.sum(sf.col(c).isNull().cast("int")).alias(c) for c in df.columns]
)

null_counts.show()

+-------+---------+--------------------+-------------------+---------+------+----------+------------+-------+----------------+--------------+--------+-------------------+-------------------+-------+-----------+-------+--------------------+-----------+-----------+------------+--------+-----------------+---------------------+-----+----------+-----------+--------------+--------+-----------+-----------+--------+--------------+----------+------+----------------+--------------------+---------+------------+--------------+---------+----------+------------+-----------------+-------------------+----------------+-------------------+-------------------+-------+----------------------------+------+-----------------+---------------+--------------+----------+----------+----------+-----------------+---------------+---------------------------+-------------------+------------+------------------+---------+---------+------+--------+-------+------------+---------------------+-------------------+------------

In [45]:
df.describe().show()

+-------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+-------------------+---------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+--------------------+--------------------+------------------+-------------------+-------------------+--------------------+-------------------+--------------------+----------------+--------------------+-------------------+------------------

# Preprocess

dataset has no null values and consists only of categorical data (no need for fillna, scaling, or one-hot encoding)
so we skip that in preprocess

In [46]:
# turn label column to numeric
labelIndexer = StringIndexer(inputCol="prognosis", outputCol="label")

df = labelIndexer.fit(df).transform(df)
df = df.drop('prognosis')
train, test = df.randomSplit([0.8, 0.2], seed=42)



In [47]:
test.show()

+-------+---------+--------------------+-------------------+---------+------+----------+------------+-------+----------------+--------------+--------+-------------------+-------------------+-------+-----------+-------+--------------------+-----------+-----------+------------+--------+-----------------+---------------------+-----+----------+-----------+--------------+--------+-----------+-----------+--------+--------------+----------+------+----------------+--------------------+---------+------------+--------------+---------+----------+------------+-----------------+-------------------+----------------+-------------------+-------------------+-------+----------------------------+------+-----------------+---------------+--------------+----------+----------+----------+-----------------+---------------+---------------------------+-------------------+------------+------------------+---------+---------+------+--------+-------+------------+---------------------+-------------------+------------

In [48]:
train.show(5)

+-------+---------+--------------------+-------------------+---------+------+----------+------------+-------+----------------+--------------+--------+-------------------+-------------------+-------+-----------+-------+--------------------+-----------+-----------+------------+--------+-----------------+---------------------+-----+----------+-----------+--------------+--------+-----------+-----------+--------+--------------+----------+------+----------------+--------------------+---------+------------+--------------+---------+----------+------------+-----------------+-------------------+----------------+-------------------+-------------------+-------+----------------------------+------+-----------------+---------------+--------------+----------+----------+----------+-----------------+---------------+---------------------------+-------------------+------------+------------------+---------+---------+------+--------+-------+------------+---------------------+-------------------+------------

# class to train and evaluate data

In [49]:

class ModelComparisonPipeline:
    def __init__(self, spark_session, train, test, feature_columns: list, label_column: str):
        self.spark = spark_session
        self.models = []  # Store models for comparison
        self.train = train
        self.test = test
        self.feature_columns = feature_columns
        self.label_column = label_column

    def _build_pipeline(self, model):
        # Step 1: Assemble features
        assembler = VectorAssembler(
            inputCols=self.feature_columns, outputCol="features"
        )
        # Step 2: Build pipeline with assembler and model
        pipeline = Pipeline(stages=[assembler, model])
        return pipeline

    def compare_models(self, models_with_params):
        # Split the data into training and testing sets
        train_df, test_df = self.train, self.test

        # Initialize evaluator
        evaluator = MulticlassClassificationEvaluator(
            labelCol=self.label_column, predictionCol="prediction", metricName="accuracy"
        )

        # Iterate through models and evaluate
        results = []
        for model_name, model in models_with_params:
            print(f"Training and evaluating {model_name}...")

            # Build pipeline with the current model
            pipeline = self._build_pipeline(model)

            # Train the model
            trained_model = pipeline.fit(train_df)

            # Evaluate the model
            predictions = trained_model.transform(test_df)
            accuracy = evaluator.evaluate(predictions)
            print(f"{model_name} Accuracy: {accuracy:.2f}")

            # Store results
            results.append((model_name, accuracy, trained_model))

        # Return sorted results by accuracy
        results.sort(key=lambda x: x[1], reverse=True)
        return results

In [50]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

# Decision Tree Classifier
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
# dt_param_grid = ParamGridBuilder() \
#     .addGrid(dt.maxDepth, [2, 5, 10]) \
#     .addGrid(dt.maxBins, [10, 20, 40]) \
#     .build()

# Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
# rf_param_grid = ParamGridBuilder() \
#     .addGrid(rf.numTrees, [10, 50, 100]) \
#     .addGrid(rf.maxDepth, [5, 10, 15]) \
#     .build()

# List of models with their parameter grids
models_with_params = [
    ("Decision Tree", dt),
    ("Random Forest", rf)
]

In [51]:
# Instantiate the pipeline class
ml_pipeline = ModelComparisonPipeline(spark,train,test,train.columns[:-1], "label")

# Run the comparison
results = ml_pipeline.compare_models( models_with_params)

# Display the results
print("\nModel Comparison Results:")
for model_name, accuracy, _ in results:
    print(f"{model_name}: {accuracy:.2f}")

Training and evaluating Decision Tree...
Decision Tree Accuracy: 0.11
Training and evaluating Random Forest...
Random Forest Accuracy: 0.78

Model Comparison Results:
Random Forest: 0.78
Decision Tree: 0.11
