# Install Java and Spark (pre-built for Hadoop 3)

In [None]:
# install java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# install spark (change the version number if needed)
!wget -q https://downloads.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
# unzip the spark file to the current folder
!tar xf spark-3.3.2-bin-hadoop3.tgz

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:9 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,681 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:11 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Get:12 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1,343 kB]
Hit:13 http://ppa.launchpad.net/ubuntugis/ppa/ub

In [None]:
# Export the Java and Spark install locations as environment variables.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.2-bin-hadoop3",
})


In [None]:
!pip install findspark
import findspark
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


# Creating a SparkSession in Python

In [None]:
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) a SparkSession that runs Spark locally.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Introduction to Spark")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Import necessary libraries
from pyspark.sql.functions import col, column, expr
from pyspark.sql import functions as f

# Answer the questions

0- Load the data files

In [None]:
!git clone https://github.com/20127304-AQ/Spark_exercises.git

Cloning into 'Spark_exercises'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (10/10), 815.55 KiB | 1.96 MiB/s, done.


In [None]:
# Read the mushroom CSV, using its first row as the column names.
df_mushrooms = spark.read.option("header", True).csv("Spark_exercises/Data/mushrooms.csv")

In [None]:
# Preview the first 10 rows (long cell values are truncated).
df_mushrooms.show(n=10, truncate=True)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [None]:
# 1. Prepare the train and test sets following the ratio 80:20
# 2. Build a decision tree model on the training set
# 3. Build a random forest model on the training set
# 4. Evaluate the two models on the same test set
# 5. Use a pipeline to simultaneously conduct the above experiments.

from pyspark.ml.classification import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def evaluate(pred, metric):
  """Score a prediction DataFrame with a MulticlassClassificationEvaluator.

  Args:
    pred: DataFrame with a 'prediction' column and a 'class_index' label column.
    metric: metric name passed to the evaluator as `metricName` (e.g. 'accuracy').

  Returns:
    The metric value rounded to two decimal places.
  """
  scorer = MulticlassClassificationEvaluator(
      metricName = metric,
      labelCol = 'class_index',
      predictionCol = 'prediction',
  )
  score = scorer.evaluate(pred)
  return round(score, 2)

# Fixed seed so the split and both tree models are reproducible across runs;
# without it every run trains and evaluates on a different partition.
SEED = 42

# 80:20 train/test split.
train, test = df_mushrooms.randomSplit([.8, .2], seed=SEED)

columns = df_mushrooms.columns
indexed_col = [c+"_index" for c in columns]
# Every indexed column except the label is a feature.
features_col = [c for c in indexed_col if c != "class_index"]

# The label gets its own indexer with the default (strict) invalid handling,
# so the number of classes recorded for the label is not inflated.
label_indexer = StringIndexer(
    inputCol = "class",
    outputCol = "class_index"
)

# Feature columns are indexed with handleInvalid="keep": the indexer is fitted
# on `train` only, so a category that occurs only in `test` would otherwise
# make the job fail. "keep" maps such values to an extra index.
indexer = StringIndexer(
    inputCols = [c for c in columns if c != "class"],
    outputCols = features_col,
    handleInvalid = "keep"
)

# Combine the indexed feature columns into a single vector column.
vectorizer = VectorAssembler(
    inputCols = features_col,
    outputCol = 'features'
)

model1 = DecisionTreeClassifier(
    labelCol ='class_index', featuresCol='features', seed=SEED
)

model2 = RandomForestClassifier(
    labelCol = 'class_index', featuresCol='features', seed=SEED
)

# Fit one pipeline per classifier on the same training set and score it on the
# same test set; the model name is printed so the two results are distinguishable.
for model in [model1, model2]:
  pipeline = Pipeline(stages=[label_indexer, indexer, vectorizer, model]).fit(train)
  prediction = pipeline.transform(test)
  print(type(model).__name__, "Acc: ", evaluate(prediction, 'accuracy'))

Acc:  1.0
Acc:  1.0
