# Install Java and Spark on Hadoop

In [None]:
# install java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# install spark (change the version number if needed)
!wget -q https://downloads.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
# unzip the spark file to the current folder
!tar xf spark-3.3.2-bin-hadoop3.tgz

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Get:5 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:10 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,681 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [3,158 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Hit:13 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu

In [None]:
# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"


In [None]:
!pip install findspark
import findspark
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


# Creating a SparkSession in Python

In [None]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local")\
          .appName("Introduction to Spark")\
          .config("spark.some.config.option", "some-value")\
          .getOrCreate()

In [None]:
# Import necessary libraries
from pyspark.sql.functions import col, column, expr
from pyspark.sql import functions as f

# Answer the questions

0- Load the data files

In [None]:
!git clone https://github.com/20127304-AQ/Spark_exercises.git

Cloning into 'Spark_exercises'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (10/10), 815.55 KiB | 1.53 MiB/s, done.


In [None]:
df_iris = spark.read.csv("Spark_exercises/Data/iris.csv", header = True ) 

In [None]:
df_iris.show(10, truncate=True)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
+---+-------------+------------+-------------+------------+-----

In [None]:
#1. Cluster the given examples by using k-means clustering with k = 2, 3, and 5.
from pyspark.ml.clustering import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
columns = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm'
]
indexed_cols = [c+"_index" for c in columns]

indexer = StringIndexer(
    inputCols = columns,
    outputCols = indexed_cols
)

vectorizer = VectorAssembler(
    inputCols = indexed_cols,
    outputCol='features'
)

def clustering(data, nCluster):
  kmeans = KMeans(
      k = nCluster
  )

  pipeline = Pipeline(stages = [indexer, vectorizer, kmeans]).fit(data)
  prediction = pipeline.transform(data)
  return prediction

predictions = []
for nCluster in [2, 3, 5]:
  predictions.append(clustering(df_iris, nCluster))
  predictions[-1].select('features', 'prediction').show(3)

+------------------+----------+
|          features|prediction|
+------------------+----------+
| [1.0,9.0,1.0,0.0]|         1|
| [8.0,0.0,1.0,0.0]|         1|
|[24.0,2.0,4.0,0.0]|         1|
+------------------+----------+
only showing top 3 rows

+------------------+----------+
|          features|prediction|
+------------------+----------+
| [1.0,9.0,1.0,0.0]|         1|
| [8.0,0.0,1.0,0.0]|         1|
|[24.0,2.0,4.0,0.0]|         1|
+------------------+----------+
only showing top 3 rows

+------------------+----------+
|          features|prediction|
+------------------+----------+
| [1.0,9.0,1.0,0.0]|         1|
| [8.0,0.0,1.0,0.0]|         0|
|[24.0,2.0,4.0,0.0]|         0|
+------------------+----------+
only showing top 3 rows



In [None]:
#2. Consider the clustering with k = 2 done above. For each cluster, count the number of examples that belong to each of the three species. 
def countClusterSamples(data):
  return data.groupBy('prediction').count().orderBy('prediction')

countClusterSamples(predictions[0]).show(3)

+----------+-----+
|prediction|count|
+----------+-----+
|         0|   47|
|         1|  103|
+----------+-----+



In [None]:
#3. Repeat the counting above for other values of k.
for p in predictions:
  countClusterSamples(p).show(5)

+----------+-----+
|prediction|count|
+----------+-----+
|         0|   47|
|         1|  103|
+----------+-----+

+----------+-----+
|prediction|count|
+----------+-----+
|         0|   32|
|         1|   96|
|         2|   22|
+----------+-----+

+----------+-----+
|prediction|count|
+----------+-----+
|         0|   39|
|         1|   64|
|         2|   20|
|         3|    6|
|         4|   21|
+----------+-----+

