In [None]:
!apt-get update

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [4]:
!wget -q https://mirror.linux-ia64.org/apache/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz

In [None]:
!tar -xvf spark-3.0.3-bin-hadoop2.7.tgz

In [6]:
!pip install -q findspark

In [7]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop2.7"

In [8]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

In [None]:
!wget https://datahub.io/machine-learning/iris/r/iris.csv

In [10]:
df = spark.read.csv('iris.csv', inferSchema=True, header=True)

In [11]:
df.describe().show()

+-------+------------------+-------------------+------------------+------------------+--------------+
|summary|       sepallength|         sepalwidth|       petallength|        petalwidth|         class|
+-------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|                150|               150|               150|           150|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          null|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          null|
|    min|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Iris-virginica|
+-------+------------------+-------------------+------------------+------------------+--------------+



In [14]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [17]:
indexer = StringIndexer(inputCol='class', outputCol='classInd')
indexerTrained = indexer.fit(df)
df_features = indexerTrained.transform(df)
indexerTrained.labels

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [97]:
train, test = df_features.randomSplit([0.7, 0.3], seed=2022)

In [98]:
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [99]:
train.show()

+-----------+----------+-----------+----------+---------------+--------+-------------+--------------------+
|sepallength|sepalwidth|petallength|petalwidth|          class|classInd|     classOne|            Features|
+-----------+----------+-----------+----------+---------------+--------+-------------+--------------------+
|        4.3|       3.0|        1.1|       0.1|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.3,3.0,1.1,0.1,...|
|        4.4|       3.2|        1.3|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.4,3.2,1.3,0.2,...|
|        4.5|       2.3|        1.3|       0.3|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.5,2.3,1.3,0.3,...|
|        4.6|       3.1|        1.5|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.6,3.1,1.5,0.2,...|
|        4.6|       3.6|        1.0|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.6,3.6,1.0,0.2,...|
|        4.7|       3.2|        1.3|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.7,3.2,1.3,0.2,...|
|        4.8|       3.1|    

In [100]:
assembler=VectorAssembler(inputCols=['sepallength', 'sepalwidth', 'petallength', 'classInd'], outputCol='Features')

In [101]:
from pyspark.ml import Pipeline

In [102]:
pipeline = Pipeline(stages = 
[
  StringIndexer(inputCol='class', outputCol='classInd'),
  OneHotEncoder(inputCol='classInd', outputCol = 'classOne'),
  VectorAssembler(inputCols=['sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'classInd', 'classOne'], outputCol='Features')
]
)

In [103]:
pipelineTrained = pipeline.fit(df)
pipelineTrained.transform(df).show()

+-----------+----------+-----------+----------+-----------+--------+-------------+--------------------+
|sepallength|sepalwidth|petallength|petalwidth|      class|classInd|     classOne|            Features|
+-----------+----------+-----------+----------+-----------+--------+-------------+--------------------+
|        5.1|       3.5|        1.4|       0.2|Iris-setosa|     0.0|(2,[0],[1.0])|[5.1,3.5,1.4,0.2,...|
|        4.9|       3.0|        1.4|       0.2|Iris-setosa|     0.0|(2,[0],[1.0])|[4.9,3.0,1.4,0.2,...|
|        4.7|       3.2|        1.3|       0.2|Iris-setosa|     0.0|(2,[0],[1.0])|[4.7,3.2,1.3,0.2,...|
|        4.6|       3.1|        1.5|       0.2|Iris-setosa|     0.0|(2,[0],[1.0])|[4.6,3.1,1.5,0.2,...|
|        5.0|       3.6|        1.4|       0.2|Iris-setosa|     0.0|(2,[0],[1.0])|[5.0,3.6,1.4,0.2,...|
|        5.4|       3.9|        1.7|       0.4|Iris-setosa|     0.0|(2,[0],[1.0])|[5.4,3.9,1.7,0.4,...|
|        4.6|       3.4|        1.4|       0.3|Iris-setosa|     

In [104]:
df_features = pipelineTrained.transform(df)
train, test = df_features.randomSplit([0.7, 0.3], seed=2022)
train.show()

+-----------+----------+-----------+----------+---------------+--------+-------------+--------------------+
|sepallength|sepalwidth|petallength|petalwidth|          class|classInd|     classOne|            Features|
+-----------+----------+-----------+----------+---------------+--------+-------------+--------------------+
|        4.3|       3.0|        1.1|       0.1|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.3,3.0,1.1,0.1,...|
|        4.4|       3.2|        1.3|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.4,3.2,1.3,0.2,...|
|        4.5|       2.3|        1.3|       0.3|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.5,2.3,1.3,0.3,...|
|        4.6|       3.1|        1.5|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.6,3.1,1.5,0.2,...|
|        4.6|       3.6|        1.0|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.6,3.6,1.0,0.2,...|
|        4.7|       3.2|        1.3|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.7,3.2,1.3,0.2,...|
|        4.8|       3.1|    

In [105]:
from pyspark.ml.classification import LogisticRegression

In [108]:
lr = LogisticRegression(featuresCol = 'Features', labelCol = 'classInd')
lrModel = lr.fit(train)

In [109]:
train_res = lrModel.transform(train)
test_res = lrModel.transform(test)

In [110]:
train_res.show()

+-----------+----------+-----------+----------+---------------+--------+-------------+--------------------+--------------------+--------------------+----------+
|sepallength|sepalwidth|petallength|petalwidth|          class|classInd|     classOne|            Features|       rawPrediction|         probability|prediction|
+-----------+----------+-----------+----------+---------------+--------+-------------+--------------------+--------------------+--------------------+----------+
|        4.3|       3.0|        1.1|       0.1|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.3,3.0,1.1,0.1,...|[24.5076767416919...|[0.99999999999994...|       0.0|
|        4.4|       3.2|        1.3|       0.2|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.4,3.2,1.3,0.2,...|[25.0833460200028...|[0.99999999999997...|       0.0|
|        4.5|       2.3|        1.3|       0.3|    Iris-setosa|     0.0|(2,[0],[1.0])|[4.5,2.3,1.3,0.3,...|[15.6840751647897...|[0.99999999580516...|       0.0|
|        4.6|       3.1|        1.

In [117]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [118]:
ev = MulticlassClassificationEvaluator(labelCol='classInd')

In [119]:
ev.evaluate(train_res)

1.0

In [120]:
ev.evaluate(test_res)

1.0