In [2]:
import os

# Tell PySpark where the JDK and the Spark distribution live.
# NOTE(review): these absolute paths are specific to one workstation; adjust
# them when running the notebook on another machine.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64",
    SPARK_HOME="/home/hu/spark-3.2.2-bin-hadoop3.2",
)

# Locate and initialize the Spark installation
import findspark

findspark.init()
# Last expression of the cell, so the notebook displays its return value.
findspark.find()

'/home/hu/spark-3.2.2-bin-hadoop3.2'

In [31]:
from pyspark.sql import SparkSession

# Get the active Spark session, or start one for an application named 'iris'.
# NOTE: despite its name, df_ml holds the SparkSession, not a DataFrame.
df_ml = (
    SparkSession.builder
    .appName('iris')
    .getOrCreate()
)

In [4]:
# Load the iris CSV: the first row supplies the column names and the column
# types are inferred from the data.
dataset = (
    df_ml.read
    .options(header=True, inferSchema=True)
    .csv('iris.csv')
)
# Preview the loaded rows.
dataset.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
|         5.4|        3.7|         1.5|        0.2| Setosa|
|         4.8|        3.4|         1.6|        0.2| Setosa|
|         4.8|        3.0|         1.4|        0.1| Setosa|
|         4.3|        3.0|         1.1| 

To work with regression models, we first have to use *VectorAssembler* to combine the independent variables into a single vector column that contains all of them.  

Vectorize all numerical columns into a single feature column.

In [24]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#import sys

In [6]:
# Use every column except the 'variety' label as a model feature. Selecting by
# name (instead of "all but the last column") keeps the feature set correct
# even if the column order of the CSV changes.
feature_cols = [col_name for col_name in dataset.columns if col_name != 'variety']

# Assembler that packs the feature columns into one vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

The assembled feature vector is then added to the already-loaded dataset as a new column by calling the assembler's *transform()* method.

In [9]:
# Append the assembled 'features' vector column to the loaded dataset.
dataset_feature = assembler.transform(dataset)
# Preview the first 20 rows (the default row count of show()).
dataset_feature.show(n=20)

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| Setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| Setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| Setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2| Setosa|[4.4,2.9,1.4,0.2]|
|         4.9|        3.1|         1.5|        0.1| Setosa|[4.9,

In [13]:
# Keep only the assembled feature vector and the text class name.
dataset_feature_label = dataset_feature.select('features', 'variety')

# Fit a StringIndexer that maps each 'variety' string to a numeric 'label' index.
label_indexer = StringIndexer(
    inputCol='variety',
    outputCol='label',
).fit(dataset_feature_label)

# Apply the fitted indexer and retain just the feature vector and the indexed label.
dataset_feature_label_indexed = (
    label_indexer
    .transform(dataset_feature_label)
    .select('features', 'label')
)
dataset_feature_label_indexed.show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[4.8,3.4,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
|[4.3,3.0,1.1,0.1]|  0.0|
|[5.8,4.0,1.2,0.2]|  0.0|
|[5.7,4.4,1.5,0.4]|  0.0|
|[5.4,3.9,1.3,0.4]|  0.0|
|[5.1,3.5,1.4,0.3]|  0.0|
|[5.7,3.8,1.7,0.3]|  0.0|
|[5.1,3.8,1.5,0.3]|  0.0|
+-----------------+-----+
only showing top 20 rows



In [17]:
# Regularization strength handed to the classifier as its regParam.
reg = 1e-2

In [27]:
# Split the indexed data 75% / 25% into training and test sets. The fixed seed
# makes the random split repeatable on the same input, so the fitted model and
# the accuracy reported further down do not change from one run to the next.
train, test = dataset_feature_label_indexed.randomSplit([0.75, 0.25], seed=42)

# Train a logistic regression classifier on the training set, using the
# regularization strength stored in `reg`.
lr = LogisticRegression(regParam=reg)
model = lr.fit(train)

In [28]:
# Run the trained model over the held-out test rows; the result includes the
# rawPrediction, probability and prediction columns next to features and label.
prediction = model.transform(test)
# Preview the first 10 scored rows.
prediction.show(n=10)

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,2.9,1.4,0.2]|  0.0|[5.05597808989342...|[0.93849285571650...|       0.0|
|[4.5,2.3,1.3,0.3]|  0.0|[3.22953969530902...|[0.56437309512304...|       0.0|
|[4.6,3.1,1.5,0.2]|  0.0|[5.28720111259621...|[0.95687353275535...|       0.0|
|[4.8,3.4,1.6,0.2]|  0.0|[5.78747725440852...|[0.97923989163262...|       0.0|
|[4.9,2.4,3.3,1.0]|  1.0|[-0.0176970272226...|[0.10592684155518...|       1.0|
|[4.9,2.5,4.5,1.7]|  2.0|[-2.1265827070535...|[0.02005684071172...|       1.0|
|[4.9,3.0,1.4,0.2]|  0.0|[4.77640009174079...|[0.91047280060037...|       0.0|
|[5.0,3.3,1.4,0.2]|  0.0|[5.47383322561692...|[0.96448747949404...|       0.0|
|[5.0,3.4,1.6,0.4]|  0.0|[5.1883763573312,...|[0.96567480617597...|       0.0|
|[5.1,3.5,1.4,0.2]|  0.0|[5.90221324038354...|[0.980

In [30]:
# Score the test-set predictions using the 'accuracy' metric.
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
accuracy = evaluator.evaluate(prediction)
# Report the accuracy as a percentage.
print(f'Prediction Accuracy is {accuracy * 100}%')

Prediction Accuracy is 97.61904761904762%
