In [136]:
from pyspark.sql import SparkSession

In [137]:
spark = SparkSession.builder.appName("Arvore de Decisao").getOrCreate()

In [138]:
diretorioArvore = "./Iris.csv"

In [139]:
df_iris = spark.read.format('csv').options(inferSchema=True,header='false',delimiter=',').load(diretorioArvore)

In [140]:
df_iris.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)



In [141]:
df_iris.show()

+---+-------------+------------+-------------+------------+-----------+
|_c0|          _c1|         _c2|          _c3|         _c4|        _c5|
+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-

In [142]:
df_iris = df_iris.selectExpr("_c1 as sep_len", "_c2 as sep_wid", "_c3 as pet_len", "_c4 as pet_wid", "_c5 as label")

In [143]:
df_iris.show(5)

+-------------+------------+-------------+------------+-----------+
|      sep_len|     sep_wid|      pet_len|     pet_wid|      label|
+-------------+------------+-------------+------------+-----------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [144]:
df_iris.describe(['sep_len', 'sep_wid', 'pet_len', 'pet_wid', 'label']).show() #ENCONTRANDO A ESTATÍSTICA

+-------+------------------+-------------------+------------------+------------------+-----------+
|summary|           sep_len|            sep_wid|           pet_len|           pet_wid|      label|
+-------+------------------+-------------------+------------------+------------------+-----------+
|  count|               151|                151|               151|               151|        151|
|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|       NULL|
| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|       NULL|
|    min|               4.3|                2.0|               1.0|               0.1|Iris-setosa|
|    max|     SepalLengthCm|       SepalWidthCm|     PetalLengthCm|      PetalWidthCm|    Species|
+-------+------------------+-------------------+------------------+------------------+-----------+



In [145]:
df_iris.createOrReplaceTempView("irisTable")

In [146]:
display(spark.sql('select * from irisTable'))

DataFrame[sep_len: string, sep_wid: string, pet_len: string, pet_wid: string, label: string]

CONSTRUÇÃO DA ÁRVORE DE DECISÃO

In [147]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [148]:
from pyspark.sql.functions import col


df_iris = df_iris.select(
    col("sep_len").cast("float").alias("sep_len"),
    col("sep_wid").cast("float").alias("sep_wid"),
    col("pet_len").cast("float").alias("pet_len"),
    col("pet_wid").cast("float").alias("pet_wid"),
    col("label")
)

# Verificação das conversões
df_iris.printSchema()
df_iris.show()

# Remover linhas onde qualquer coluna numérica é nula
df_iris = df_iris.dropna(subset=["sep_len", "sep_wid", "pet_len", "pet_wid"])

# Usar o VectorAssembler para criar a coluna de features
vector_assembler = VectorAssembler(inputCols=['sep_len', 'sep_wid', 'pet_len', 'pet_wid'], outputCol="features")
df_temp = vector_assembler.transform(df_iris)

# Mostrar o DataFrame resultante
df_temp.show()

root
 |-- sep_len: float (nullable = true)
 |-- sep_wid: float (nullable = true)
 |-- pet_len: float (nullable = true)
 |-- pet_wid: float (nullable = true)
 |-- label: string (nullable = true)

+-------+-------+-------+-------+-----------+
|sep_len|sep_wid|pet_len|pet_wid|      label|
+-------+-------+-------+-------+-----------+
|   NULL|   NULL|   NULL|   NULL|    Species|
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|
|    5.4|    3.7|    1.5|    0.2|Iris-setosa|
|    4.8|    3.4|    1.6|    0.2|Iris-setosa|
|    4.8|    3.0|    1.4|    0.1|Iris-setosa|
|    4.3|    3.0|    1.

In [149]:
vector_assembler = VectorAssembler(inputCols=['sep_len', 'sep_wid', 'pet_len', 'pet_wid'], outputCol="features")
df_temp = vector_assembler.transform(df_iris)
df_temp.show(5)

+-------+-------+-------+-------+-----------+--------------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|            features|
+-------+-------+-------+-------+-----------+--------------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.09999990463256...|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.90000009536743...|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.69999980926513...|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|[4.59999990463256...|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|[5.0,3.5999999046...|
+-------+-------+-------+-------+-----------+--------------------+
only showing top 5 rows



In [150]:
# removendo colunas que não serão utilizadas

df_menor = df_temp.drop('sep_len', 'sep_wid', 'pet_len', 'pet_wid')
df_menor.show(5)

+-----------+--------------------+
|      label|            features|
+-----------+--------------------+
|Iris-setosa|[5.09999990463256...|
|Iris-setosa|[4.90000009536743...|
|Iris-setosa|[4.69999980926513...|
|Iris-setosa|[4.59999990463256...|
|Iris-setosa|[5.0,3.5999999046...|
+-----------+--------------------+
only showing top 5 rows



In [151]:
from pyspark.ml.feature import StringIndexer # cria o 'vetor' para cada uma das classes existentes na coluna label

l_indexer = StringIndexer(inputCol='label', outputCol='labelIndex')
df_final = l_indexer.fit(df_menor).transform(df_menor)

In [152]:
df_final.show(5)

+-----------+--------------------+----------+
|      label|            features|labelIndex|
+-----------+--------------------+----------+
|Iris-setosa|[5.09999990463256...|       0.0|
|Iris-setosa|[4.90000009536743...|       0.0|
|Iris-setosa|[4.69999980926513...|       0.0|
|Iris-setosa|[4.59999990463256...|       0.0|
|Iris-setosa|[5.0,3.5999999046...|       0.0|
+-----------+--------------------+----------+
only showing top 5 rows



In [153]:
#dividindo entre dados de treinamento e teste

(train, test) = df_final.randomSplit([0.7,0.3])

In [154]:
test.show(5)

+-----------+--------------------+----------+
|      label|            features|labelIndex|
+-----------+--------------------+----------+
|Iris-setosa|[4.40000009536743...|       0.0|
|Iris-setosa|[4.5,2.2999999523...|       0.0|
|Iris-setosa|[4.69999980926513...|       0.0|
|Iris-setosa|[4.80000019073486...|       0.0|
|Iris-setosa|[4.80000019073486...|       0.0|
+-----------+--------------------+----------+
only showing top 5 rows



In [155]:
from pyspark.ml.classification import DecisionTreeClassifier #biblioteca para o algorítimo da árvore de decisão
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # utilizada para encontrar métodos de desempenho

In [156]:
modeloArvore = DecisionTreeClassifier(labelCol="labelIndex", featuresCol="features")
model = modeloArvore.fit(train) #aplicando o treinamento

In [157]:
predictions = model.transform(test)
predictions.select("prediction", "labelIndex").show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [158]:
avaliacao = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [159]:
acuracia = avaliacao.evaluate(predictions)
print("Acurácia do Modelo = ",(acuracia))

Acurácia do Modelo =  0.9111111111111111


APLICAÇÃO DA FLORESTA RANDÔMICA

In [160]:
from pyspark.ml.classification import RandomForestClassifier

In [161]:
modeloRF = RandomForestClassifier(labelCol="labelIndex", featuresCol="features", numTrees=10)
modelRF = modeloRF.fit(train)

In [162]:
# realizando previsão

predictions = modelRF.transform(test)
predictions.select("prediction", "labelIndex").show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [163]:
avaliacao = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
print("Acurácia do Modelo = ", (acuracia))

Acurácia do Modelo =  0.9111111111111111


In [164]:
print(modelRF)

RandomForestClassificationModel: uid=RandomForestClassifier_d305f4899d1c, numTrees=10, numClasses=3, numFeatures=4
