In [None]:
#instalando o spark e obtendo o contexto
!pip install pyspark
import pyspark
from pyspark import SparkContext
spark = SparkContext(master="local[*]", appName='spark')
spark



In [None]:
# Mount Google Drive and load the kyphosis dataset stored on it.
from google.colab import drive
from pyspark.sql import SparkSession

# The same mount point is used for mounting and for building the CSV path, so
# the read no longer depends on the notebook's current working directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Reuses the running session if there is one, otherwise starts a new one.
spark = SparkSession.builder.appName('data').getOrCreate()

# header=True takes column names from the first row; inferSchema=True lets
# Spark infer each column's type from the data.
data = spark.read.csv(f'{DRIVE_MOUNT_POINT}/MyDrive/kyphosis.csv', inferSchema=True, header=True)
data.columns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['Kyphosis', 'Age', 'Number', 'Start']

In [None]:
# Inspect the data type of each column.
data.printSchema()

root
 |-- Kyphosis: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Number: integer (nullable = true)
 |-- Start: integer (nullable = true)



In [None]:
# Preview the first rows; the author flags Age values above 90 as suspicious.
# NOTE(review): the original comment reads Age as years, but values such as
# 1, 2 and 175 look like ages in months (as in the classic rpart "kyphosis"
# dataset) — confirm the unit before treating these rows as errors.
data.show()

+--------+---+------+-----+
|Kyphosis|Age|Number|Start|
+--------+---+------+-----+
|  absent| 71|     3|    5|
|  absent|158|     3|   14|
| present|128|     4|    5|
|  absent|  2|     5|    1|
|  absent|  1|     4|   15|
|  absent|  1|     2|   16|
|  absent| 61|     2|   17|
|  absent| 37|     3|   16|
|  absent|113|     2|   16|
| present| 59|     6|   12|
| present| 82|     5|   14|
|  absent|148|     3|   16|
|  absent| 18|     5|    2|
|  absent|  1|     4|   12|
|  absent|168|     3|   18|
|  absent|  1|     3|   16|
|  absent| 78|     6|   15|
|  absent|175|     5|   13|
|  absent| 80|     5|   16|
|  absent| 27|     4|    9|
+--------+---+------+-----+
only showing top 20 rows



### Tratamento dos dados

In [None]:
# Keep only the rows whose Age is below the cutoff the author considers plausible.
# NOTE(review): this assumes Age is expressed in years; if the column is in
# months, this drops valid rows — confirm the unit.
AGE_CUTOFF = 90
df = data.where(data["Age"] < AGE_CUTOFF)

In [None]:
# Preview the first rows that remain after the Age filter.
df.show()

+--------+---+------+-----+
|Kyphosis|Age|Number|Start|
+--------+---+------+-----+
|  absent| 71|     3|    5|
|  absent|  2|     5|    1|
|  absent|  1|     4|   15|
|  absent|  1|     2|   16|
|  absent| 61|     2|   17|
|  absent| 37|     3|   16|
| present| 59|     6|   12|
| present| 82|     5|   14|
|  absent| 18|     5|    2|
|  absent|  1|     4|   12|
|  absent|  1|     3|   16|
|  absent| 78|     6|   15|
|  absent| 80|     5|   16|
|  absent| 27|     4|    9|
|  absent| 22|     2|   16|
| present| 15|     7|    2|
|  absent|  9|     5|   13|
|  absent|  8|     3|    6|
|  absent|  4|     3|   16|
|  absent| 31|     3|   16|
+--------+---+------+-----+
only showing top 20 rows



In [None]:
# Prepare the data for training: one feature-vector column plus a numeric label.
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Fixed seed so the train/test split (and therefore the reported metrics)
# is the same on every run.
SPLIT_SEED = 42

# Pack the three numeric predictors into a single vector column.
assembler = VectorAssembler(
    inputCols=['Age', 'Number', 'Start'],
    outputCol="features")
output = assembler.transform(df)

# Encode the string label column as a numeric index column.
indexer = StringIndexer(inputCol="Kyphosis", outputCol="KyphosisIndex")
output_fixed = indexer.fit(output).transform(output)

# Keep only what the classifier needs and split 70% / 30%.
final_data = output_fixed.select("features", 'KyphosisIndex')
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

### Treinamento

In [None]:
# Train a random forest classifier on the training split.
# (Only a random forest is fitted here; no standalone decision tree is trained.)
from pyspark.ml.classification import RandomForestClassifier

# Fixed seed so the fitted forest is reproducible between runs.
MODEL_SEED = 42

rfc = RandomForestClassifier(labelCol='KyphosisIndex', featuresCol='features', seed=MODEL_SEED)

rfc_model = rfc.fit(train_data)

### Previsão

In [None]:
# Apply the fitted random forest to the held-out test split.
rfc_predictions = rfc_model.transform(test_data)

### Resultados

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def _build_evaluator(metric_name):
    """Return an evaluator for `metric_name` using this notebook's label and prediction columns."""
    return MulticlassClassificationEvaluator(
        labelCol="KyphosisIndex",
        predictionCol="prediction",
        metricName=metric_name,
    )


# One evaluator per metric; they differ only in the metric they compute.
acc_evaluator = _build_evaluator("accuracy")
f1score_evaluator = _build_evaluator("f1")
precision_evaluator = _build_evaluator("weightedPrecision")
recall_evaluator = _build_evaluator("weightedRecall")

# Score the random forest predictions with each evaluator.
rfc_acc = acc_evaluator.evaluate(rfc_predictions)
rfc_f1 = f1score_evaluator.evaluate(rfc_predictions)
rfc_precision = precision_evaluator.evaluate(rfc_predictions)
rfc_recall = recall_evaluator.evaluate(rfc_predictions)

In [None]:
# Report each metric on its own line, followed by an 80-character rule.
# Accuracy is shown as a percentage; the other metrics as fractions.
rule = '-'*80

print(f'Um conjunto de floresta aleatório teve uma acurácia de: {round(rfc_acc*100,2)}%', rule, sep='\n')
print(f'Um conjunto de floresta aleatório teve um f1_score de: {round(rfc_f1,2)}', rule, sep='\n')
print(f'Um conjunto de floresta aleatório teve um precisão de: {round(rfc_precision,2)}', rule, sep='\n')
print(f'Um conjunto de floresta aleatório teve um Recall de: {round(rfc_recall,2)}', rule, sep='\n')

Um conjunto de floresta aleatório teve uma acurácia de: 92.308%
--------------------------------------------------------------------------------
Um conjunto de floresta aleatório teve um f1_score de: 0.886
--------------------------------------------------------------------------------
Um conjunto de floresta aleatório teve um precisão de: 0.852
--------------------------------------------------------------------------------
Um conjunto de floresta aleatório teve um Recall de: 0.923
--------------------------------------------------------------------------------
