---
# MLlib Spark - Clusterização


#  (1) Instalação e Configuração

In [3]:
! pip install -q pyspark numpy

[K     |████████████████████████████████| 281.3 MB 43 kB/s 
[K     |████████████████████████████████| 199 kB 66.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [4]:
# Configurando as variaveis de ambiente
import os
os.environ["PYSPARK_PYTHON"] = "python3"

In [5]:
# Definindo uma sessão e contexto Spark
from pyspark.sql import SparkSession
ss = SparkSession.builder.appName("pyspark-notebook").master("local[*]").getOrCreate()
sc = ss.sparkContext

# (2) Obtenção dos Dados

In [6]:
# Instalando o módulo wget
%%capture
! pip install -q wget
!rm -rf data
!mkdir data

In [7]:
# Baixando os dados das tabelas de dimensão do data mart Exame
import wget

url = "https://raw.githubusercontent.com/apache/spark/master/data/mllib/kmeans_data.txt"
wget.download(url, "data/kmeans_data.txt")

'data/kmeans_data.txt'

In [8]:
# Carregando o arquivo de dados
from numpy import array

data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# (3) Tratamento dos Dados

In [9]:
from pyspark.mllib.clustering import KMeans

clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

In [10]:
from math import sqrt

def error(point):
	center = clusters.centers[clusters.predict(point)]
	return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))


Within Set Sum of Squared Error = 0.6928203230275529


In [11]:
from pyspark.mllib.clustering import KMeansModel

clusters.save(sc, "data/KMeansModel")
sameModel = KMeansModel.load(sc, "data/KMeansModel")

In [12]:
sc.stop()