# Construcción de Tipologías: Clústeres

In [1]:
# Manual SparkContext / SQLContext creation, left disabled.
# NOTE(review): the next cell uses `sqlContext` without defining it, so it is
# presumably supplied by the notebook kernel -- confirm before enabling these.
#from pyspark import SparkContext
#sc = SparkContext()
#from pyspark.sql import SQLContext
#sqlContext=SQLContext(sc)

In [2]:
# Read the bd5 CSV through the spark-csv data source, with the "header" and
# "inferSchema" reader options enabled.
bd5 = sqlContext.read.load(
    "file:/home/cloudera/Documents/Ficheros de trabajo/bd5.csv",
    format="com.databricks.spark.csv",
    header="true",
    inferSchema=True
)
# Register the DataFrame under the table name "bd5" so it can be queried via SQL.
sqlContext.registerDataFrameAsTable(bd5, "bd5")

## Reducción de dimensionalidad: PCA

In [3]:
from pyspark.ml.feature import VectorAssembler

# Columns of bd5 that are packed into a single vector column.
feature_cols = ['DepDelay', 'Distance', 'DayOfWeek',
                'CRSDepTime', 'Horario', 'LogD']

# Assemble the selected columns into one 'features' vector per row.
a1 = VectorAssembler(outputCol='features', inputCols=feature_cols)
bd6 = a1.transform(bd5)

In [None]:
from pyspark.ml.feature import PCA, StandardScaler

# Standardize the assembled 'features' vector (centering and unit-std scaling
# both enabled) into 'scaledFeatures'.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures")
scalerModel = scaler.fit(bd6)
bd6std = scalerModel.transform(bd6)

# Reduce the standardized vector to k=2 principal components, stored in
# 'pca_scaledfeatures'.
pca2 = PCA(inputCol='scaledFeatures', outputCol='pca_scaledfeatures', k=2)
model2 = pca2.fit(bd6std)
bd6pca2 = model2.transform(bd6std)


## Clústeres - K-Means

In [None]:
from pyspark.ml.clustering import KMeans

# K-Means with 4 clusters, fitted on the 2-component PCA vector; the seed is
# fixed and the number of iterations is capped at 10.
kmeans = KMeans(
    featuresCol="pca_scaledfeatures",
    predictionCol="Cluster",
    k=4,
    maxIter=10,
    seed=123)
model3 = kmeans.fit(bd6pca2)

# Add the 'Cluster' prediction column to every row.
bd6Kmeans = model3.transform(bd6pca2)

In [None]:
# Preview the PCA vector next to the cluster assigned to each row.
bd6Kmeans.select(['pca_scaledfeatures', 'Cluster']).show()

In [None]:
# Number of rows assigned to each cluster.
cluster_sizes = bd6Kmeans.groupBy('Cluster').count()
cluster_sizes.show()

## Caracterización de los Clústeres

In [None]:
# Cluster centers of the fitted K-Means model. The model was trained on the
# 'pca_scaledfeatures' column, so the centers live in that feature space
# rather than in the original column units.
centers = model3.clusterCenters()
# Bare expression so the notebook displays the value.
centers

In [None]:
# Register the clustered DataFrame as table "bd6Kmeans" so it can be queried
# with SQL.
sqlContext.registerDataFrameAsTable(bd6Kmeans, "bd6Kmeans")

# Profile each cluster: row count plus the mean of every input column that
# went into the feature vector. The query is a single string literal joined
# with backslash line continuations.
g3 = sqlContext.sql("select Cluster, count(*) as n, \
               avg(DepDelay) as DepDelay, \
               avg(Distance) as Distance, \
               avg(DayOfWeek) as DayOfWeek, \
               avg(CRSDepTime) as CRSDepTime, \
               avg(Horario) as Horario, \
               avg(LogD) as LogD \
               from bd6Kmeans group by Cluster")
# Collect the per-cluster summary to the driver as a pandas DataFrame for display.
g3.toPandas()

## Representación Gráfica

In [None]:
# Extract the two PCA components into separate scalar columns.

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType


def _component_udf(index):
    """Return a UDF that reads element `index` of a vector as a float."""
    return udf(lambda v: float(v[index]), FloatType())


p1 = _component_udf(0)
p2 = _component_udf(1)

bd6Kmeans = bd6Kmeans.withColumn('pca1', p1('pca_scaledfeatures'))
bd6Kmeans = bd6Kmeans.withColumn('pca2', p2('pca_scaledfeatures'))

# Take a 10% sample without replacement (seed 0) and bring only the plotting
# columns to the driver as a pandas DataFrame.
sampled = bd6Kmeans.sample(withReplacement=False, fraction=0.1, seed=0)
pdf6 = sampled.select('pca1', 'pca2', 'Cluster').toPandas()
pdf6

In [None]:
import seaborn as sns
%matplotlib inline

sns.lmplot(x="pca1", y="pca2", hue="Cluster", fit_reg=False, data=pdf6)