In [1]:
# -*- coding: utf-8 -*-
"""
Make sure you give execute privileges
-----------------------------------------------------------------------------

           Spark with Python: Setup Spyder IDE for Spark

             Copyright : V2 Maestros @2016
                    
Execute this script once when Spyder is started on Windows
-----------------------------------------------------------------------------
"""

import os
import sys
# NOTE: machine-specific working directory; "seeds.txt" is later read relative
# to it. Adjust this path for your own environment.
os.chdir("D:/SPARK/Practice_Problems/UCI_ML/Seed_Data_Set")
# Show the directory we actually ended up in (os.curdir is only the constant '.').
os.getcwd()

# Configure the environment. Set this up to the directory where
# Spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:/Spark/spark-1.6.0-bin-hadoop2.6'

# Create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exist. The names might change
#as versions change.
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.9-src.zip"))

#Initiate Spark context. Once this is done all other applications can run
# These imports must come after the sys.path changes above, which are what
# make the pyspark package importable.
from pyspark import SparkContext
from pyspark import SparkConf

# Optionally configure Spark Settings
conf=SparkConf()
conf.set("spark.executor.memory", "12g")
conf.set("spark.cores.max", "4")

conf.setAppName("ma")
# Same master the original passed positionally to SparkContext('local', ...).
conf.setMaster("local")

## Initialize SparkContext. getOrCreate() reuses an already-running context
# instead of raising the multiple-context error, so this cell can be re-run.
# If a context already exists it is returned as-is and `conf` is not applied.
sc = SparkContext.getOrCreate(conf)


Load the data

In [3]:
# Read seeds.txt (resolved against the current working directory) as an RDD of text lines.
seeds= sc.textFile("seeds.txt")
# Peek at the first record to check the file loaded; fields are tab-separated.
seeds.first()

u'15.26\t14.84\t0.871\t5.763\t3.312\t2.221\t5.22\t1'

In [5]:
from pyspark.sql import SQLContext,Row
# Wrap the existing SparkContext; sqlContext is used further down to build a DataFrame.
sqlContext = SQLContext(sc)
import math
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

Compute the mean and standard deviation of every column

Convert each record into a dense vector

In [7]:
def transformToNumeric( inputStr) :
    """Parse one tab-separated text record into a dense vector.

    The first seven fields are converted with float() and the eighth with
    int(); all eight values are kept, in their original order. Vectors.dense
    stores everything as floating point, so the eighth value shows up as
    e.g. 1.0 in the result.

    NOTE(review): the eighth field looks like a class label rather than a
    measurement, yet it is kept in the vector and therefore takes part in
    the later scaling and clustering -- confirm that this is intended.

    :param inputStr: one line of the input file, fields separated by tabs.
    :return: DenseVector with eight entries.
    :raises IndexError: if the line has fewer than eight fields.
    :raises ValueError: if a field cannot be converted to a number.
    """
    fields = inputStr.split("\t")
    # Seven floating-point columns first ...
    numbers = [float(fields[idx]) for idx in range(7)]
    # ... followed by the integer-valued eighth column.
    numbers.append(int(fields[7]))
    return Vectors.dense(numbers)

# Parse every text line into a DenseVector (a lazy transformation; nothing runs yet).
seedsVector = seeds.map(transformToNumeric)
#seedsVector.persist()
# collect() is an action: it brings every parsed record back to the driver for display.
seedsVector.collect()


[DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22, 1.0]),
 DenseVector([14.88, 14.57, 0.8811, 5.554, 3.333, 1.018, 4.956, 1.0]),
 DenseVector([14.29, 14.09, 0.905, 5.291, 3.337, 2.699, 4.825, 1.0]),
 DenseVector([13.84, 13.94, 0.8955, 5.324, 3.379, 2.259, 4.805, 1.0]),
 DenseVector([16.14, 14.99, 0.9034, 5.658, 3.562, 1.355, 5.175, 1.0]),
 DenseVector([14.38, 14.21, 0.8951, 5.386, 3.312, 2.462, 4.956, 1.0]),
 DenseVector([14.69, 14.49, 0.8799, 5.563, 3.259, 3.586, 5.219, 1.0]),
 DenseVector([14.11, 14.1, 0.8911, 5.42, 3.302, 2.7, 5.0, 1.0]),
 DenseVector([16.63, 15.46, 0.8747, 6.053, 3.465, 2.04, 5.877, 1.0]),
 DenseVector([16.44, 15.25, 0.888, 5.884, 3.505, 1.969, 5.533, 1.0]),
 DenseVector([15.26, 14.85, 0.8696, 5.714, 3.242, 4.543, 5.314, 1.0]),
 DenseVector([14.03, 14.16, 0.8796, 5.438, 3.201, 1.717, 5.001, 1.0]),
 DenseVector([13.89, 14.02, 0.888, 5.439, 3.199, 3.986, 4.738, 1.0]),
 DenseVector([13.78, 14.06, 0.8759, 5.479, 3.156, 3.136, 4.872, 1.0]),
 DenseVector([13.7

In [8]:
# Column-wise summary statistics over all parsed vectors.
seedsStats=Statistics.colStats(seedsVector)
colMeans=seedsStats.mean()
colVariance=seedsStats.variance()
# Build a concrete list rather than calling map(): on Python 3 map() returns a
# lazy one-shot iterator, which can be neither broadcast nor indexed later on.
# On Python 2 the result is the same list as before.
colStdDev=[math.sqrt(x) for x in colVariance]


Store the means and standard deviations as broadcast variables

In [10]:
# Broadcast the per-column statistics so that functions mapped over the RDD
# can read them through the broadcast handle's .value attribute.
bcMeans=sc.broadcast(colMeans)
bcStdDev=sc.broadcast(colStdDev)


In [11]:
def centerAndScale(inVector) :
    """Standardize one record: subtract the column mean, divide by the column std.

    The per-column means and standard deviations are read from the broadcast
    variables bcMeans and bcStdDev (position i of each belongs to column i).

    :param inVector: vector exposing toArray(), one entry per column.
    :return: DenseVector of the same length holding the standardized values.
             A column whose standard deviation is zero (a constant column)
             yields 0.0 instead of a NaN/inf from dividing by zero.
    """
    # The broadcasts are only read here, so no `global` declaration is needed.
    meanArray=bcMeans.value
    stdArray=bcStdDev.value

    valueArray=inVector.toArray()
    retArray=[]
    for i in range(valueArray.size):
        std = stdArray[i]
        if std == 0:
            # Constant column: every value equals the mean, so the centered
            # value is 0 and there is nothing to scale.
            retArray.append(0.0)
        else:
            retArray.append( (valueArray[i] - meanArray[i]) / std )
    return Vectors.dense(retArray)
    
# Standardize every record (lazy), then collect the scaled vectors to the driver for display.
csSeeds = seedsVector.map(centerAndScale)
csSeeds.collect()


[DenseVector([0.1418, 0.2149, 0.0001, 0.3035, 0.1414, -0.9838, -0.3827, -1.2218]),
 DenseVector([0.0112, 0.0082, 0.4275, -0.1682, 0.197, -1.7839, -0.9198, -1.2218]),
 DenseVector([-0.1916, -0.3593, 1.4389, -0.7618, 0.2076, -0.6659, -1.1864, -1.2218]),
 DenseVector([-0.3463, -0.4742, 1.0369, -0.6873, 0.3187, -0.9585, -1.2271, -1.2218]),
 DenseVector([0.4442, 0.3298, 1.3712, 0.0665, 0.8032, -1.5598, -0.4742, -1.2218]),
 DenseVector([-0.1607, -0.2675, 1.02, -0.5474, 0.1414, -0.8235, -0.9198, -1.2218]),
 DenseVector([-0.0541, -0.0531, 0.3767, -0.1479, 0.001, -0.076, -0.3847, -1.2218]),
 DenseVector([-0.2535, -0.3517, 0.8507, -0.4707, 0.1149, -0.6652, -0.8303, -1.2218]),
 DenseVector([0.6126, 0.6897, 0.1566, 0.958, 0.5464, -1.1042, 0.9541, -1.2218]),
 DenseVector([0.5473, 0.5289, 0.7195, 0.5766, 0.6523, -1.1514, 0.2542, -1.2218]),
 DenseVector([0.1418, 0.2226, -0.0592, 0.1929, -0.044, 0.5605, -0.1914, -1.2218]),
 DenseVector([-0.281, -0.3057, 0.364, -0.43, -0.1525, -1.319, -0.8283, -1.2218]

Create a Spark DataFrame

In [13]:
# Wrap each scaled vector in a Row with a single column named "features".
seedsRows=csSeeds.map( lambda f:Row(features=f))
seedsDf = sqlContext.createDataFrame(seedsRows)
# Display the first 10 rows of the "features" column.
seedsDf.select("features").show(10)

+--------------------+
|            features|
+--------------------+
|[0.14175903742014...|
|[0.01116135575161...|
|[-0.1916087289442...|
|[-0.3462638782885...|
|[0.44419577391567...|
|[-0.1606776990753...|
|[-0.0541374850826...|
|[-0.2534707886819...|
|[0.61259804764614...|
|[0.54729920681188...|
+--------------------+
only showing top 10 rows



In [14]:
from pyspark.ml.clustering import KMeans
# Three clusters; the fixed seed keeps the initialization repeatable between runs.
kmeans = KMeans(k=3, seed=1)
# Presumably reads the "features" column by default -- verify against the KMeans docs.
model = kmeans.fit(seedsDf)
# transform() returns the input rows with an added "prediction" column (the cluster index).
predictions = model.transform(seedsDf)
# Bring every row back to the driver for display.
predictions.collect()


[Row(features=DenseVector([0.1418, 0.2149, 0.0001, 0.3035, 0.1414, -0.9838, -0.3827, -1.2218]), prediction=0),
 Row(features=DenseVector([0.0112, 0.0082, 0.4275, -0.1682, 0.197, -1.7839, -0.9198, -1.2218]), prediction=0),
 Row(features=DenseVector([-0.1916, -0.3593, 1.4389, -0.7618, 0.2076, -0.6659, -1.1864, -1.2218]), prediction=0),
 Row(features=DenseVector([-0.3463, -0.4742, 1.0369, -0.6873, 0.3187, -0.9585, -1.2271, -1.2218]), prediction=0),
 Row(features=DenseVector([0.4442, 0.3298, 1.3712, 0.0665, 0.8032, -1.5598, -0.4742, -1.2218]), prediction=0),
 Row(features=DenseVector([-0.1607, -0.2675, 1.02, -0.5474, 0.1414, -0.8235, -0.9198, -1.2218]), prediction=0),
 Row(features=DenseVector([-0.0541, -0.0531, 0.3767, -0.1479, 0.001, -0.076, -0.3847, -1.2218]), prediction=0),
 Row(features=DenseVector([-0.2535, -0.3517, 0.8507, -0.4707, 0.1149, -0.6652, -0.8303, -1.2218]), prediction=0),
 Row(features=DenseVector([0.6126, 0.6897, 0.1566, 0.958, 0.5464, -1.1042, 0.9541, -1.2218]), predict

Plot the results in a scatter plot

In [15]:
import pandas as pd

def unstripData(instr) :
    """Flatten one prediction row into a plain 5-tuple.

    :param instr: mapping-like row with a "prediction" entry and an
                  indexable "features" entry holding at least four values.
    :return: (prediction, features[0], features[1], features[2], features[3])
    """
    cluster = instr["prediction"]
    feats = instr["features"]
    # Index element by element; only the first four features are kept.
    return (cluster,) + tuple(feats[pos] for pos in range(4))
    
# Go through the DataFrame's underlying RDD: DataFrame.map() only exists in
# Spark 1.x, while .rdd.map() behaves the same there and still works on 2.x.
unstripped=predictions.rdd.map(unstripData)
predList=unstripped.collect()
# Columns of predPd: 0 = predicted cluster, 1..4 = features[0]..features[3].
predPd = pd.DataFrame(predList)

import matplotlib.pylab as plt
# Clear the current axes first so points from earlier runs do not pile up.
plt.cla()
# Label the axes after cla() (which would wipe them) and before the scatter,
# so the scatter call stays the last expression of the cell.
plt.xlabel("features[2] (centered and scaled)")
plt.ylabel("features[3] (centered and scaled)")
# Colour each point by its predicted cluster.
plt.scatter(predPd[3],predPd[4], c=predPd[0])


<matplotlib.collections.PathCollection at 0xdb3f9b0>