In [1]:
import random
import numpy

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

# constants
MIN_MEAN_VALUE = 0
MAX_MEAN_VALUE = 100
STEPS = 0.1

# methods
def point_values(means_value, normal_value, std, cluster, dimension):
    values = ""
    for d in range(dimension):
        value = means_value[d] + normal_value[d] * std
        if not values:
            values = str(value)
        else:
            values = values + "," + str(value)
    return (values + "," + str(cluster))

def write_into_csv(file_name, rdd): 
    with open(file_name,'wb') as file:
        for row in rdd.collect():
            file.write(row)
            file.write('\n')

# main code
 
# inputs
file_name = 'out' + '.csv'  # file name to be generated
points = 9 # number of points to be generated
count_cluster = 3 # number of clusters
dimension = 2 # dimension of the data
std = 1 # standard deviation
noise_points = points * 2 # number of noise points to be generated / double the number of points

sc = SparkContext("local", "generator") # spark context

# array of the clusters : clusters = [0, 1, 2]
clusters = sc.parallelize(range(0, count_cluster))

# random means of each cluster : means_cluster = [ (0, [0.6, 80.9]), (1, [57.8, 20.2]), (2, [15.6, 49.9]) ]
lst = list(numpy.arange(MIN_MEAN_VALUE, MAX_MEAN_VALUE, STEPS))
means_cluster = clusters.map(lambda cluster : (cluster, random.sample(lst, dimension)))

# creating random vector using normalVectorRDD 
random_values_vector = RandomRDDs.normalVectorRDD(sc, numRows = points, numCols = dimension, numPartitions = count_cluster, seed = 1)

# assiging a random cluster for each point
cluster_normal_values_vector = random_values_vector.map(lambda point : (random.randint(0, count_cluster - 1), point.tolist()))


In [2]:
cluster_normal_values_vector.collect()

[(1, [-0.7364417928926634, 1.1537268112805235]),
 (1, [0.46316657909752246, 1.7794324881133858]),
 (0, [0.35038251708418316, -1.2078422905234971]),
 (0, [0.9138119812917153, -0.24384354517716442]),
 (2, [-1.4870452542532624, 0.1495792190988491]),
 (0, [0.4917083148777912, 2.550920457799282]),
 (1, [0.6378880776065149, -0.7210538096192401]),
 (0, [-0.23706629033141421, -0.6287100897578689]),
 (2, [0.33124413745521336, -0.011228419005506252])]

In [3]:
points_value_vector = cluster_normal_values_vector.join(means_cluster)
points_value_vector.collect()

[(0, ([0.46316657909752246, 1.7794324881133858], [36.0, 29.5])),
 (0, ([0.9138119812917153, -0.24384354517716442], [36.0, 29.5])),
 (0, ([-1.4870452542532624, 0.1495792190988491], [36.0, 29.5])),
 (0, ([0.6378880776065149, -0.7210538096192401], [36.0, 29.5])),
 (1, ([-0.7364417928926634, 1.1537268112805235], [31.1, 61.900000000000006])),
 (1,
  ([-0.23706629033141421, -0.6287100897578689], [31.1, 61.900000000000006])),
 (2, ([0.35038251708418316, -1.2078422905234971], [48.1, 52.400000000000006])),
 (2, ([0.4917083148777912, 2.550920457799282], [48.1, 52.400000000000006])),
 (2,
  ([0.33124413745521336, -0.011228419005506252], [48.1, 52.400000000000006]))]

In [4]:
points_value_vector.map(lambda x: point_values(x[1][0], x[1][1], std, x[0], dimension))

PythonRDD[12] at RDD at PythonRDD.scala:53

In [12]:
points_value_vector.saveAsTextFile(file_name)

In [10]:
points_value_vector.collect()

[(0, ([0.46316657909752246, 1.7794324881133858], [36.0, 29.5])),
 (0, ([0.9138119812917153, -0.24384354517716442], [36.0, 29.5])),
 (0, ([-1.4870452542532624, 0.1495792190988491], [36.0, 29.5])),
 (0, ([0.6378880776065149, -0.7210538096192401], [36.0, 29.5])),
 (1, ([-0.7364417928926634, 1.1537268112805235], [31.1, 61.900000000000006])),
 (1,
  ([-0.23706629033141421, -0.6287100897578689], [31.1, 61.900000000000006])),
 (2, ([0.35038251708418316, -1.2078422905234971], [48.1, 52.400000000000006])),
 (2, ([0.4917083148777912, 2.550920457799282], [48.1, 52.400000000000006])),
 (2,
  ([0.33124413745521336, -0.011228419005506252], [48.1, 52.400000000000006]))]