In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
import random
import math
import sys
import numpy as np 

In [2]:
# Create (or reuse) the SparkSession for this notebook: local master
# "local[6]", with the executor-memory setting at 4 GB.
spark = (
    SparkSession.builder
    .master("local[6]")
    .appName("Data exploration URL - KNN")
    .config("spark.executor.memory", "4gb")
    .getOrCreate()
)

# Underlying SparkContext, used later for RDD-level operations.
sc = spark.sparkContext

In [3]:
# List every (key, value) pair of the active Spark configuration.
# Uses the public getConf() accessor instead of the private `_conf` attribute.
sc.getConf().getAll()

[('spark.executor.memory', '4gb'),
 ('spark.app.name', 'Data exploration URL - KNN'),
 ('spark.app.startTime', '1617039572649'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1617039574134'),
 ('spark.master', 'local[6]'),
 ('spark.driver.port', '39591'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.sql.warehouse.dir',
  'file:/home/jsarabia/Documents/IA/Data-exploration-url_svmlight/code/spark-warehouse'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.host', 'fedora')]

In [4]:
# Display the active SparkContext as the cell output.
sc

In [66]:
# Load the full data set from a libsvm-format file.
data = spark.read.format("libsvm")\
    .load("/data/url_svmlight/Dimension_5_x_76.svm")

# Split the data into train (70%) and test (30%).
# The split has always been driven by the fixed seed 1234; the previous cell
# also drew a random value into `seed` that was never passed to randomSplit().
# The fixed value is now bound to `seed` so the name reflects what is used and
# the split stays reproducible across runs.
seed = 1234
splits = data.randomSplit([0.7, 0.3], seed)

train = splits[0]
test = splits[1]

In [278]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature vectors of two rows.

    Both arguments are indexed at position 1 to obtain their feature vector,
    which matches a Spark ``Row`` whose second field holds the features.

    Args:
        row1: Row whose element at index 1 is a sized, indexable feature
            vector; its length decides how many columns are compared.
        row2: Row with the same layout; its feature vector is read at every
            index of ``row1``'s feature vector.

    Returns:
        float: Square root of the sum of the squared per-column differences.
    """
    features_a, features_b = row1[1], row2[1]
    squared_sum = 0.0
    # Explicit accumulation keeps the additions in column order.
    for idx in range(len(features_a)):
        delta = features_a[idx] - features_b[idx]
        squared_sum += delta ** 2
    return math.sqrt(squared_sum)

In [279]:
# Smoke test for euclidean_distance(): print the distance from the first row
# of the full data set to every row, itself included.
length = data.count()               # total number of rows in the data set
# Fetch the rows once. Calling data.head(row) inside the loop launched a new
# Spark job on every iteration and re-fetched all preceding rows each time.
rows = data.head(length)
for row, current_row in enumerate(rows, start=1):
    distance = euclidean_distance(rows[0], current_row)
    print("Dinstancia del renglon 1 con el", row, ":", distance)

Dinstancia del renglon 1 con el 1 : 0.0
Dinstancia del renglon 1 con el 2 : 3.1564148069783875
Dinstancia del renglon 1 con el 3 : 2.0356611659021397
Dinstancia del renglon 1 con el 4 : 1.4404646590152195
Dinstancia del renglon 1 con el 5 : 1.7914667218708056
Dinstancia del renglon 1 con el 6 : 2.8385541305034065
Dinstancia del renglon 1 con el 7 : 2.4695692287281386
Dinstancia del renglon 1 con el 8 : 3.1564148069885256
Dinstancia del renglon 1 con el 9 : 3.310850006525469
Dinstancia del renglon 1 con el 10 : 3.8249485941866586


In [298]:
def get_neighbors(train, test_row, k):
    """Return the (up to) k training rows closest to ``test_row``.

    Every row of ``train`` is compared with ``test_row`` through the
    two-argument ``euclidean_distance(row1, row2)``. Rows at distance exactly
    0.0 are skipped. The remaining rows are sorted by ascending distance and
    the first ``k`` are returned.

    Args:
        train (pyspark.sql.dataframe.DataFrame): Training data; its rows are
            collected to the driver.
        test_row (pyspark.sql.types.Row): The instance whose neighbours are
            wanted.
        k (int): Number of neighbours requested.

    Returns:
        list: The nearest rows, closest first. Holds fewer than ``k`` rows
        when fewer candidates exist, and is empty when ``k`` is not positive.
    """
    # Materialise the rows once. The previous version called train.head(i)
    # (twice) for every index i, i.e. one or two Spark jobs per row, each
    # re-fetching all earlier rows.
    train_rows = train.collect()

    distances = []
    for train_row in train_rows:
        distance = euclidean_distance(test_row, train_row)
        # A zero distance is treated as "the same point" and is not a neighbour.
        if distance != 0.0:
            distances.append((train_row, distance))

    # list.sort is stable, so rows at equal distance keep their data-set order.
    distances.sort(key=lambda pair: pair[1])

    # Slicing (instead of indexing 0..k-1) avoids an IndexError when fewer
    # than k candidates remain.
    return [row for row, _ in distances[:max(k, 0)]]

In [300]:
# Find the single nearest neighbour of the first row of the full data set
# and print it.
length = 1 + data.count()
neighbors = get_neighbors(train=data, test_row=data.head(1)[-1], k=1)
for neighbor in neighbors:
    print(neighbor)

Row(label=0.0, features=SparseVector(76, {3: 0.0539, 4: 0.0828, 5: 0.1176, 10: 0.2857, 15: 0.2, 16: 0.6557, 17: 0.7074, 18: 0.2835, 20: 0.2857, 21: 0.006, 23: 1.0, 27: 1.0, 35: 1.0, 43: 1.0, 53: 1.0, 55: 1.0, 61: 1.0, 63: 1.0, 65: 1.0, 67: 1.0, 69: 1.0, 71: 1.0, 73: 1.0, 75: 1.0}))


In [24]:
# Check get_neighbors() on a small in-memory data set: each row is
# [x, y, class].
# NOTE(review): get_neighbors() as defined in this notebook calls DataFrame
# methods on `train`, so this list-based check appears to date from an
# earlier, list-based implementation - TODO confirm or update.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
neighbors = get_neighbors(train=dataset, test_row=dataset[0], k=3)
for neighbor in neighbors:
    print(neighbor)

[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]
[1.38807019, 1.850220317, 0]


In [27]:
def predict_classification(train, test_row, k):
    """Predict the class of ``test_row`` by majority vote of its neighbours.

    The ``k`` nearest neighbours are obtained from ``get_neighbors`` and the
    most frequent class label among them is returned.

    Args:
        train: Training data, passed through to ``get_neighbors``.
        test_row: The instance to classify, passed through to
            ``get_neighbors``.
        k (int): Number of neighbours that vote.

    Returns:
        The label that occurs most often among the neighbours.

    Raises:
        ValueError: If ``get_neighbors`` returns no neighbours (``max`` of an
            empty set).
    """
    neighbors = get_neighbors(train, test_row, k=k)
    output_values = [_neighbor_label(row) for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction


def _neighbor_label(row):
    """Return the class label of ``row``.

    Rows exposing a ``label`` attribute (such as the Spark rows printed
    elsewhere in this notebook, ``Row(label=..., features=...)``) yield that
    attribute; for those rows the last element is the feature vector, not the
    class. Any other row is assumed to keep its class in the last position.
    """
    try:
        return row.label
    except AttributeError:
        return row[-1]

In [28]:
# Check predict_classification() on a small in-memory data set: each row is
# [x, y, class]; the prediction for row 6 is compared with its own class.
# NOTE(review): get_neighbors() as defined in this notebook calls DataFrame
# methods on `train`, so this list-based check appears to date from an
# earlier, list-based implementation - TODO confirm or update.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
prediction = predict_classification(train=dataset, test_row=dataset[6], k=1)
print('Expected %d, Got %d.' % (dataset[6][-1], prediction))

Expected 1, Got 1.


# Segmento de Prueba 

In [5]:
# RDD views of the train and test DataFrames, kept for later processing.
rdd_train, rdd_test = train.rdd, test.rdd

In [17]:
# Driver-side NumPy copies of the training set: the first array holds the
# training instances (features) and the second one holds their labels.
# These definitions were commented out, yet the one-argument
# euclidean_distance(instance) defined later in this notebook reads both
# names and nothing else defines them, so a fresh-kernel run failed there.
# NOTE(review): both calls collect the whole training set to the driver -
# confirm this is acceptable for the data size in use.
train_array = np.array(train.select('features').collect(), dtype=float)
train_array_labels = np.array(train.select('label').collect(), dtype=float)

In [322]:
# Labels of the test-set instances, collected to the driver as a NumPy float
# array (one collected row per instance; the label is read later as [i][0]).
test_array_labels = np.array(test.select('label').collect(), dtype=float)

In [323]:
# Report how many rows each RDD holds.
print('RDD de entrenamiento: %d' % rdd_train.count())
print('RDD de test: %d' % rdd_test.count())

RDD de entrenamiento: 131
RDD de test: 69


In [325]:
def save_file(data, path='../data/url_svmlight/Distancia_euclideana_100_x_500000.svm'):
    """Append ``data`` to a text file.

    Args:
        data (str): Text to append; it is written as-is, with no separator or
            newline added.
        path (str): Destination file. Defaults to the .svm output file this
            notebook has always written to, so existing one-argument calls
            behave as before.

    Returns:
        None
    """
    # The context manager closes the handle even if write() raises; the
    # previous open()/close() pair leaked it in that case.
    with open(path, 'a') as file:
        file.write(data)

In [326]:
"""
[summary:
    Método para calcular la distancia euclídea entre cada una de las 
    columnas del conjunto de test respecto a las columnas del conjunto
    de entrenamiento.
]

Args:
    instance ([pyspark.sql.types.Row]): [
        Recibe cada una de las instancias que hay en el dataset
    ]
"""
def euclidean_distance(instance):
    """Append the per-column distances between ``instance`` and every training row.

    For each row of the global ``train_array`` one text line is built: the
    row's label (from the global ``train_array_labels``) followed by, for each
    feature column, the absolute difference between the training value and
    ``instance.features`` at that column. Fields are separated by single
    spaces and each line ends with a space and a newline. All lines are then
    appended to the output file with a single ``save_file`` call.

    NOTE: defining this function rebinds the name ``euclidean_distance``,
    which earlier in this notebook refers to a two-argument function used by
    ``get_neighbors``; after this cell runs, that earlier definition is no
    longer reachable under this name.

    Args:
        instance (pyspark.sql.types.Row): One data-set instance; it must
            expose ``features`` and its element at index 1 gives the number
            of feature columns.

    Returns:
        None
    """
    n_columns = len(instance[1])
    lines = []
    for row in range(len(train_array)):
        fields = [str(train_array_labels[row][0])]
        for column in range(n_columns):
            # |a - b| is the one-dimensional Euclidean distance. The previous
            # sqrt(pow(a - b, 2)) computed the same quantity but could
            # underflow to 0 or overflow for extreme differences.
            distance = math.fabs(train_array[row][0][column] - instance.features[column])
            fields.append(str(distance))
        # Same layout as before: every field is followed by one space.
        lines.append(' '.join(fields) + ' \n')
    # Joining once avoids rebuilding a growing string on every column.
    save_file(''.join(lines))

In [None]:
# Run euclidean_distance() once for every row of the test DataFrame; the
# function is called for its side effect (appending to the output file).
test.foreach(euclidean_distance)

In [290]:
# Load the three sample files as RDDs of text lines.
rdd_samp1, rdd_samp2, rdd_samp3 = map(
    sc.textFile,
    (
        '../data/url_svmlight/arch_prb.svm',
        '../data/url_svmlight/arch_prb1.svm',
        '../data/url_svmlight/arch_prb2.svm',
    ),
)

In [299]:
# Take the five smallest elements of each sample RDD under the default ordering.
# NOTE(review): these RDDs come from sc.textFile(), so the elements are text
# lines and the ordering compares strings, not numeric distances - confirm
# this is what "nearest" is meant to be.
five_nearest1, five_nearest2, five_nearest3 = (
    sample.takeOrdered(5) for sample in (rdd_samp1, rdd_samp2, rdd_samp3)
)

In [311]:
def class_average(five_nearest, k=5):
    """Classify by averaging the labels of the nearest neighbours.

    The label of each entry is taken from ``entry[0]`` converted with
    ``float``. The labels of the first ``k`` entries are averaged and the
    predicted class is 1 when the mean is above 0.5, otherwise 0. The
    predicted class is also printed.

    Args:
        five_nearest (list): Neighbour entries, nearest first. Only the first
            ``k`` are used.
        k (int): Maximum number of neighbours to average. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        int: 1 or 0.

    Raises:
        ValueError: If there is no neighbour to average.
    """
    # Average over the entries actually present; the previous fixed
    # range(5) raised IndexError for lists shorter than five.
    nearest = five_nearest[:k]
    if not nearest:
        raise ValueError('class_average() needs at least one neighbour')
    mean = 0
    for entry in nearest:
        mean += float(entry[0])
    mean = mean / len(nearest)
    if mean > 0.5:
        print('Clase K-NN: 1')
        return 1
    print('Clase K-NN: 0')
    return 0

In [317]:
def accuracy():
    """Print the predicted and real class per test instance, then the accuracy.

    Reads the globals ``five_nearest1``, ``five_nearest2``, ``five_nearest3``
    and ``test_array_labels``. Each neighbour list is paired, in order, with
    the test label at the same position. ``class_average`` gives the predicted
    class, which is compared with the real label (element 0 of the label row).

    Returns:
        None. Results are printed only.

    Raises:
        ZeroDivisionError: If there is nothing to compare.
    """
    lista = [five_nearest1, five_nearest2, five_nearest3]
    # zip() stops at the shorter sequence. The previous range(len(labels))
    # loop indexed `lista` past its three entries (IndexError) whenever there
    # were more than three test labels.
    pairs = list(zip(test_array_labels, lista))
    hits = 0.0
    for label_row, nearest in pairs:
        if label_row[0] == class_average(nearest):
            hits += 1
        print('Clase Real: ' + str(label_row[0]))
        print('\n')
    # Accuracy is taken over the instances that were actually compared.
    print('Accuracy: ' + str(hits / len(pairs)))

In [318]:
# Run the evaluation; it prints the predicted and real class per instance
# followed by the overall accuracy.
accuracy()

Clase K-NN: 1
Clase Real: 0.0


Clase K-NN: 1
Clase Real: 0.0


Clase K-NN: 1
Clase Real: 0.0


Accuracy: 0.0


In [300]:
# Collect the three neighbour lists into a single list for inspection.
lista = [five_nearest1, five_nearest2, five_nearest3]

In [314]:
# listaclass_average

In [215]:
# five_nearest[0][0] # Clase

In [53]:
# rdd_prueba2 = sc.textFile('../data/url_svmlight/Distancia_euclideana_5_x_76.svm')

In [238]:
# Take the five smallest elements of `rdd_prueba` and show the result type.
# NOTE(review): `rdd_prueba` is not defined in any visible cell, so this cell
# presumably relied on leftover kernel state; it also overwrites the
# `five_nearest2` built from rdd_samp2 - TODO confirm or remove.
five_nearest2 = rdd_prueba.takeOrdered(5)
type(five_nearest2)

list

In [224]:
# First element of the first entry (for a text line, its first character).
five_nearest2[0][0]

'0'

In [1]:
pwd

'/home/jsarabia/Documents/IA/Data-exploration-url_svmlight/code'