In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
import random
import math
import sys
import numpy as np

In [2]:
# Create (or reuse) the notebook's SparkSession and keep a handle to its SparkContext.
spark = (
    SparkSession.builder
    .master("local[6]")
    .appName("Data exploration URL - KNN")
    .config("spark.executor.memory", "4gb")
    .getOrCreate()
)

sc = spark.sparkContext

In [3]:
# Show the Spark configuration as (key, value) pairs through the public
# SparkContext.getConf() accessor instead of the private `_conf` attribute.
sc.getConf().getAll()

[('spark.executor.memory', '4gb'),
 ('spark.app.name', 'Data exploration URL - KNN'),
 ('spark.driver.port', '33701'),
 ('spark.driver.memory', '4g'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'local-1615669488594'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.master', 'local[6]'),
 ('spark.driver.host', 'fedora')]

In [4]:
# Load the dataset (libsvm format).
data = (
    spark.read.format("libsvm")
    .load("../data/url_svmlight/Dimension_100_x_500000.svm")
)
# Split the data into train (70%) and test (30%).
# The seed is fixed so the split is reproducible across runs; it is bound to a
# name and actually passed to randomSplit, so there is a single source of truth.
seed = 1234
splits = data.randomSplit([0.7, 0.3], seed)

train = splits[0]
test = splits[1]

In [5]:
# Keep the underlying RDD of each split for later processing.
rdd_train, rdd_test = train.rdd, test.rdd

In [6]:
# Collect the training set to the driver as two NumPy arrays: the first holds the
# feature vectors (euclidean_distance indexes it as train_array[row][0][column]),
# the second holds the matching labels (indexed as train_array_labels[row][0]).
train_array = np.array(train.select('features').collect(), dtype=float)
train_array_labels = np.array(train.select('label').collect(), dtype=float)

KeyboardInterrupt: 

In [322]:
# Labels of the test-set instances, collected to the driver; accuracy() compares
# its predictions against test_array_labels[i][0].
test_array_labels = np.array(test.select('label').collect(), dtype=float)

In [323]:
# Report how many rows ended up in each split (each count() triggers a Spark job).
print('RDD de entrenamiento: ' + str(rdd_train.count()))
print('RDD de test: ' + str(rdd_test.count()))

RDD de entrenamiento: 131
RDD de test: 69


In [325]:
# Appends a chunk of text to the distances .svm file.
def save_file(data, path='../data/url_svmlight/Distancia_euclideana_100_x_500000.svm'):
    """Append ``data`` to the output file.

    Args:
        data (str): text to append; it is written as-is (no newline is added).
        path (str): destination file. Defaults to the distances file used by
            this notebook, so existing one-argument calls behave as before.

    Returns:
        None.
    """
    # The context manager closes the handle even if write() raises.
    with open(path, 'a') as handle:
        handle.write(data)

In [326]:
def euclidean_distance(instance):
    """Write the per-feature distances from one test instance to every training row.

    For each row of the module-level ``train_array`` one text line is built:
    the training label (``train_array_labels[row][0]``) followed by one value
    per feature, namely the absolute difference between the training value and
    ``instance.features`` at that position. Every field is followed by a single
    space and every line ends with a newline. The lines for all training rows
    are appended to the output file with a single ``save_file`` call.

    NOTE(review): despite the name, the per-feature values are not combined
    into one Euclidean distance per (test, train) pair; confirm that whatever
    consumes the file expects per-feature values.

    Args:
        instance (pyspark.sql.types.Row): one row of the dataset, exposing a
            ``features`` vector.

    Returns:
        None. The result is written to disk through ``save_file``.
    """
    features = instance.features
    n_features = len(features)
    lines = []
    for row in range(len(train_array)):
        train_features = train_array[row][0]
        fields = [str(train_array_labels[row][0])]
        for column in range(n_features):
            # abs(d) is what sqrt(d ** 2) computes, without the intermediate
            # square that can overflow or underflow.
            # To emit libsvm-style pairs instead, prefix each value with
            # str(column + 1) + ':'.
            difference = train_features[column] - features[column]
            fields.append(str(abs(float(difference))))
        # Same layout as before: every field, including the last, is followed by a space.
        lines.append(' '.join(fields) + ' \n')
    # Join once at the end instead of growing a string inside the loops.
    save_file(''.join(lines))

In [None]:
# Run euclidean_distance() once for every row of the test DataFrame; each call
# appends its lines to the output file through save_file().
# NOTE(review): foreach runs on Spark workers, so appends from different rows may
# interleave in the file — confirm the consumer does not depend on row order.
test.foreach(euclidean_distance)

In [290]:
# Load the three sample files as RDDs of text lines.
rdd_samp1, rdd_samp2, rdd_samp3 = (
    sc.textFile(sample_path)
    for sample_path in (
        '../data/url_svmlight/arch_prb.svm',
        '../data/url_svmlight/arch_prb1.svm',
        '../data/url_svmlight/arch_prb2.svm',
    )
)

In [299]:
# Take the five smallest lines of each sample RDD.
# NOTE(review): the RDD elements are text lines, so takeOrdered(5) compares them
# as strings — confirm that this ordering matches the intended nearest-first order.
five_nearest1, five_nearest2, five_nearest3 = (
    sample_rdd.takeOrdered(5)
    for sample_rdd in (rdd_samp1, rdd_samp2, rdd_samp3)
)

In [311]:
def class_average(five_nearest, k=5):
    """Predict a binary class by averaging the labels of the nearest neighbours.

    Each neighbour is a text line; only its first character is parsed (with
    ``float``) as the label, so labels are presumably the single digits 0 / 1 —
    verify against the file format. At most the first ``k`` neighbours vote,
    and the mean is taken over the neighbours actually available, so a list
    shorter than ``k`` no longer raises ``IndexError``.

    Args:
        five_nearest (list[str]): neighbour lines, nearest first.
        k (int): maximum number of neighbours that vote. Defaults to 5.

    Returns:
        int: 1 if the mean label is greater than 0.5, otherwise 0. The
        prediction is also printed.

    Raises:
        ValueError: if ``k`` is smaller than 1 or there are no neighbours.
    """
    if k < 1:
        raise ValueError('k must be at least 1')
    voters = five_nearest[:k]
    if not voters:
        raise ValueError('class_average needs at least one neighbour')
    mean = sum(float(line[0]) for line in voters) / len(voters)
    predicted = 1 if mean > 0.5 else 0
    print('Clase K-NN: ' + str(predicted))
    return predicted

In [317]:
def accuracy():
    """Print each prediction next to its real class, then the overall accuracy.

    The i-th entry of ``test_array_labels`` is compared with the class that
    ``class_average`` predicts from the i-th neighbour list
    (``five_nearest1``, ``five_nearest2``, ``five_nearest3``). The accuracy is
    the fraction of matching predictions.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: if there are no test labels, or more test labels than
            neighbour lists. Both are checked up front so the function fails
            before printing partial results.
    """
    neighbour_lists = [five_nearest1, five_nearest2, five_nearest3]
    total = len(test_array_labels)
    if total == 0:
        raise ValueError('test_array_labels is empty; accuracy is undefined')
    if total > len(neighbour_lists):
        raise ValueError(
            'there are ' + str(total) + ' test labels but only '
            + str(len(neighbour_lists)) + ' neighbour lists'
        )

    hits = 0.0
    for index in range(total):
        real_class = test_array_labels[index][0]
        if real_class == class_average(neighbour_lists[index]):
            hits += 1
        print('Clase Real: ' + str(real_class))
        print('\n')
    print('Accuracy: ' + str(hits / total))

In [318]:
# Run the evaluation: prints every prediction with its real class, then the accuracy.
accuracy()

Clase K-NN: 1
Clase Real: 0.0


Clase K-NN: 1
Clase Real: 0.0


Clase K-NN: 1
Clase Real: 0.0


Accuracy: 0.0


# Segmento de Prueba 

In [300]:
# Scratch: the three neighbour lists gathered in one list (accuracy() builds the same list internally).
lista = [five_nearest1, five_nearest2, five_nearest3]

In [314]:
# listaclass_average

In [215]:
# five_nearest[0][0] # Clase

In [53]:
# rdd_prueba2 = sc.textFile('../data/url_svmlight/Distancia_euclideana_5_x_76.svm')

In [238]:
# NOTE(review): `rdd_prueba` is not defined in any visible cell, so this cell fails
# on a fresh kernel — confirm which RDD was meant.
# It also rebinds five_nearest2, which accuracy() reads.
five_nearest2 = rdd_prueba.takeOrdered(5)
type(five_nearest2)

list

In [224]:
# First character of the first neighbour line — the value class_average() parses as the label.
five_nearest2[0][0]

'0'

In [237]:
# Scratch cell: prints a fixed string and touches no notebook state.
print('hola')

hola
