In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
import random
import math
import sys
import numpy as np 

In [2]:
# Build the SparkSession
spark = SparkSession.builder \
    .master("local[6]") \
    .appName("Data exploration URL - KNN") \
    .config("spark.executor.memory", "4gb") \
    .getOrCreate()

sc = spark.sparkContext

In [3]:
sc._conf.getAll()

[('spark.executor.memory', '4gb'),
 ('spark.app.name', 'Data exploration URL - KNN'),
 ('spark.app.id', 'local-1617025841411'),
 ('spark.executor.id', 'driver'),
 ('spark.app.startTime', '1617025838061'),
 ('spark.master', 'local[6]'),
 ('spark.driver.port', '37975'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.sql.warehouse.dir',
  'file:/home/jsarabia/Documents/IA/Data-exploration-url_svmlight/code/spark-warehouse'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.host', 'fedora')]

In [4]:
sc

In [5]:
# Load training data
data = spark.read.format("libsvm")\
    .load("../data/url_svmlight/Dimension_5_x_76.svm")
# Split the data into train and test
seed = random.randrange(500, 1300, 2)
splits = data.randomSplit([0.7, 0.3], 1234)

train = splits[0]
test = splits[1]

In [56]:
# Calcular la distancia euclideana
def euclidean_distance(row1, row2):
    distance = 0.0
    for column in range(len(row1)):
        distance += pow(row1[column] - row2[column], 2)
    distance = math.sqrt(distance)
    return distance

In [57]:
total = np.array(data.select('features').collect(), dtype=float)
row1 = total[9]
for row in total:
    distance = euclidean_distance(row1, row)
    print(distance)

TypeError: only size-1 arrays can be converted to Python scalars

In [58]:
# Obtener los vecinos más cercanos
def get_neighbors(train, test_row, k):
    distances = []
    for train_row in train:
        distance = euclidean_distance(test_row, train_row)
        distances.append((train_row, distance))
    distances.sort(key = lambda tup: tup[1])
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])
    return neighbors

In [59]:
# Test distance function
dataset = [[2.7810836,2.550537003,0],
	[1.465489372,2.362125076,0],
	[3.396561688,4.400293529,0],
	[1.38807019,1.850220317,0],
	[3.06407232,3.005305973,0],
	[7.627531214,2.759262235,1],
	[5.332441248,2.088626775,1],
	[6.922596716,1.77106367,1],
	[8.675418651,-0.242068655,1],
	[7.673756466,3.508563011,1]]
neighbors = get_neighbors(dataset, dataset[0], k=3)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


In [61]:
def predict_classification(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k = k)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

In [67]:
# Test distance function
dataset = [[2.7810836,2.550537003,0],
	[1.465489372,2.362125076,0],
	[3.396561688,4.400293529,0],
	[1.38807019,1.850220317,0],
	[3.06407232,3.005305973,0],
	[7.627531214,2.759262235,1],
	[5.332441248,2.088626775,1],
	[6.922596716,1.77106367,1],
	[8.675418651,-0.242068655,1],
	[7.673756466,3.508563011,1]]
prediction = predict_classification(dataset, dataset[6], k = 1)
print('Expected %d, Got %d.' % (dataset[0][-1], prediction))

Expected 0, Got 1.


# Segmento de Prueba 

In [5]:
# Se asignan los RDD para el posterior procesamiento
rdd_train = train.rdd
rdd_test = test.rdd

In [17]:
# Se utilizan dos array de numpy para alamacenar las instancias del set de entrenamiento y procesar con un RDD y el segundo array almacena las etiquetas. 
#train_array = np.array(train.select('features').collect(), dtype=float)
#train_array_labels = np.array(train.select('label').collect(), dtype=float)

In [322]:
# Etiquteas de las instancias del conjunto de test. 
test_array_labels = np.array(test.select('label').collect(), dtype=float)

In [323]:
print('RDD de entrenamiento: ' + str(rdd_train.count()))
print('RDD de test: ' + str(rdd_test.count()))

RDD de entrenamiento: 131
RDD de test: 69


In [325]:
# Metodo que guarda cada renglon en un archivo .svm
def save_file(data):
    file = open('../data/url_svmlight/Distancia_euclideana_100_x_500000.svm', 'a')
    file.write(data)
    file.close()

In [326]:
"""
[summary:
    Método para calcular la distancia euclídea entre cada una de las 
    columnas del conjunto de test respecto a las columnas del conjunto
    de entrenamiento.
]

Args:
    instance ([pyspark.sql.types.Row]): [
        Recibe cada una de las instancias que hay en el dataset
    ]
"""
def euclidean_distance(instance):
    distance = 0
    instance_distance = ''
    for row in range(len(train_array)):
        instance_distance += str(train_array_labels[row][0]) + ' '
        for column in range(len(instance[1])):
            distance = pow(train_array[row][0][column] - instance.features[column], 2)
            distance = math.sqrt(distance)
            # instance_distance += str(column + 1) +':' + str(distance) + ' ' # -> Si quisiera poner los indices de cada caracteristica.
            instance_distance += str(distance) + ' '
        instance_distance += '\n'
    save_file(instance_distance)

In [None]:
# Ejecuta el método que calcula la distancia euclídea entre los puntos euclidean_distance()
test.foreach(euclidean_distance)

In [290]:
rdd_samp1 = sc.textFile('../data/url_svmlight/arch_prb.svm')
rdd_samp2 = sc.textFile('../data/url_svmlight/arch_prb1.svm')
rdd_samp3 = sc.textFile('../data/url_svmlight/arch_prb2.svm')

In [299]:
five_nearest1 = rdd_samp1.takeOrdered(5)
five_nearest2 = rdd_samp2.takeOrdered(5)
five_nearest3 = rdd_samp3.takeOrdered(5)

In [311]:
def class_average(five_nearest):
    mean = 0
    for i in range(5):
        mean += float(five_nearest[i][0])
    mean = mean / 5
    if(mean > 0.5):
        print('Clase K-NN: 1')
        return 1
    else:
        print('Clase K-NN: 0')
        return 0

In [317]:
def accuracy():
    lista = [five_nearest1, five_nearest2, five_nearest3]
    accuracy = 0.0
    for i in range(len(test_array_labels)):
        if(test_array_labels[i][0] == class_average(lista[i])):
            accuracy += 1
        print('Clase Real: ' + str(test_array_labels[i][0]))
        print('\n')
    accuracy = accuracy / len(test_array_labels)
    print('Accuracy: ' + str(accuracy))

In [318]:
accuracy()

Clase K-NN: 1
Clase Real: 0.0


Clase K-NN: 1
Clase Real: 0.0


Clase K-NN: 1
Clase Real: 0.0


Accuracy: 0.0


In [300]:
lista = [five_nearest1, five_nearest2, five_nearest3]

In [314]:
# listaclass_average

In [215]:
# five_nearest[0][0] # Clase

In [53]:
# rdd_prueba2 = sc.textFile('../data/url_svmlight/Distancia_euclideana_5_x_76.svm')

In [238]:
five_nearest2 = rdd_prueba.takeOrdered(5)
type(five_nearest2)

list

In [224]:
five_nearest2[0][0]

'0'

In [1]:
pwd

'/home/jsarabia/Documents/IA/Data-exploration-url_svmlight/code'