# Reduce the dimensionality of datasets in libsvm format

In [1]:
# Import SparkSession
import findspark
findspark.init()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql import SparkSession
import random
import sys
#path = str(sys.argv[1])
#atr = int(sys.argv[2])

In [2]:
# Create (or reuse) the SparkSession for this notebook, running on a local master.
spark = (
    SparkSession.builder
    .master("local[6]")
    .appName("Data exploration URL")
    .config("spark.executor.memory", "4gb")
    .getOrCreate()
)

# SparkContext handle, used below for the RDD API (textFile / map / count).
sc = spark.sparkContext

In [3]:
# List the effective Spark configuration as (key, value) pairs, using the
# public accessor rather than the private ``_conf`` attribute.
sc.getConf().getAll()

[('spark.executor.memory', '4gb'),
 ('spark.driver.port', '42475'),
 ('spark.driver.memory', '4g'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'Data exploration URL'),
 ('spark.app.id', 'local-1614912135613'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.master', 'local[6]'),
 ('spark.driver.host', 'fedora')]

In [4]:
# Load the libsvm-formatted input file as an RDD of raw text lines.
data = sc.textFile("../data/url_svmlight/Instances_1000.svm")

In [5]:
# Peek at the first raw line to check the "label index:value ..." layout.
data.take(1)

['0 4:0.0788382 5:0.124138 6:0.117647 11:0.428571 16:0.1 17:0.749633 18:0.843029 19:0.197344 21:0.142856 22:0.142857 23:0.142857 28:1 33:0.0555556 41:0.1 54:1 56:1 64:1 70:1 72:1 74:1 76:1 82:1 84:1 86:1 88:1 90:1 92:1 94:1 96:1 102:1 104:1 106:1 108:1 110:1 112:1 155:1 190:1 204:1 359:1 360:1 361:1 1306:1 1309:1 1310:1 1311:1 2408:1 2921:1 2923:1 7000:1 7001:1 7002:1 7005:1 7006:1 7007:1 7009:1 7010:1 7759:1 7762:1 155153:1 155154:1 155155:1 155156:1 155157:1 155158:1 155159:1 155160:1 155161:1 155163:1 155164:1 155165:1 155166:1 155168:1 155169:1 155170:1 155172:1 155173:1 155174:1 155175:1 155176:1 155177:1 155178:1 155179:1 155180:1 155181:1 155182:1 155183:1 155194:1 155195:1 155196:1 155197:1 155198:1 155199:1 155200:1 155201:1 155202:1 155203:1 155204:1 155205:1 155206:1 155207:1 155208:1 155209:1 155210:1 155211:1 155212:1 155213:1 945789:1 1988571:1 2139257:1 2987739:1 3224681:1']

In [6]:
def save_file(data, limit):
    """Append one row to the reduced .svm output file.

    Args:
        data: Text of the row to store (label followed by its features).
        limit: Feature limit; only used to build the output file name
            ``../data/url_svmlight/Dimension_1000_x_<limit>.svm``.

    The row is written as ``'\\n' + data``: each row is preceded by a newline,
    so the file starts with an empty line and has no trailing newline. That
    layout is kept so rows appended to an existing file stay one per line.
    """
    path = f'../data/url_svmlight/Dimension_1000_x_{limit}.svm'
    # The context manager closes the handle even if the write fails.
    with open(path, 'a') as out:
        out.write('\n' + data)

In [7]:
def reduce_dimension(row, limit):
    """Truncate one libsvm row to the features with index <= ``limit`` and save it.

    The row is expected to look like ``"<label> <index>:<value> ..."``. The
    label is always kept; features are copied in order until the first one
    whose index exceeds ``limit``, at which point scanning stops. Stopping at
    the first oversized index assumes indices appear in ascending order
    (presumably guaranteed by the libsvm input - confirm for other sources).

    The reduced row is appended to the output file via ``save_file``; nothing
    is returned.

    Args:
        row: One line of the input file.
        limit: Largest feature index to keep (inclusive).

    Raises:
        ValueError: If a feature token does not start with an integer index.

    Notes:
        * Rows consisting of a label only are written too, consistent with
          rows whose features all exceed ``limit`` (which end up label-only).
        * Blank rows are skipped.
        * Tokens are split on any whitespace run, so trailing or repeated
          spaces no longer produce empty tokens that fail integer parsing.
    """
    tokens = row.split()
    if not tokens:
        # Nothing to store for an empty line.
        return

    kept = [tokens[0]]  # the class label always comes first
    for feature in tokens[1:]:
        index = int(feature.split(':', 1)[0])
        if index > limit:
            print('Se rompe el ciclo')
            break
        kept.append(feature)

    # Single write per row, whether or not the scan was cut short.
    save_file(' '.join(kept), limit)

In [8]:
# Apply reduce_dimension to every input line. The reduced rows are written to disk by
# save_file as a side effect; reduce_dimension returns nothing, so the mapped values are None.
# NOTE(review): RDD transformations are lazy, so presumably nothing is written until an
# action runs on new_data - confirm.
new_data = data.map(lambda row: reduce_dimension(row, 400000)) # The 2nd argument is the feature limit.

In [9]:
# NOTE(review): count() appears to be used only to force the map above to execute and
# thereby write the reduced file. save_file opens in append mode, so re-running this
# cell presumably appends the same rows again - confirm before re-executing.
new_data.count()

2000

In [10]:
# Test cell:
# instance = data.take(1)[0]  # take() returns a list; reduce_dimension expects a single line
# limit_dimension = 80
# reduce_dimension(instance, limit_dimension)