In [2]:
# Import SparkSession
import findspark
findspark.init()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql import SparkSession
import random
import sys
#path = str(sys.argv[1])
#atr = int(sys.argv[2])

In [4]:
# Build the SparkSession
spark = SparkSession.builder \
    .master("local[6]") \
    .appName("Data exploration URL") \
    .config("spark.executor.memory", "4gb") \
    .getOrCreate()

sc = spark.sparkContext

In [7]:
sc._conf.getAll()

[('spark.executor.memory', '4gb'),
 ('spark.driver.memory', '4g'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'Data exploration URL'),
 ('spark.driver.port', '36657'),
 ('spark.master', 'local[6]'),
 ('spark.app.id', 'local-1615397379447'),
 ('spark.driver.host', 'fedora'),
 ('spark.ui.showConsoleProgress', 'true')]

In [8]:
# Load training data
data = spark.read.format("libsvm")\
    .load("../data/url_svmlight/Dimension_General_x_100.svm")
# Split the data into train and test
seed = random.randrange(500, 1300, 2)
splits = data.randomSplit([0.6, 0.4], seed)

train = splits[0]
test = splits[1]

In [9]:
data.take(1)

[Row(label=0.0, features=SparseVector(100, {1: 1.0, 3: 0.0747, 4: 0.1172, 5: 0.1176, 8: 1.0, 9: 1.0, 10: 0.1429, 15: 0.3, 16: 0.8055, 17: 0.8276, 18: 0.8391, 20: 0.2857, 21: 0.0238, 23: 1.0, 24: 1.0, 27: 1.0, 32: 0.0556, 35: 1.0, 40: 0.1, 43: 1.0, 53: 1.0, 55: 1.0, 61: 1.0, 63: 1.0, 65: 1.0, 67: 1.0, 69: 1.0, 71: 1.0, 73: 1.0, 75: 1.0, 80: 0.05, 81: 1.0, 83: 1.0, 85: 1.0, 87: 1.0, 89: 1.0, 91: 1.0, 93: 1.0, 95: 1.0}))]

In [10]:
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [100, 5, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(
    maxIter=1000, layers=layers, blockSize=1000, seed=1234)

# train the model
model = trainer.fit(train)

In [11]:
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")

In [12]:
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("f1: " + str(evaluator.evaluate(predictionAndLabels)))
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
print("weightedPrecision: " + str(evaluator.evaluate(predictionAndLabels)))
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
print("weightedRecall: " + str(evaluator.evaluate(predictionAndLabels)))
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))

f1: 0.9750246998082348
weightedPrecision: 0.9750231797207569
weightedRecall: 0.9750262793452689
Accuracy: 0.9750262793452689


In [9]:
x = 1

In [10]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StringType
from pyspark import SQLContext
from pyspark.sql import functions as F
conf = SparkConf().setMaster("local[11]").setAppName("KDD")
conf.set("spark.driver.memory", "28g") 
sc=SparkContext(conf=conf)
sqlContext=SQLContext(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Data exploration URL, master=local[6]) created by getOrCreate at <ipython-input-2-a520991c68d4>:2 

In [None]:
dfspark=sqlContext.read.format("csv").option("header","true").option("inferSchema","true").load('../data/url_svmlight/prueba.svm')

In [None]:
dfspark.head()

In [None]:
dfspark.printSchema()