In [1]:
# Import SparkSession
import findspark
findspark.init()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.sql import SparkSession
import random
import sys
#path = str(sys.argv[1])
#atr = int(sys.argv[2])

In [2]:
# Build the SparkSession
spark = SparkSession.builder \
    .master("local[6]") \
    .appName("Data exploration URL") \
    .config("spark.executor.memory", "6gb") \
    .getOrCreate()

sc = spark.sparkContext

In [3]:
sc._conf.getAll()

[('spark.driver.memory', '6g'),
 ('spark.driver.port', '35457'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.executor.memory', '6gb'),
 ('spark.app.name', 'Data exploration URL'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.master', 'local[6]'),
 ('spark.driver.host', 'fedora'),
 ('spark.app.id', 'local-1614629532457')]

In [4]:
# Load training data
data = spark.read.format("libsvm")\
    .load("../data/url_svmlight/Day0.svm")
# Split the data into train and test
seed = random.randrange(500, 1300, 2)
splits = data.randomSplit([0.6, 0.4], seed)

train = splits[0]
test = splits[1]

In [5]:
data.take(1)

[Row(label=0.0, features=SparseVector(3231949, {3: 0.0788, 4: 0.1241, 5: 0.1176, 10: 0.4286, 15: 0.1, 16: 0.7496, 17: 0.843, 18: 0.1973, 20: 0.1429, 21: 0.1429, 22: 0.1429, 27: 1.0, 32: 0.0556, 40: 0.1, 53: 1.0, 55: 1.0, 63: 1.0, 69: 1.0, 71: 1.0, 73: 1.0, 75: 1.0, 81: 1.0, 83: 1.0, 85: 1.0, 87: 1.0, 89: 1.0, 91: 1.0, 93: 1.0, 95: 1.0, 101: 1.0, 103: 1.0, 105: 1.0, 107: 1.0, 109: 1.0, 111: 1.0, 154: 1.0, 189: 1.0, 203: 1.0, 358: 1.0, 359: 1.0, 360: 1.0, 1305: 1.0, 1308: 1.0, 1309: 1.0, 1310: 1.0, 2407: 1.0, 2920: 1.0, 2922: 1.0, 6999: 1.0, 7000: 1.0, 7001: 1.0, 7004: 1.0, 7005: 1.0, 7006: 1.0, 7008: 1.0, 7009: 1.0, 7758: 1.0, 7761: 1.0, 155152: 1.0, 155153: 1.0, 155154: 1.0, 155155: 1.0, 155156: 1.0, 155157: 1.0, 155158: 1.0, 155159: 1.0, 155160: 1.0, 155162: 1.0, 155163: 1.0, 155164: 1.0, 155165: 1.0, 155167: 1.0, 155168: 1.0, 155169: 1.0, 155171: 1.0, 155172: 1.0, 155173: 1.0, 155174: 1.0, 155175: 1.0, 155176: 1.0, 155177: 1.0, 155178: 1.0, 155179: 1.0, 155180: 1.0, 155181: 1.0, 1551

In [6]:
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [3231949, 5, 4, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(
    maxIter=100, layers=layers, blockSize=1000, seed=1234)

# train the model
model = trainer.fit(train)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:38937)
Traceback (most recent call last):
  File "/home/jsarabia/miniconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3427, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-11355d8dacf5>", line 11, in <module>
    model = trainer.fit(train)
  File "/opt/spark/python/pyspark/ml/base.py", line 129, in fit
    return self._fit(dataset)
  File "/opt/spark/python/pyspark/ml/wrapper.py", line 321, in _fit
    java_model = self._fit_java(dataset)
  File "/opt/spark/python/pyspark/ml/wrapper.py", line 318, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/opt/spark/python/pyspark/sql/utils.py", line 128, in deco
    return f(*a, **kw)
  File "/opt/spark/python/lib/py4j-0.10.9-src.zip/py

In [22]:
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")

In [23]:
evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("f1: " + str(evaluator.evaluate(predictionAndLabels)))
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
print("weightedPrecision: " + str(evaluator.evaluate(predictionAndLabels)))
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
print("weightedRecall: " + str(evaluator.evaluate(predictionAndLabels)))
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))

f1: 0.9696969696969697
weightedPrecision: 0.9696969696969697
weightedRecall: 0.9696969696969697
Accuracy: 0.9696969696969697


In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import StringType
from pyspark import SQLContext
from pyspark.sql import functions as F
conf = SparkConf().setMaster("local[11]").setAppName("KDD")
conf.set("spark.driver.memory", "28g") 
sc=SparkContext(conf=conf)
sqlContext=SQLContext(sc)

In [2]:
dfspark=sqlContext.read.format("csv").option("header","true").option("inferSchema","true").load('../data/url_svmlight/prueba.svm')

In [None]:
dfspark.head()

In [None]:
dfspark.printSchema()