In [1]:
#from pyspark import SparkConf, SparkContext
#conf = SparkConf().setAppName('spark-yarn') \
#                .setMaster('spark://172.22.0.2:7077')
#sc = SparkContext(conf=conf)

In [None]:
# IPython line magic: list the contents of /opt/ (the notebook's jars and data are read from /opt/spark-data).
%ls /opt/

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# Jars required by XGBoost4J-Spark 0.90.
# `spark.jars` takes ONE comma-separated list. Setting the same key twice on
# the builder keeps only the last value, which silently dropped
# xgboost4j-0.90.jar from the classpath.
xgb_jars = ','.join([
    '/opt/spark-data/libs/xgb_0_9/xgboost4j-0.90.jar',
    '/opt/spark-data/libs/xgb_0_9/xgboost4j-spark-0.90.jar',
])

# Connect to the standalone master and create (or reuse) the session.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .config('spark.executor.memory', '2g')
    .config('spark.driver.memory', '4g')
    .config('spark.jars', xgb_jars)
    .getOrCreate()
)

In [5]:
# testing the repackaging - failed for now - June 2020
#spark.sparkContext.addPyFile("/opt/spark-data/libs/xgboost4j-spark_2.12-1.0.0.jar")
#spark.sparkContext.addPyFile("/opt/spark-data/libs/xgboost4j_2.12-1.0.0.jar")

In [6]:
# Python wrapper archive for XGBoost 0.9; presumably it provides the
# `sparkxgb` package imported in the following cell (confirm archive contents).
xgb_wrapper_zip = "/opt/spark-data/libs/pyspark-xgboost.zip"
spark.sparkContext.addPyFile(xgb_wrapper_zip)

In [7]:
import sparkxgb

## Iris Example

In [8]:

# Iris columns: four nullable double measurements followed by a string class label.
iris_measurements = ["sepal length", "sepal width", "petal length", "petal width"]
schema = StructType(
    [StructField(col_name, DoubleType(), True) for col_name in iris_measurements]
    + [StructField("class", StringType(), True)]
)

# Read the CSV with the explicit schema; no header option is set, so every
# line of the file is parsed as a data row.
rawInput = spark.read.csv("/opt/spark-data/iris.data", schema=schema)

# Indexer that will map the string "class" column to a numeric "classIndex".
stringIndexer = StringIndexer().setInputCol("class").setOutputCol("classIndex")

In [9]:
# Fit the label indexer on the raw Iris data; the fitted indexer is applied to
# `rawInput` in the next cell to produce the "classIndex" column.
# NOTE(review): the name `model` is rebound by later cells (XGBoost fit,
# pipeline fit), so this cell must run before the label transform.
model = stringIndexer.fit(rawInput)

In [10]:
# Add the numeric "classIndex" label and drop the original string column.
labelTransformed = model.transform(rawInput).drop("class")

# XGBoost4J-Spark expects all features packed into a single vector column.
vectorAssembler = VectorAssembler(
    inputCols=["sepal length", "sepal width", "petal length", "petal width"],
    outputCol="features",
)

# Keep only what the classifier needs: the feature vector and the label.
xgbInput = vectorAssembler.transform(labelTransformed).select("features", "classIndex")

# Fixed seed so the 80/20 split is repeatable across runs (same seed as the
# Titanic split later in this notebook).
train, test = xgbInput.randomSplit([0.8, 0.2], seed=24)


In [11]:
# Iris classifier: reads the assembled "features" vector and the indexed
# "classIndex" label. Hyper-parameters are set in a later cell.
iris_xgb_columns = dict(featuresCol="features", labelCol="classIndex")
xgb_model = sparkxgb.XGBoostClassifier(**iris_xgb_columns)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling o130.params

In [None]:
# Training configuration: multi-class soft-probability objective over 3
# classes, 10 boosting rounds, 2 workers.
xgb_params = dict(
    eta=0.1,
    maxDepth=2,
    objective="multi:softprob",
    numClass=3,
    numRound=10,
    numWorkers=2,
)
xgb_model.setParams(**xgb_params)

# Train on the training split; `model` now refers to the fitted XGBoost model.
model = xgb_model.fit(train)

In [None]:
# Score the held-out split with the fitted model.
results = model.transform(test)

# Use the "accuracy" metric: the value is stored in `accuracy` and reported as
# "Test Error = 1 - accuracy", which is only the misclassification rate when
# the metric really is accuracy (it was "weightedPrecision" before).
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="classIndex",
    metricName="accuracy",
)

# Classification error on the test data.
accuracy = evaluator.evaluate(results)
print("Test Error = %g" % (1.0 - accuracy))

## Kaggle Titanic Example

In [None]:
from pyspark.sql.types import *
# Titanic columns as (name, Spark type) pairs, in the order the schema declares them.
# NOTE: this rebinds `schema`, which held the Iris schema earlier in the notebook.
titanic_columns = [
    ("PassengerId", DoubleType()),
    ("Survival", DoubleType()),
    ("Pclass", DoubleType()),
    ("Name", StringType()),
    ("Sex", StringType()),
    ("Age", DoubleType()),
    ("SibSp", DoubleType()),
    ("Parch", DoubleType()),
    ("Ticket", StringType()),
    ("Fare", DoubleType()),
    ("Cabin", StringType()),
    ("Embarked", StringType()),
]
schema = StructType(
    [StructField(col_name, col_type) for col_name, col_type in titanic_columns]
)

In [None]:
# Reader configured to treat the first line as a header row and to apply the
# explicit schema.
titanic_reader = spark.read.option("header", "true").schema(schema)

# Load the Titanic training CSV.
df_raw = titanic_reader.csv("/opt/spark-data/train.csv")

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [None]:
# Replace nulls with 0. NOTE(review): a numeric fill value is expected to
# affect only numeric columns, leaving string columns (e.g. Cabin, Embarked)
# untouched; confirm against the Spark version in use.
df = df_raw.fillna(0)

In [None]:
# One StringIndexer per categorical column. handleInvalid="keep" keeps rows
# whose value is invalid for the fitted indexer instead of raising.
sexIndexer = StringIndexer(
    inputCol="Sex", outputCol="SexIndex", handleInvalid="keep"
)

cabinIndexer = StringIndexer(
    inputCol="Cabin", outputCol="CabinIndex", handleInvalid="keep"
)

embarkedIndexer = StringIndexer(
    inputCol="Embarked", outputCol="EmbarkedIndex", handleInvalid="keep"
)

In [None]:
# Pack the numeric and indexed categorical columns into one "features" vector.
# NOTE: this rebinds `vectorAssembler`, which held the Iris assembler earlier.
titanic_feature_cols = [
    "Pclass", "SexIndex", "Age", "SibSp", "Parch", "Fare", "CabinIndex", "EmbarkedIndex",
]
vectorAssembler = VectorAssembler(
    inputCols=titanic_feature_cols,
    outputCol="features",
)

In [None]:
import sparkxgb
from pyspark.ml import Pipeline

In [None]:
# Titanic classifier: reads the assembled "features" vector, learns the
# "Survival" label and writes its output to "prediction". No hyper-parameters
# are set in this cell.
titanic_xgb_columns = dict(
    featuresCol="features",
    labelCol="Survival",
    predictionCol="prediction",
)
boost_model = sparkxgb.XGBoostClassifier(**titanic_xgb_columns)

In [None]:
trainDF.head(2)

In [None]:
pipeline = Pipeline().setStages([sexIndexer, cabinIndexer, embarkedIndexer, vectorAssembler, boost_model])

In [None]:
trainDF, testDF = df.randomSplit([0.8, 0.2], seed=24)

In [None]:
# Fit every pipeline stage on the training split.
# NOTE: this rebinds `model`, which earlier cells used for the Iris models.
model = pipeline.fit(trainDF)

### Shutdown

In [None]:
# Stop the Spark session; `spark` cannot be used by later cells unless a new session is created.
spark.stop()