In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

from xgboost.spark import SparkXGBClassifier

In [2]:
# Build the Spark session used by every later cell (getOrCreate returns the
# already-running session if one exists). The chain is wrapped in parentheses
# because Python does not allow a statement to continue onto a line that
# starts with "." unless it is inside brackets.
spark = (
    SparkSession.builder
    .appName("PySpark XGBOOST Native")
    .getOrCreate()
)

22/10/20 15:47:18 WARN Utils: Your hostname, Bogdans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.11 instead (on interface en0)
22/10/20 15:47:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/20 15:47:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Explicit schema for train.csv: free-text / categorical columns are read as
# strings, every other column (including the "Survived" label) as doubles.
# Each field gets its own freshly constructed type instance.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ("PassengerId", DoubleType),
        ("Survived", DoubleType),
        ("Pclass", DoubleType),
        ("Name", StringType),
        ("Sex", StringType),
        ("Age", DoubleType),
        ("SibSp", DoubleType),
        ("Parch", DoubleType),
        ("Ticket", StringType),
        ("Fare", DoubleType),
        ("Cabin", StringType),
        ("Embarked", StringType),
    )
])

In [4]:
# Read the raw training data: the first CSV line is treated as a header and
# the columns are typed with the explicit `schema` defined above rather than
# inferred. Parentheses are required for the multi-line method chain to be
# valid Python.
df_raw = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("train.csv")
)

In [5]:
# Peek at the first two rows to sanity-check the parsed columns.
df_raw.show(n=2)

+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
|        1.0|     0.0|   3.0|Braund, Mr. Owen ...|  male|22.0|  1.0|  0.0|A/5 21171|   7.25| null|       S|
|        2.0|     1.0|   1.0|Cumings, Mrs. Joh...|female|38.0|  1.0|  0.0| PC 17599|71.2833|  C85|       C|
+-----------+--------+------+--------------------+------+----+-----+-----+---------+-------+-----+--------+
only showing top 2 rows



In [6]:
# Replace missing values with 0, producing a new frame and leaving df_raw
# untouched. NOTE(review): with a numeric fill value this presumably only
# affects numeric columns, so string columns such as Cabin stay null - confirm.
df = df_raw.fillna(0)

In [7]:
# Index the three categorical string columns into numeric "*Index" columns.
# The original Scala-style setter chains (lines starting with "." and no
# enclosing brackets) are not valid Python; the equivalent constructor
# keyword parameters are used instead.
# handleInvalid="keep" is set on every indexer so that values the fitted
# indexer cannot map are kept rather than raising an error.
sexIndexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIndex",
    handleInvalid="keep",
)

cabinIndexer = StringIndexer(
    inputCol="Cabin",
    outputCol="CabinIndex",
    handleInvalid="keep",
)

embarkedIndexer = StringIndexer(
    inputCol="Embarked",
    outputCol="EmbarkedIndex",
    handleInvalid="keep",
)

In [8]:
# Combine the numeric columns and the indexed categorical columns into the
# single "features" vector column that the classifier reads. Constructor
# keyword parameters replace the original unparenthesised setter chain,
# which was a Python syntax error.
vectorAssembler = VectorAssembler(
    inputCols=[
        "Pclass",
        "SexIndex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "CabinIndex",
        "EmbarkedIndex",
    ],
    outputCol="features",
)

In [9]:
# XGBoost classifier stage: reads the assembled "features" vector, uses
# "Survived" as the label, and is configured with two workers.
xgboost = SparkXGBClassifier(
    num_workers=2,
    label_col="Survived",
    features_col="features",
)

In [10]:
# Chain the stages in execution order: the three string indexers, then the
# feature assembler, then the classifier.
pipeline = Pipeline(
    stages=[
        sexIndexer,
        cabinIndexer,
        embarkedIndexer,
        vectorAssembler,
        xgboost,
    ]
)

In [11]:
# 80/20 train/test split; the seed is fixed so the split can be repeated.
trainDF, testDF = df.randomSplit(weights=[0.8, 0.2], seed=24)

In [12]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset=trainDF)

[15:47:47] task 1 got new rank 0                                    (0 + 2) / 2]
[15:47:47] task 0 got new rank 1


In [13]:
# Score the held-out split and display each passenger id next to its
# predicted label.
(
    model
    .transform(testDF)
    .select("PassengerId", "prediction")
    .show()
)

+-----------+----------+
|PassengerId|prediction|
+-----------+----------+
|        8.0|       0.0|
|       14.0|       0.0|
|       19.0|       1.0|
|       21.0|       0.0|
|       40.0|       1.0|
|       43.0|       0.0|
|       56.0|       0.0|
|       73.0|       0.0|
|       75.0|       0.0|
|       77.0|       0.0|
|       84.0|       0.0|
|       86.0|       0.0|
|       97.0|       0.0|
|       99.0|       1.0|
|      102.0|       0.0|
|      103.0|       0.0|
|      106.0|       0.0|
|      110.0|       1.0|
|      116.0|       0.0|
|      119.0|       0.0|
+-----------+----------+
only showing top 20 rows

