In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Challenge').getOrCreate()

In [3]:
data = spark.read.csv('train.csv', header=True, inferSchema=True)

data.show(4)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
only showing top 4 rows



In [4]:
# """ Select the Columns to use """
selected_data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Survived']]


#""" Drop the na columns """
clean_data = selected_data.na.drop()


# """ Encode the String Columns """
from pyspark.ml.feature import StringIndexer
indexer1 = StringIndexer(inputCol="Sex", outputCol="idx_Sex")
indexer2 = StringIndexer(inputCol="Embarked", outputCol="idx_Embarked")


processed_data = indexer1.fit(clean_data).transform(clean_data)
processed_data = indexer2.fit(processed_data).transform(processed_data)

processed_data.show(5)

+------+------+----+-----+-----+--------+--------+-------+------------+
|Pclass|   Sex| Age|SibSp|Parch|Embarked|Survived|idx_Sex|idx_Embarked|
+------+------+----+-----+-----+--------+--------+-------+------------+
|     3|  male|22.0|    1|    0|       S|       0|    0.0|         0.0|
|     1|female|38.0|    1|    0|       C|       1|    1.0|         1.0|
|     3|female|26.0|    0|    0|       S|       1|    1.0|         0.0|
|     1|female|35.0|    1|    0|       S|       1|    1.0|         0.0|
|     3|  male|35.0|    0|    0|       S|       0|    0.0|         0.0|
+------+------+----+-----+-----+--------+--------+-------+------------+
only showing top 5 rows



In [5]:
processed_data.printSchema()

root
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- idx_Sex: double (nullable = false)
 |-- idx_Embarked: double (nullable = false)



In [6]:
processed_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'idx_Sex', 'idx_Embarked']
processed_data2 = processed_data[processed_features]
processed_data2.show(10)
# processed_data2.printSchema()

+------+----+-----+-----+-------+------------+
|Pclass| Age|SibSp|Parch|idx_Sex|idx_Embarked|
+------+----+-----+-----+-------+------------+
|     3|22.0|    1|    0|    0.0|         0.0|
|     1|38.0|    1|    0|    1.0|         1.0|
|     3|26.0|    0|    0|    1.0|         0.0|
|     1|35.0|    1|    0|    1.0|         0.0|
|     3|35.0|    0|    0|    0.0|         0.0|
|     1|54.0|    0|    0|    0.0|         0.0|
|     3| 2.0|    3|    1|    0.0|         0.0|
|     3|27.0|    0|    2|    1.0|         0.0|
|     2|14.0|    1|    0|    1.0|         1.0|
|     3| 4.0|    1|    1|    1.0|         0.0|
+------+----+-----+-----+-------+------------+
only showing top 10 rows



In [7]:
""" Split it into features and target """
processed_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'idx_Sex', 'idx_Embarked']
target_column   = ['Survived']


from pyspark.ml.feature import VectorAssembler 

vectorizer = VectorAssembler(inputCols=processed_features, outputCol='features')
processed_data3 = vectorizer.transform(processed_data)
processed_data3.show(10)

+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+
|Pclass|   Sex| Age|SibSp|Parch|Embarked|Survived|idx_Sex|idx_Embarked|            features|
+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+
|     3|  male|22.0|    1|    0|       S|       0|    0.0|         0.0|[3.0,22.0,1.0,0.0...|
|     1|female|38.0|    1|    0|       C|       1|    1.0|         1.0|[1.0,38.0,1.0,0.0...|
|     3|female|26.0|    0|    0|       S|       1|    1.0|         0.0|[3.0,26.0,0.0,0.0...|
|     1|female|35.0|    1|    0|       S|       1|    1.0|         0.0|[1.0,35.0,1.0,0.0...|
|     3|  male|35.0|    0|    0|       S|       0|    0.0|         0.0|(6,[0,1],[3.0,35.0])|
|     1|  male|54.0|    0|    0|       S|       0|    0.0|         0.0|(6,[0,1],[1.0,54.0])|
|     3|  male| 2.0|    3|    1|       S|       0|    0.0|         0.0|[3.0,2.0,3.0,1.0,...|
|     3|female|27.0|    0|    2|       S|       1|    1.0|         0.0

In [8]:
from pyspark.ml.classification import RandomForestClassifier

random_forest_clf = RandomForestClassifier(featuresCol='features', labelCol='Survived')



In [9]:
df_train, df_test = processed_data3.randomSplit([0.8, 0.2], seed=0)
df_test.show(5)

+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+
|Pclass|   Sex| Age|SibSp|Parch|Embarked|Survived|idx_Sex|idx_Embarked|            features|
+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+
|     1|female|23.0|    1|    0|       C|       1|    1.0|         1.0|[1.0,23.0,1.0,0.0...|
|     1|female|23.0|    3|    2|       S|       1|    1.0|         0.0|[1.0,23.0,3.0,2.0...|
|     1|female|26.0|    0|    0|       S|       1|    1.0|         0.0|[1.0,26.0,0.0,0.0...|
|     1|female|29.0|    0|    0|       S|       1|    1.0|         0.0|[1.0,29.0,0.0,0.0...|
|     1|female|30.0|    0|    0|       C|       1|    1.0|         1.0|[1.0,30.0,0.0,0.0...|
+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+
only showing top 5 rows



In [10]:
random_forest_training = random_forest_clf.fit(df_train)

In [11]:
random_forest_clf.save("lrm_model.model")

In [12]:
random_forest_test = random_forest_training.transform(df_test)
random_forest_test.show(5)

+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+--------------------+--------------------+----------+
|Pclass|   Sex| Age|SibSp|Parch|Embarked|Survived|idx_Sex|idx_Embarked|            features|       rawPrediction|         probability|prediction|
+------+------+----+-----+-----+--------+--------+-------+------------+--------------------+--------------------+--------------------+----------+
|     1|female|23.0|    1|    0|       C|       1|    1.0|         1.0|[1.0,23.0,1.0,0.0...|[1.61928679721475...|[0.08096433986073...|       1.0|
|     1|female|23.0|    3|    2|       S|       1|    1.0|         0.0|[1.0,23.0,3.0,2.0...|[1.76309502121857...|[0.08815475106092...|       1.0|
|     1|female|26.0|    0|    0|       S|       1|    1.0|         0.0|[1.0,26.0,0.0,0.0...|[1.61624658930409...|[0.08081232946520...|       1.0|
|     1|female|29.0|    0|    0|       S|       1|    1.0|         0.0|[1.0,29.0,0.0,0.0...|[1.61624658930409...|[0.08081232

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 

In [14]:
criterion = MulticlassClassificationEvaluator(labelCol='Survived', predictionCol='prediction')

In [15]:
accuracy = criterion.evaluate(random_forest_test)
accuracy

0.7929446851843902