In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [2]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

### Step 1
- Load the train and test sets
- Check the schema, the variables have their right types?
- If not, how to correctly load the datasets?

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

customSchema = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Survived", DoubleType(), True),
                           StructField("Pclass", IntegerType(), True), 
                           StructField("Name", StringType(), True),
                           StructField("Sex", StringType(), True),
                           StructField("Age", DoubleType(), True),
                           StructField("SibSp", IntegerType(), True),
                           StructField("Parch", IntegerType(), True),
                           StructField("Ticket", StringType(), True),
                           StructField("Fare", DoubleType(), True),
                           StructField("Cabin", StringType(), True),
                           StructField("Embarked", StringType(), True)])

customSchema2 = StructType([StructField("PassengerId", IntegerType(), True),
                           StructField("Pclass", IntegerType(), True), 
                           StructField("Name", StringType(), True),
                           StructField("Sex", StringType(), True),
                           StructField("Age", DoubleType(), True),
                           StructField("SibSp", IntegerType(), True),
                           StructField("Parch", IntegerType(), True),
                           StructField("Ticket", StringType(), True),
                           StructField("Fare", DoubleType(), True),
                           StructField("Cabin", StringType(), True),
                           StructField("Embarked", StringType(), True)])

train = sqlc.read.csv("./train.csv", header=True, schema=customSchema)
test = sqlc.read.csv("./test.csv", header=True, schema=customSchema2)

In [6]:
titanic = sqlc.read.format('com.databricks.spark.csv')\
                .option('header', 'true')\
                .option('inferschema', 'true')\
                .option('mode', 'DROPMALFORMED')\
                .load('./train.csv')

In [7]:
titanic.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [8]:
# for schema in titanic.schema:
#     print(schema)
    
titanic.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### Step 2
- Explore the features of your dataset
- You can use DataFrame's ***describe*** method to get summary statistics
    - hint: ***toPandas*** may be useful to ease the manipulation of small dataframes
- Are there any ***NaN*** values in your dataset?
- If so, define value/values to fill these ***NaN*** values
    - hint: ***na*** property of DataFrames provide several methods of handling NA values

In [32]:
# Calculating summary statistics and turning it into Pandas DF
train_desc = train.describe().toPandas().set_index('summary')
print(train_desc)

print(train.groupBy('Embarked').count().show())

# Computing correlations between Survived and some features
print({col:train.stat.corr('Survived',col) for col in ['Pclass','Age','SibSp','Parch','Fare']})

# Checking which columns have NULL values
print({col:train.where(train[col].isNull()).count() for col in train.columns})

# Taking the mean age from the Pandas DF
ageMean = float(train_desc.loc['mean']['Age'])
fareMean = float(train_desc.loc['mean']['Fare'])
print(ageMean)

# Filling the Age in both train and test datasets
trainFilled = train.na.fill({'Age': ageMean, 'Embarked': 'S'})
testFilled = test.na.fill({'Age': ageMean, 'Embarked': 'S', 'Fare': fareMean})

from pyspark.sql import functions as F
train.groupby('Sex','PClass').agg(F.mean('age')).show()

               PassengerId             Survived              Pclass  \
summary                                                               
count                  891                  891                 891   
mean                 446.0   0.3838383838383838   2.308641975308642   
stddev   257.3538420152301  0.48659245426485753  0.8360712409770491   
min                      1                  0.0                   1   
max                    891                  1.0                   3   

                                                     Name     Sex  \
summary                                                             
count                                                 891     891   
mean                                                 None    None   
stddev                                               None    None   
min      "Andersson, Mr. August Edvard (""Wennerstrom"")"  female   
max                           van Melkebeke, Mr. Philemon    male   

                  

In [10]:
titanic.describe().toPandas().set_index('summary')

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
mean,446.0,0.3838383838383838,2.308641975308642,,,29.69911764705882,0.5230078563411896,0.3815937149270482,260318.54916792738,32.2042079685746,,
stddev,257.3538420152301,0.4865924542648575,0.8360712409770491,,,14.526497332334037,1.1027434322934315,0.8060572211299488,471609.26868834975,49.69342859718089,,
min,1.0,0.0,1.0,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""",female,0.42,0.0,0.0,110152,0.0,A10,C
max,891.0,1.0,3.0,"van Melkebeke, Mr. Philemon",male,80.0,8.0,6.0,WE/P 5735,512.3292,T,S


In [11]:
df = train.toPandas()

In [12]:
df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Duran y More, Miss. Asuncion",male,,,,1601.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null float64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 83.6+ KB


In [14]:
df.loc[~df['Cabin'].isnull(), :].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1.0,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1.0,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


### Step 3
- How to handle categorical features?
    - hint: check the Estimators and Transformers
- Assemble all desired features into a Vector using the VectorAssembler Transformer
- Make sure to end up with a DataFrame with two columns: ***Survived*** and ***vFeatures***

In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.util import MLUtils

for (IN, OUT) in [('Sex', 'Nsex'), ('Embarked', 'Nembarked')]:
    indexer = StringIndexer().setInputCol(IN).setOutputCol(OUT)#.setHandleInvalid('skip') would have been better
    trainFilled = indexer.fit(trainFilled).transform(trainFilled)

In [16]:
encoder = OneHotEncoder().setInputCol('Nembarked').setOutputCol('HotEmbarked')
trainFilled = encoder.transform(trainFilled)

In [17]:
trainFilled.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----+---------+-------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Nsex|Nembarked|  HotEmbarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----+---------+-------------+
|          1|     0.0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S| 0.0|      0.0|(2,[0],[1.0])|
|          2|     1.0|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C| 1.0|      1.0|(2,[1],[1.0])|
|          3|     1.0|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S| 1.0|      0.0|(2,[0],[1.0])|
|          4|     1.0|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S| 1.0|      0.0|(2,[0],

In [18]:
assembler = VectorAssembler().setInputCols(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Nsex', 'HotEmbarked']).setOutputCol('features')
df_train_features = assembler.transform(trainFilled)

In [19]:
df_train_features.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----+---------+-------------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Nsex|Nembarked|  HotEmbarked|            features|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----+---------+-------------+--------------------+
|          1|     0.0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S| 0.0|      0.0|(2,[0],[1.0])|[3.0,22.0,1.0,0.0...|
|          2|     1.0|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C| 1.0|      1.0|(2,[1],[1.0])|[1.0,38.0,1.0,0.0...|
|          3|     1.0|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S| 1.0|      0.0|(2,[0],[1.0])|[3.0,26.0,0.0,0.0...|
|         

In [20]:
for (IN, OUT) in [('Sex', 'Nsex'), ('Embarked', 'Nembarked')]:
    indexer = StringIndexer().setInputCol(IN).setOutputCol(OUT)
    testFilled = indexer.fit(testFilled).transform(testFilled)
    
encoder = OneHotEncoder().setInputCol('Nembarked').setOutputCol('HotEmbarked')
testFilled = encoder.transform(testFilled)

assembler = VectorAssembler().setInputCols(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Nsex', 'HotEmbarked']).setOutputCol('features')
df_test_features = assembler.transform(testFilled)
df_test_features.show(5)

+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+----+---------+-------------+--------------------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch| Ticket|   Fare|Cabin|Embarked|Nsex|Nembarked|  HotEmbarked|            features|
+-----------+------+--------------------+------+----+-----+-----+-------+-------+-----+--------+----+---------+-------------+--------------------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0| 330911| 7.8292| null|       Q| 0.0|      2.0|    (2,[],[])|(8,[0,1,4],[3.0,3...|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0| 363272|    7.0| null|       S| 1.0|      0.0|(2,[0],[1.0])|[3.0,47.0,1.0,0.0...|
|        894|     2|Myles, Mr. Thomas...|  male|62.0|    0|    0| 240276| 9.6875| null|       Q| 0.0|      2.0|    (2,[],[])|(8,[0,1,4],[2.0,6...|
|        895|     3|    Wirz, Mr. Albert|  male|27.0|    0|    0| 315154| 8.6625| null|       S| 0.0|      0.0|(2,[0],

### Step 4
- Apply a normalization Estimator of your choice to the ***features*** vector obtained in Step 3

In [21]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler().setInputCol('features').setOutputCol('scaled_feat').setWithStd(True).setWithMean(True)
df_train_scaled = scaler.fit(df_train_features.select('survived', 'features')) \
                        .transform(df_train_features.select('survived', 'features'))
    
df_test_scaled = scaler.fit(df_test_features.select('features')) \
                        .transform(df_test_features.select('features'))

In [22]:
df_train_scaled.show(5)

+--------+--------------------+--------------------+
|survived|            features|         scaled_feat|
+--------+--------------------+--------------------+
|     0.0|[3.0,22.0,1.0,0.0...|[0.82691281652436...|
|     1.0|[1.0,38.0,1.0,0.0...|[-1.5652278312782...|
|     1.0|[3.0,26.0,0.0,0.0...|[0.82691281652436...|
|     1.0|[1.0,35.0,1.0,0.0...|[-1.5652278312782...|
|     0.0|(8,[0,1,4,6],[3.0...|[0.82691281652436...|
+--------+--------------------+--------------------+
only showing top 5 rows



In [23]:
testFilled.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Nsex           418 non-null float64
Nembarked      418 non-null float64
HotEmbarked    418 non-null object
dtypes: float64(4), int64(4), object(6)
memory usage: 45.8+ KB


In [24]:
df_test_scaled.show(5)

+--------------------+--------------------+
|            features|         scaled_feat|
+--------------------+--------------------+
|(8,[0,1,4],[3.0,3...|[0.87243644459912...|
|[3.0,47.0,1.0,0.0...|[0.87243644459912...|
|(8,[0,1,4],[2.0,6...|[-0.3154411900667...|
|(8,[0,1,4,6],[3.0...|[0.87243644459912...|
|[3.0,22.0,1.0,1.0...|[0.87243644459912...|
+--------------------+--------------------+
only showing top 5 rows



### Step 5
- Instead of doing transformations on separate steps, put everything together with a Pipeline

In [None]:
for (IN, OUT) in [('Sex', 'Nsex'), ('Embarked', 'Nembarked')]:
    indexer = StringIndexer().setInputCol(IN).setOutputCol(OUT)
    testFilled = indexer.fit(testFilled).transform(testFilled)
    
encoder = OneHotEncoder().setInputCol('Nembarked').setOutputCol('HotEmbarked')
testFilled = encoder.transform(testFilled)

assembler = VectorAssembler().setInputCols(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Nsex', 'HotEmbarked']).setOutputCol('features')
df_train_features = assembler.transform(trainFilled)

assembler = VectorAssembler().setInputCols(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Nsex', 'HotEmbarked']).setOutputCol('features')
df_test_features = assembler.transform(testFilled)
df_test_features.show(5)

scaler = StandardScaler().setInputCol('features').setOutputCol('scaled_feat').setWithStd(True).setWithMean(True)
df_train_scaled = scaler.fit(df_train_features.select('survived', 'features')) \
                        .transform(df_train_features.select('survived', 'features'))
    
df_test_scaled = scaler.fit(df_test_features.select('features')) \
                        .transform(df_test_features.select('features'))

In [33]:
from pyspark.ml.pipeline import Pipeline

indexer1 = StringIndexer().setInputCol('Sex').setOutputCol('Nsex')
indexer2 = StringIndexer().setInputCol('Embarked').setOutputCol('Nembarked')
    
encoder = OneHotEncoder().setInputCol('Nembarked').setOutputCol('HotEmbarked')

assembler = VectorAssembler().setInputCols(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Nsex', 'HotEmbarked']).setOutputCol('features')

scaler = StandardScaler().setInputCol('features').setOutputCol('scaled_feat').setWithStd(True).setWithMean(True)

pipeline = Pipeline().setStages([indexer1, indexer2, encoder, assembler, scaler])

preprocessing_model = pipeline.fit(trainFilled)
preprocessed_train = preprocessing_model.transform(trainFilled)
preprocessed_test = preprocessing_model.transform(testFilled)

In [67]:
preprocessed_train.show(3)
preprocessed_test.show(3)

subset_train = preprocessed_train.select('Survived', 'scaled_feat')
subset_test = preprocessed_test.select('scaled_feat')

subset_train.show(3)
subset_test.show(3)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----+---------+-------------+--------------------+--------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Nsex|Nembarked|  HotEmbarked|            features|         scaled_feat|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+----+---------+-------------+--------------------+--------------------+
|          1|     0.0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S| 0.0|      0.0|(2,[0],[1.0])|[3.0,22.0,1.0,0.0...|[0.82691281652436...|
|          2|     1.0|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C| 1.0|      1.0|(2,[1],[1.0])|[1.0,38.0,1.0,0.0...|[-1.5652278312782...|
|          3|     1.0|     3|Heikkinen, Miss. ...|female|26.0|    0|  

### Step 6
- Train a classifier of your choice (for instance, Random Forest) using your dataset of LabeledPoints
- Make predictions for the training data
- Use the evaluators to find the Area Under ROC and Accuracy of your model
- How is your model performing? Try to tune its parameters

In [88]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

rfc = RandomForestClassifier().setLabelCol('Survived').setFeaturesCol('scaled_feat') \
                            .setNumTrees(13).setMaxDepth(7)
model_rfc = rfc.fit(subset_train)

In [89]:
model_rfc.featureImportances

SparseVector(8, {0: 0.149, 1: 0.1326, 2: 0.0442, 3: 0.0465, 4: 0.1742, 5: 0.4025, 6: 0.0193, 7: 0.0318})

In [90]:
subset_trained = model_rfc.transform(subset_train)
subset_trained.show(3)

+--------+--------------------+--------------------+--------------------+----------+
|Survived|         scaled_feat|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------------------+----------+
|     0.0|[0.82691281652436...|[11.6144769351195...|[0.89342130270150...|       0.0|
|     1.0|[-1.5652278312782...|[0.41176470588235...|[0.03167420814479...|       1.0|
|     1.0|[0.82691281652436...|[5.40697527641242...|[0.41592117510864...|       1.0|
+--------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [91]:
evaluator_roc = BinaryClassificationEvaluator().setLabelCol('Survived')\
    .setRawPredictionCol('prediction').setMetricName('areaUnderROC')

evaluator_accuracy = MulticlassClassificationEvaluator().setLabelCol('Survived')\
    .setPredictionCol('prediction').setMetricName('accuracy')
    
roc = evaluator_roc.evaluate(subset_trained)
accuracy = evaluator_accuracy.evaluate(subset_trained)
print(roc, accuracy)

0.8672227015626498 0.8866442199775533


### Step 7
- Take a look at the test data - use DataFrame's ***createOrReplaceTempView*** method to perform SQL queries over the data
    - hint: check if there are any NULL values in the dataset - if so, handle them
- Apply the transformations to the test data
    - hint: include the model to the pipeline
- Make predictions using the model previously trained and the transformed test data

In [None]:
### INSERT YOUR CODE HERE

### Step 8

- Load the answers for the ***test*** data
- Combine it with your predictions into a single DataFrame
- Use the evaluator you created on ***Step 6***
- What was your score?

In [None]:
### INSERT YOUR CODE HERE