# Predicting survivors of the Titanic disaster

In [1]:
var sc = require('skale-engine').context();
var plot = require('plotter').plot;

var CSVDataFrame = require('./CSVDataFrame.js');
var StandardScaler = require('skale-ml').StandardScaler;
var LogisticRegressionWithSGD = require('skale-ml').LogisticRegressionWithSGD;
var BinaryClassificationMetrics = require('skale-ml').BinaryClassificationMetrics;

undefined

## Loading the training data

In [2]:
// Column names of the CSV file, in file order.
var columns = [
    'PassengerId',
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked'
];

// Build a data frame over train.csv on the skale context `sc`.
// NOTE(review): ',' is presumably the field separator and '?' the marker for
// missing values — confirm against CSVDataFrame.js.
var df = new CSVDataFrame(sc, columns, 'train.csv', ',', '?');

undefined

## Printing inferred schema

In [3]:
// Print the schema inferred for each column (index, name, and whether the
// column is real-valued or categorical).
// NOTE(review): $$async$$ / $$done$$ appear to be the notebook kernel's hooks
// for asynchronous cells; showSchema is handed $$done$$ as its completion
// callback — confirm against the kernel documentation.
$$async$$ = df.showSchema($$done$$);

0:PassengerId real-valued
1:Survived real-valued
2:Pclass real-valued
3:Name categorical
4:Sex categorical
5:Age real-valued
6:SibSp real-valued
7:Parch real-valued
8:Ticket categorical
9:Fare real-valued
10:Cabin categorical
11:Embarked categorical


null

## Dropping columns and removing people with age unknown

In [3]:
// Drop the five listed columns, then keep only the rows whose Age is not the
// empty string. `df` is re-declared here, so every later cell sees the
// reduced data frame rather than the full one.
//
// Alternative kept for reference: list the columns to keep instead of the
// columns to drop (presumably equivalent apart from column order — TODO confirm).
// var df = df.select(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']).where({"Age": {$neq: ''}});
var df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']).where({"Age": {$neq: ''}});

undefined

## Encoding the categorical features

In [4]:
// Produce `edf`, the data frame with its categorical features number-encoded.
// NOTE(review): presumably each distinct categorical value (e.g. in Sex) is
// mapped to a numeric code — confirm in CSVDataFrame.js.
var edf = df.number_encode_features();

undefined

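NB: some people still have an age equal to zero; let's filter them. (Note: the next cell only builds the labeled points from the encoded data; no such filter is applied there.)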

In [5]:
// Convert the encoded data frame into a labeled-point dataset with "Survived"
// as the label column. Later cells read each element as a pair: index 0 is
// the label, index 1 the features.
// NOTE(review): ["*"] presumably selects every remaining column as a feature —
// confirm in CSVDataFrame.js.
var training_set = edf.toLabeledPoint("Survived", ["*"]);

undefined

## Scaling features to zero mean and unit variance

First we calculate the parameters of our scaler.

In [6]:
// Create the scaler and fit it on the features alone: each element of
// training_set is a [label, features] pair, so index 1 picks the features.
var scaler = new StandardScaler();
$$async$$ = scaler.fit(
    training_set.map(point => point[1]),
    $$done$$
);

undefined

Then we apply scaler to our label/features dataset.

In [7]:
// Standardize the features of every labeled point while leaving the label
// (index 0) untouched. The fitted scaler is handed to map() as its second
// argument and received as the mapper's second parameter instead of being
// captured from the enclosing scope.
var training_set_std = training_set
    .map((point, fitted) => [point[0], fitted.transform(point[1])], scaler)
    .persist();

undefined

## Training logistic regression

In [8]:
// Training settings: number of iterations and the model parameters.
var nIterations = 10;
var parameters = {
    regParam: 0.01,
    stepSize: 1
};

// Build the model over the standardized training set, then start training;
// $$done$$ is passed to train() as its completion callback.
var model = new LogisticRegressionWithSGD(training_set_std, parameters);
$$async$$ = model.train(nIterations, $$done$$);

null

## Evaluating our model on the training set

In [9]:
// Score every point of the standardized training set: each mapped element is
// a [prediction, label] pair. The trained model is handed to map() as its
// second argument and received as the mapper's second parameter.
var predictionAndLabels = training_set_std.map((p, model) => [model.predict(p[1]), p[0]], model);
var metrics = new BinaryClassificationMetrics(predictionAndLabels);

// Compute the ROC curve and plot it to roc.png next to a diagonal "Random"
// reference line. The cell is completed by calling $$done$$ once the plot is
// finished, or immediately if the ROC computation fails.
$$async$$ = metrics.roc(function(err, roc) {
    // Report a failure instead of silently plotting an empty curve.
    if (err) {
        console.error('ROC computation failed:', err);
        $$done$$();
        return;
    }

    // Each ROC entry holds its curve point at index 1 as an [x, y] pair
    // (presumably [false positive rate, true positive rate] — TODO confirm).
    // Both coordinates are rounded to two decimals, so points sharing the
    // same rounded x overwrite one another.
    // Object.keys visits own entries only, unlike for...in which would also
    // walk inherited enumerable properties.
    var xy = {};
    Object.keys(roc || {}).forEach(function(i) {
        var point = roc[i][1];
        xy[point[0].toFixed(2)] = point[1].toFixed(2);
    });
    // Force the curve to include the origin.
    xy['0.00'] = '0.00';

    var data = {};
    data['regParam: ' + parameters.regParam + ', stepSize: ' + parameters.stepSize] = xy;
    data['Random'] = {0: 0, 1: 1};

    plot({
        title: 'Logistic Regression ROC Curve',
        data: data,
        filename: 'roc.png',
        finish: function() { $$done$$(); }
    });
});


undefined

![alt text](roc.png)