In [13]:
global.sc = require('skale-engine').context();
global.Plot = require('plotly-notebook-js');

global.CSVDataFrame = require('./CSVDataFrame.js');
global.StandardScaler = require('skale-ml').StandardScaler;
global.LogisticRegressionWithSGD = require('skale-ml').LogisticRegressionWithSGD;
global.BinaryClassificationMetrics = require('skale-ml').BinaryClassificationMetrics;

In [14]:
global.columns = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital-Status",
    "Occupation", "Relationship", "Race", "Sex", "Capital-Gain", "Capital-Loss",
    "Hours-per-week", "Country", "Target"
];
global.df = new CSVDataFrame(sc, columns, 'adult.data', ',', '?');
$$html$$ = yield df.printHTML(5);

Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [15]:
$$html$$ = yield df.plotDistribution("Marital-Status");

In [16]:
global.edf = df.number_encode_features();
$$html$$ = yield edf.printHTML(10);

Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
39,0,77516,0,13,0,0,0,0,0,2174,0,40,0,0
50,1,83311,0,13,1,1,1,0,0,0,0,13,0,0
38,2,215646,1,9,2,2,0,0,0,0,0,40,0,0
53,2,234721,2,7,1,2,1,1,0,0,0,40,0,0
28,2,338409,0,13,1,3,2,1,1,0,0,40,1,0
37,2,284582,3,14,1,1,2,0,1,0,0,40,0,0
49,2,160187,4,5,3,4,0,1,1,0,0,16,2,0
52,1,209642,1,9,1,1,1,0,0,0,0,45,0,1
31,2,45781,3,14,0,3,0,0,1,14084,0,50,0,1
42,2,159449,0,13,1,1,1,0,0,5178,0,40,0,1


In [17]:
$$html$$ = yield df.select(["Education", "Education-Num"]).printHTML(5);

Education,Education-Num
Bachelors,13
Bachelors,13
HS-grad,9
11th,7
Bachelors,13


In [18]:
edf = edf.drop(["Education"]);
$$html$$ = yield edf.printHTML(5);

Age,Workclass,fnlwgt,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
39,0,77516,13,0,0,0,0,0,2174,0,40,0,0
50,1,83311,13,1,1,1,0,0,0,0,13,0,0
38,2,215646,9,2,2,0,0,0,0,0,40,0,0
53,2,234721,7,1,2,1,1,0,0,0,40,0,0
28,2,338409,13,1,3,2,1,1,0,0,40,1,0


In [19]:
$$html$$ = yield df.select(["Sex", "Relationship"]).printHTML(5);

Sex,Relationship
Male,Not-in-family
Male,Husband
Male,Not-in-family
Male,Husband
Female,Wife


In [20]:
// Turn the encoded frame into labeled points with "Target" as the label.
// NOTE(review): ["*"] presumably selects every remaining column as a feature;
// confirm against CSVDataFrame.js.
global.training_set = edf.toLabeledPoint(
    "Target",
    ["*"]
);

In [21]:
global.scaler = new StandardScaler();
yield scaler.fit(training_set.map(p => p[1]));
global.training_set_std = training_set
    .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler)
    .persist();

In [22]:
global.nIterations = 10;
global.parameters = {regParam: 0.01, stepSize: 1};
global.model = new LogisticRegressionWithSGD(training_set_std, parameters);

yield model.train(nIterations);

In [23]:
global.vdf = new CSVDataFrame(sc, columns, 'adult.test', ',', '?');
global.evdf = vdf.drop(["Education"]).number_encode_features();
global.test_set_std = evdf
    .toLabeledPoint("Target", ["*"])
    .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler);

global.predictionAndLabels = test_set_std.map((p, model) => [model.predict(p[1]), p[0]], model);
global.metrics = new BinaryClassificationMetrics(predictionAndLabels);

global.roc = yield metrics.roc();

In [24]:
// Plot the model's ROC curve together with the chance diagonal.
// series[0] is the model curve (filled in below); series[1] is the straight
// line from (0, 0) to (1, 1) labelled 'Random'.
global.series = [{
    x: [],
    y: [],
    type: 'scatter',
    name: `regParam = ${parameters.regParam}, stepSize = ${parameters.stepSize}`
}, {
    x: [0, 1],
    y: [0, 1],
    name: 'Random'
}];

// Each roc entry is read as [<unused>, [x, y]], where x is plotted as the
// false positive rate and y as the true positive rate.
// NOTE(review): assumes roc is an array; confirm against
// BinaryClassificationMetrics.roc().
for (const entry of roc) {
    const rates = entry[1];
    // Values are rounded to two decimals; toFixed yields strings.
    series[0].x.push(rates[0].toFixed(2));
    series[0].y.push(rates[1].toFixed(2));
}

// Append the origin so the curve ends at (0, 0).
series[0].x.push(0);
series[0].y.push(0);

// The false positive rate equals 1 - specificity, not specificity itself.
$$html$$ = Plot.createPlot(series, {
    title: 'Receiver Operating Characteristic (ROC)',
    xaxis: {title: 'False Positive Rate (1 - Specificity)'},
    yaxis: {title: 'True Positive Rate (Sensitivity)'},
    autosize: false,
    width: 800,
    height: 500
}).render();