# Predicting adult income with Skale

This notebook demonstrates how to use Skale to predict whether an adult, described by a set of attributes
(education, gender, marital status, etc.), earns more or less than 50K a year.

For this we will use the Adult Data Set MLR file, which contains income data for about 32,000 people.
We will train a logistic regression model on it with the following steps:
- Run an ETL operation on raw data to obtain label and features from it
- Build a machine learning model using the refined data (a logistic regression)
- Evaluate the performance of our model by validating it on a separate test set

In [1]:
var co = require('co');
var sc = require('skale-engine').context();
var Plot = require('plotly-notebook-js');

var CSVDataFrame = require('./CSVDataFrame.js');
var StandardScaler = require('skale-ml').StandardScaler;
var LogisticRegressionWithSGD = require('skale-ml').LogisticRegressionWithSGD;
var BinaryClassificationMetrics = require('skale-ml').BinaryClassificationMetrics;

undefined

# Loading the CSV file

Our data is stored in a CSV file named 'adult.data'. Let's load it and show the first five rows.

In [2]:
// Column names handed to CSVDataFrame; the preview below uses them as the header row.
var columns = [
    'Age',
    'Workclass',
    'fnlwgt',
    'Education',
    'Education-Num',
    'Marital-Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital-Gain',
    'Capital-Loss',
    'Hours-per-week',
    'Country',
    'Target'
];

// NOTE(review): ',' and '?' are presumably the field separator and the
// missing-value marker -- confirm against CSVDataFrame.js.
var df = new CSVDataFrame(sc, columns, 'adult.data', ',', '?');

// Show the first 5 rows as the cell's HTML output. The error argument is not inspected.
$$async$$ = df.printHTML(5, (err, res) => {
    $$html$$ = res;
    $$done$$(res);
});

Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Plotting Age distribution

In [3]:
// Ask the data frame for the distribution plot of the "Age" column and
// publish whatever it returns as the cell's HTML output.
$$async$$ = df.plotDistribution('Age', (err, res) => {
    $$html$$ = res;
    $$done$$(res);
});

# Plotting Workclass distribution

In [4]:
// Same as the Age plot, for the "Workclass" column.
$$async$$ = df.plotDistribution('Workclass', (err, res) => {
    $$html$$ = res;
    $$done$$(res);
});

# Encoding the categorical features

In [5]:
// Encoded version of the data frame: in the preview below the textual
// columns appear as integer codes.
var edf = df.number_encode_features();

// Preview the first 10 encoded rows as HTML.
$$async$$ = edf.printHTML(10, (err, res) => {
    $$html$$ = res;
    $$done$$(res);
});

Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
39,0,77516,0,13,0,0,0,0,0,2174,0,40,0,0
50,1,83311,0,13,1,1,1,0,0,0,0,13,0,0
38,2,215646,1,9,2,2,0,0,0,0,0,40,0,0
53,2,234721,2,7,1,2,1,1,0,0,0,40,0,0
28,2,338409,0,13,1,3,2,1,1,0,0,40,1,0
37,2,284582,3,14,1,1,2,0,1,0,0,40,0,0
49,2,160187,4,5,3,4,0,1,1,0,0,16,2,0
52,1,209642,1,9,1,1,1,0,0,0,0,45,0,1
31,2,45781,3,14,0,3,0,0,1,14084,0,50,0,1
42,2,159449,0,13,1,1,1,0,0,5178,0,40,0,1


# Correlation between Education and Education-Num

In [6]:
// Put the textual and numeric education columns of the raw (non-encoded)
// frame side by side and preview the first 5 rows.
$$async$$ = df
    .select(['Education', 'Education-Num'])
    .printHTML(5, (err, res) => {
        $$html$$ = res;
        $$done$$(res);
    });

Education,Education-Num
Bachelors,13
Bachelors,13
HS-grad,9
11th,7
Bachelors,13


# Delete the Education column (Education-Num carries the same information)

In [7]:
// Rebind edf to the frame without the "Education" column.
edf = edf.drop(['Education']);

// Preview the first 5 remaining rows as HTML.
$$async$$ = edf.printHTML(5, (err, res) => {
    $$html$$ = res;
    $$done$$(res);
});

Age,Workclass,fnlwgt,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
39,0,77516,13,0,0,0,0,0,2174,0,40,0,0
50,1,83311,13,1,1,1,0,0,0,0,13,0,0
38,2,215646,9,2,2,0,0,0,0,0,40,0,0
53,2,234721,7,1,2,1,1,0,0,0,40,0,0
28,2,338409,13,1,3,2,1,1,0,0,40,1,0


# Correlation between Sex and Relationship

In [8]:
// Preview the "Sex" and "Relationship" columns of the raw frame (first 5 rows).
$$async$$ = df
    .select(['Sex', 'Relationship'])
    .printHTML(5, (err, res) => {
        $$html$$ = res;
        $$done$$(res);
    });

Sex,Relationship
Male,Not-in-family
Male,Husband
Male,Not-in-family
Male,Husband
Female,Wife


# Extract a LabeledPoint Dataset from our encoded Data Frame

In [9]:
// Dataset of [label, features] pairs (later cells read p[0] as the label and
// p[1] as the features). "Target" is the label column.
// NOTE(review): ['*'] presumably selects every remaining column as a feature -- confirm in CSVDataFrame.js.
var training_set = edf.toLabeledPoint('Target', ['*']);

undefined

# Scale features to zero-mean, unit variance

In [10]:
// Filled in once the scaler has been fitted; read by the training cell.
var training_set_std;
var scaler = new StandardScaler();

// Fit on the feature part only (index 1 of each pair), then rebuild the
// dataset with transformed features and keep it persisted.
$$async$$ = scaler.fit(
    training_set.map(point => point[1]),
    function() {
        // The scaler is handed to map() as an extra argument and received
        // as the mapper's second parameter.
        training_set_std = training_set
            .map((point, fitted) => [point[0], fitted.transform(point[1])], scaler)
            .persist();
        $$done$$();
    }
);

undefined

# Train logistic regression with SGD on standardized training set

In [11]:
// Training settings; `parameters` is read again when the ROC series is labelled.
var nIterations = 10;
var parameters = {
    regParam: 0.01,
    stepSize: 1
};

var model = new LogisticRegressionWithSGD(training_set_std, parameters);

// $$done$$ is passed straight through as the completion callback, so the
// cell output is whatever train() hands to it.
$$async$$ = model.train(nIterations, $$done$$);

null

# Validate on the test set and generate the ROC curve

In [14]:
// Load the test file with the same column list and CSV options as the training file.
var vdf = new CSVDataFrame(sc, columns, 'adult.test', ',', '?');

// NOTE(review): the test frame is encoded independently of the training
// frame. Confirm that number_encode_features() assigns the same integer to
// the same category in both files; otherwise features and labels will not
// line up with what the model was trained on.
var evdf = vdf.drop(["Education"]).number_encode_features();

// Reuse the scaler fitted on the training set so both sets share one scale.
var test_set_std = evdf
    .toLabeledPoint("Target", ["*"])
    .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler);

// Pairs of [prediction, label], the input expected by BinaryClassificationMetrics here.
var predictionAndLabels = test_set_std.map((p, model) => [model.predict(p[1]), p[0]], model);
var metrics = new BinaryClassificationMetrics(predictionAndLabels);

// Filled in by the callback below and consumed by the plotting cell.
var roc;
$$async$$ = metrics.roc(function(err, result) {
    // Always overwrite roc so a failed re-run cannot leave a stale curve behind.
    roc = result;
    if (err) {
        // Report the failure as the cell result instead of finishing
        // silently with roc undefined.
        $$done$$(err);
        return;
    }
    $$done$$();
});

undefined

In [15]:
// Two traces: the model's ROC curve (filled below) and the diagonal of a
// random classifier for reference.
var series = [{
        x: [], y: [],
        type: 'scatter',
        name: `regParam = ${parameters.regParam}, stepSize = ${parameters.stepSize}`
    }, {x: [0, 1], y: [0, 1], name: 'Random'}];

// Each roc entry is read as [_, [x, y]]. Iterating over own keys avoids
// for...in picking up inherited enumerable properties and leaking a global
// loop variable. `roc || {}` keeps the cell a no-op when roc was never set.
Object.keys(roc || {}).forEach(function (key) {
    var rates = roc[key][1];
    series[0].x.push(rates[0].toFixed(2));
    series[0].y.push(rates[1].toFixed(2));
});
// Close the curve at the origin.
series[0].x.push(0);
series[0].y.push(0);

$$html$$ = Plot.createPlot(series, {
    title: 'Receiver Operating characteristic (ROC)',
    // False positive rate equals 1 - specificity, not specificity itself.
    xaxis: {title: 'False Positive Rate (1 - Specificity)'},
    yaxis: {title: 'True Positive Rate (Sensitivity)'},
    autosize: false,
    width: 800,
    height: 500
}).render();