In [12]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import argparse
import os
from matplotlib import pyplot as plt

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import log_loss
from decentralized_SGD_logistic import DecentralizedSGDLogistic
from sklearn.linear_model import SGDClassifier
from scipy.special import expit as sigmoid

from helpers import plot_losses

## RCV1 dataset from the ChocoSGD paper

In [13]:
def run_logistic(A, y, param,):
    """Fit a DecentralizedSGDLogistic model and print its score on the same data.

    Args:
        A: feature matrix handed to ``fit`` and ``score``.
        y: labels handed to ``fit`` and ``score``.
        param: configuration object passed positionally to the
            ``DecentralizedSGDLogistic`` constructor.

    Returns:
        A tuple ``(res, m)``: the value returned by ``fit`` and the fitted model.
    """
    m = DecentralizedSGDLogistic(param)
    res = m.fit(A, y)
    # Both fields are numbered explicitly: mixing an automatic '{}' with a
    # manual '{1}' makes str.format raise ValueError.
    print('{0} - score: {1:.4f}'.format(param, m.score(A, y)))
    return res, m

### Load dataset

In [3]:
# Location of the bz2-compressed RCV1 file, relative to the notebook.
dataset_path = os.path.expanduser('../ChocoSGD/data/rcv1_test.binary.bz2')
print('Loading dataset...')
A, y = load_svmlight_file(dataset_path)

# Keep only the first 100000 samples and recode the labels to {0, 1}
# (strictly positive labels become 1, everything else 0).
n_kept = 100000
A_p, y_p = A[:n_kept], y[:n_kept]
y_p = (y_p > 0) * 1

Loading dataset...


### Compute the baseline with SGD

In [5]:
# Reference solution: L2-regularised logistic regression fitted with
# scikit-learn's SGD, with alpha = 1 / number of samples.
# NOTE: scikit-learn >= 1.1 names this loss 'log_loss'; the 'log' alias was
# removed in 1.3. Update the string if running on a recent version.
alpha = 1 / A_p.shape[0]
clf = SGDClassifier(loss='log', penalty='l2', alpha=alpha)
clf.fit(A_p, y_p)
print(clf.score(A_p, y_p))
x_predict = clf.predict(A_p)

# Objective value at the baseline solution. log_loss expects
# (y_true, y_pred) with y_pred given as probabilities, and the L2 penalty is
# taken on the fitted weights (not on the predictions). scikit-learn's L2
# term is alpha * 0.5 * ||w||^2.
# NOTE(review): confirm DecentralizedSGDLogistic uses the same 1/2 convention
# before comparing losses against this value.
proba_positive = clf.predict_proba(A_p)[:, 1]
exact_optimum = log_loss(y_p, proba_positive) + 0.5 * alpha * np.sum(clf.coef_ ** 2)



0.97885


### Compute with Decentralized SGD

In [None]:
# Regularisation strength: one over the number of samples in the subset.
reg = 1 / A_p.shape[0]
n_features = A_p.shape[1]

# NOTE(review): `Parameters` is not imported or defined in the visible part of
# this notebook — confirm where it comes from, otherwise this cell raises
# NameError on a fresh kernel.
# NOTE(review): the variable is called "disconnected" but topology='complete'
# is what is passed here — confirm which one is intended.
params_disconnected = Parameters(num_epoch=5, lr_type='bottou',
                           initial_lr=10, tau=n_features, regularizer=reg,
                           quantization_method='full', n_machines=128,
                           communication_method='plain',topology='complete',
                           data_distribution_random_seed=2,
                           data_distribution_strategy='naive')

# run_logistic returns the result of fit() and the fitted model.
res_disconnected, dec_log = run_logistic(A_p, y_p, params_disconnected)

# The block below is a disabled experiment kept as a bare string literal; none
# of the code inside it is executed.
"""params_ring = dict(name="chocosgd-centralized", num_epoch=5, lr_type='bottou',
                           initial_lr=0.2, tau=n_features, regularizer=reg,
                           quantization='full', n_cores=10,
                           method='plain', topology='centralized', estimate='final',
                           split_data_random_seed=2, distribute_data=True,
                           split_data_strategy='naive')

res_centralized = run_logistic(A_p, y_p, params_ring)"""


## Higgs Boson Dataset

In [3]:
def run_logistic(A, y, param, logging=False):
    """Train a DecentralizedSGDLogistic model and return what ``fit`` produced.

    Args:
        A: feature matrix handed to ``fit`` (and to ``score`` when logging).
        y: labels handed to ``fit`` (and to ``score`` when logging).
        param: mapping of keyword arguments unpacked into the
            ``DecentralizedSGDLogistic`` constructor.
        logging: forwarded to ``fit``; when truthy, a blank line followed by
            the parameters and the score on ``(A, y)`` is printed.

    Returns:
        The value returned by ``fit``.
    """
    model = DecentralizedSGDLogistic(**param)
    losses = model.fit(A, y, logging=logging)
    if not logging:
        return losses

    # Blank line first, then the one-line summary.
    print()
    summary = '{0} - score: {1:.4f}'.format(param, model.score(A, y))
    print(summary)
    return losses

In [4]:
def load_csv_data(data_path):
    """Read a comma-separated file and return ``(labels, features)``.

    The first row is skipped as a header. Column 1 is read as the string
    class label; every column from index 2 onwards is returned as features.

    Args:
        data_path: anything ``np.genfromtxt`` accepts as its input.

    Returns:
        A tuple ``(yb, input_data)`` where ``yb`` is a float array holding
        -1 where the label is ``'b'`` and 1 elsewhere, and ``input_data`` is
        the 2-D float array of feature columns.
    """
    labels = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    table = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    features = table[:, 2:]

    # Map the string labels onto the binary values (-1, 1).
    yb = np.where(labels == 'b', -1.0, 1.0)
    return yb, features

In [5]:
def clean(input_data, mean=False, missing_value=-999.0):
    """Replace the missing-value sentinel in every column, modifying the input.

    Args:
        input_data: 2-D array of features. Columns are written to through
            ``input_data[:, i]``; for a NumPy array that slice is a view, so
            the caller's array is modified.
        mean: if true, fill with the mean of the column's observed values;
            otherwise fill with the most frequent observed value (the
            smallest one when several are tied).
        missing_value: sentinel marking a missing entry (default -999.0).

    Returns:
        The same ``input_data`` object that was passed in.
    """
    for i in range(input_data.shape[1]):
        current_col = input_data[:, i]
        missing = current_col == missing_value
        if not missing.any():
            continue

        observed = current_col[~missing]
        if observed.size == 0:
            # Whole column is missing: there is no statistic to compute, so
            # fall back to 0 instead of raising (mode) or producing NaN (mean).
            fill_value = 0
        elif mean:
            fill_value = np.mean(observed)
        else:
            values, counts = np.unique(observed, return_counts=True)
            fill_value = values[np.argmax(counts)]
        current_col[missing] = fill_value

    return input_data

In [6]:
def standardize(x):
    """Standardize the given data along axis 0.

    Each column has its mean subtracted and is divided by its standard
    deviation. A column with zero standard deviation (constant values) is
    only centred, so it becomes all zeros instead of NaN/inf from a division
    by zero.

    Args:
        x: array of samples; statistics are taken over axis 0.

    Returns:
        A new array of the same shape holding the standardized values.
    """
    means = x.mean(0)
    stds = x.std(0)
    # Divide constant columns by 1 rather than by 0.
    safe_stds = np.where(stds == 0, 1.0, stds)
    return (x - means) / safe_stds

### Load dataset

In [7]:
# Read the labels and the raw feature columns from the CSV file.
y, A = load_csv_data('train.csv')

# Fill the -999 entries with the column mean, then standardize along axis 0.
A_imputed = clean(A, mean=True)
A = standardize(A_imputed)

# Recode the labels to {0, 1}: strictly positive becomes 1, the rest 0.
y = (y > 0.0) * 1

### Compute the baseline with SGD classifier 

In [8]:
# Fit a SGD
# Baseline: L2-penalised logistic regression trained with scikit-learn's SGD,
# with alpha set to one over the number of samples.
n_samples = A.shape[0]
clf = SGDClassifier(loss='log', penalty='l2', alpha=1 / n_samples, tol=1e-4, n_jobs=1)
clf.fit(A, y)

accuracy = clf.score(A, y)
# Second column of predict_proba, one probability per sample.
optimum_x = clf.predict_proba(A)[:, 1]

# Average negative log-likelihood of the labels under those probabilities;
# kept as the reference loss for the plots.
log_likelihood = y.T.dot(np.log(optimum_x)) + (1 - y).T.dot(np.log(1 - optimum_x))
optimum_loss = -log_likelihood / n_samples

print('Accuracy:', accuracy)
print('Final loss:', optimum_loss)

Accuracy: 0.741736
Final loss: 0.5129295355244131


### Computing with Decentralized SGD

In [None]:
# Problem-dependent settings: regularisation of one over the sample count,
# tau equal to the feature count, and the interval between loss computations.
n_samples_higgs, n_features = A.shape[0], A.shape[1]
reg = 1 / n_samples_higgs
compute_loss_every = 30

# Keyword arguments for DecentralizedSGDLogistic (unpacked by run_logistic).
params_disconnected = {
    'num_epoch': 3,
    'lr_type': 'bottou',
    'tol': 1e-10,
    'initial_lr': 0.05,
    'tau': n_features,
    'regularizer': reg,
    'quantization_method': 'full',
    'n_machines': 512,
    'communication_method': 'plain',
    'topology': 'disconnected',
    'data_distribution_random_seed': 2,
    'data_distribution_strategy': 'naive',
    'compute_loss_every': compute_loss_every,
}

losses_disconnected = run_logistic(A, y, params_disconnected, logging=True)