## Demo for Strata Conference: Running Logistic Regression with Morpheus on the Walmart 3-table dataset

### Load requisite libraries, source files, and base table data files.

In [1]:
import numpy as np
from scipy.io import mmread
from scipy.sparse import hstack
import time
import morpheus.normalized_matrix as nm
from morpheus.algorithms.logistic_regression import NormalizedLogisticRegression as LogReg

# Base table S: integer values read from text (first line skipped as a header),
# wrapped in np.matrix and transposed.
s = np.matrix(np.genfromtxt('examples/data/Walmart/MLSraw.txt', dtype=int, skip_header=1)).T

# First joined table: its foreign-key column and its sparse matrix, the latter
# read with scipy's Matrix Market reader.
join_set1 = np.genfromtxt('examples/data/Walmart/MLFK1.csv', dtype=int, skip_header=1)
r1 = mmread('examples/data/Walmart/MLR1Sparse.txt')

# Second joined table, loaded the same way.
join_set2 = np.genfromtxt('examples/data/Walmart/MLFK2.csv', dtype=int, skip_header=1)
r2 = mmread('examples/data/Walmart/MLR2Sparse.txt')

# Shift every key down by one; the shifted arrays are later used as row indices
# into r1 / r2 (presumably the files store 1-based keys -- confirm).
k = [fk - 1 for fk in (join_set1, join_set2)]

# Target values, loaded and shaped the same way as s.
Y = np.matrix(np.genfromtxt('examples/data/Walmart/MLY.csv', dtype=int, skip_header=1)).T

print("Loaded data")

Loaded data


### Create regular single table (Materialized) matrix.

In [2]:
# Build the regular single-table (Materialized) matrix and time it.
# Rows of r1 and r2 are gathered through the index arrays k[0] and k[1], then
# placed next to s with a sparse horizontal stack.
start = time.time()
r1_rows = r1.tocsr()[k[0]]
r2_rows = r2.tocsr()[k[1]]
T = hstack((s, r1_rows, r2_rows))
elapsed = time.time() - start
print(" ".join(["Created Materialized matrix.", str(elapsed), "seconds"]))

Created Materialized matrix. 0.328175067902 seconds


### Create Normalized Matrix for Morpheus.

In [3]:
# Wrap the base table, the two joined tables and the key index arrays in a
# Morpheus NormalizedMatrix, and report how long construction takes.
start = time.time()
normalized_matrix = nm.NormalizedMatrix(s, [r1, r2], k)
elapsed = time.time() - start
print(" ".join(["Created Normalized matrix.", str(elapsed), "seconds"]))

Created Normalized matrix. 0.000217914581299 seconds


### Initialize Logistic Regression weights and hyper-parameters.

In [4]:
# Seed NumPy's global RNG so the random starting weights -- and therefore the
# fitted weights printed later in the notebook -- are identical on every
# Restart & Run All.
np.random.seed(0)

# One starting weight per column of the materialized matrix T, drawn from a
# standard normal distribution and stored as a column matrix.
w_init = np.matrix(np.random.randn(T.shape[1], 1))

# Separate copy for the Morpheus run, so both fits start from the same values
# even if fit() were to modify its w_init argument (not verifiable from here).
w_init2 = np.matrix(w_init, copy=True)

# NOTE(review): gamma and iterations are not referenced by any other visible
# cell; LogReg() is constructed without arguments, so these values appear to be
# unused -- confirm whether they should be passed to the estimator.
gamma = 0.001
iterations = 20

### Run Logistic Regression with regular Materialized matrix.

In [5]:
# Baseline run: fit logistic regression on the materialized matrix T, starting
# from w_init, and record the wall-clock time in timemat.
# NOTE(review): LogReg() is built with no arguments, so the gamma / iterations
# values defined earlier are not applied here -- confirm this is intended.
print("Materialized execution started")
logregMat = LogReg()
fit_start = time.time()
logregMat.fit(T, Y, w_init=w_init)
timemat = time.time() - fit_start
print(" ".join(["Materialized execution done.", str(timemat), "seconds"]))

Materialized execution started
Materialized execution done. 0.891387939453 seconds


### Run Logistic Regression on Morpheus with Normalized matrix.

In [6]:
# Morpheus run: fit the same estimator class on the normalized matrix, starting
# from w_init2 (the copy of the baseline's starting weights), then report the
# wall-clock time and its ratio to the materialized run's time.
print("Morpheus execution started")
logregMorph = LogReg()
fit_start = time.time()
logregMorph.fit(normalized_matrix, Y, w_init=w_init2)
timemorph = time.time() - fit_start
print(" ".join(["Morpheus execution done.", str(timemorph), "seconds"]))
speedup = timemat / timemorph
print(" ".join(["Runtime Speedup of Morpheus over Materialized:", str(speedup)]))

Morpheus execution started
Morpheus execution done. 0.295523881912 seconds
Runtime Speedup of Morpheus over Materialized: 3.01629747716


### Check if both approaches return the same weights.

In [7]:
# Eyeball check: show the weights at indices 1 through 5 of each fitted model,
# transposed so each model prints on a single row (materialized first).
for fitted in (logregMat, logregMorph):
    print(fitted.w[1:6].T)

[[-1.1686932   1.69645199 -0.04188097  0.24049617  0.3112689 ]]
[[-1.1686932   1.69645199 -0.04188097  0.24049617  0.3112689 ]]


In [8]:
# Numeric check: the two weight vectors count as equal when the norm of their
# difference is strictly below result_eps.
result_eps = 1e-6
weight_gap = np.linalg.norm(logregMat.w - logregMorph.w)
# Keep the comparison as "<" so that a NaN gap is reported as a failure.
weights_match = weight_gap < result_eps
verdict = ("Success: Both approches return same weights."
           if weights_match
           else "Failed: The approches return different weights.")
print(verdict)

Success: Both approches return same weights.
