# Example for toy data

## Create toy data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global RNG so the np.random.rand / np.random.exponential draws
# in the cells below produce the same toy data on every run.
np.random.seed(12345)

In [3]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    The value is computed as ``exp(-logaddexp(0, -x))``. This is algebraically
    the same expression, but it never forms ``exp(-x)`` on its own, so large
    negative inputs do not overflow (and do not trigger a NumPy overflow
    warning); they simply evaluate to 0.0.

    Parameters
    ----------
    x : scalar or numpy.ndarray (anything supporting unary minus)
        Input value(s).

    Returns
    -------
    NumPy scalar or array of the same shape as ``x``
        Element-wise sigmoid, a value in the closed interval [0, 1].
    """
    # log(1 + exp(-x)) == logaddexp(0, -x), evaluated without overflow.
    return np.exp(-np.logaddexp(0.0, -x))

In [4]:
# Toy feature table: 500 rows, five columns 'a'..'e', each entry drawn by
# np.random.rand (uniform on [0, 1)).
df = pd.DataFrame(
    data=np.random.rand(500, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [5]:
# Ground-truth parameters of the toy model: one weight row (and one bias entry)
# per linear margin computed in the next cell. Row/entry 0 feeds the sigmoid,
# row/entry 1 feeds the exponential.
W = np.array([
    [1, -2, 3, -2, 1],
    [-2, 1, 3, -2, 1],
])
b = np.array([0.2, -0.1])

In [6]:
# Linear margins X @ W.T + b; the result has two columns, labelled 0 and 1.
# The five feature columns are selected explicitly because a later cell adds
# `time` and `label` to `df` in place; multiplying the whole frame by W.T
# would then fail with a shape mismatch if this cell were re-run.
margins = df[list('abcde')].dot(W.transpose()).add(b)
# p: sigmoid of margin 0 (used as a probability when the label is built).
p = margins[0].map(sigmoid)
# l: exp of margin 1 (used as the scale of the exponential draw for `time`).
l = np.exp(margins[1])

In [7]:
# time: for every row, one draw from an exponential distribution whose scale
# is that row's value of `l`, clipped into [0, 1].
df['time'] = l.map(
    lambda scale: np.clip(np.random.exponential(scale, 1)[0], 0, 1)
)
# label: round(p * (1 - exp(-l))) per row; columns 0 and 1 of the concatenated
# frame are `p` and `l` respectively.
df['label'] = pd.concat([p, l], axis=1).apply(
    lambda row: np.round(row[0] * (1. - np.exp(-row[1]))),
    axis=1,
)

In [8]:
# Persist the toy frame so the Spark section below can read it back.
# NOTE(review): the schema printed after reading the file shows an extra
# `__index_level_0__` column next to the data columns, presumably the pandas
# index written along with the data; confirm whether that is intended.
df.to_parquet('toy.parquet')

## Train & Test

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from dfm.classification import DelayedFeedbackClassifier

In [10]:
# Load the parquet file written in the first section as a Spark DataFrame.
# NOTE(review): `sqlContext` is not defined in the visible cells; presumably it
# is provided by the PySpark kernel/shell. TODO confirm.
raw = sqlContext.read.parquet('toy.parquet')

In [11]:
# Print the column names and types Spark inferred from the parquet file.
raw.printSchema()

root
 |-- a: double (nullable = true)
 |-- b: double (nullable = true)
 |-- c: double (nullable = true)
 |-- d: double (nullable = true)
 |-- e: double (nullable = true)
 |-- time: double (nullable = true)
 |-- label: double (nullable = true)
 |-- __index_level_0__: long (nullable = true)



In [12]:
# Random train/test split with weights 0.9 / 0.1; the fixed seed keeps the
# split repeatable across runs.
train, test = raw.randomSplit([0.9, 0.1], seed=12345)

In [13]:
# R-style model formula: `label` is the response, `a`..`e` are the predictors.
formula = RFormula(formula='label ~ a + b + c + d + e')
# Classifier from the project's `dfm` package. `timeCol='time'` points it at
# the `time` column generated in the first section.
# NOTE(review): regParam is presumably a regularization strength; confirm
# against the dfm documentation.
dfc = DelayedFeedbackClassifier(timeCol='time', regParam=0.01)

In [14]:
# Chain the formula stage and the classifier into one pipeline, then fit the
# whole pipeline on the training split.
pipeline = Pipeline(stages=[formula, dfc])
model = pipeline.fit(train)

In [15]:
# Apply the fitted pipeline to the held-out split.
predictions = model.transform(test)

In [16]:
# Named `evaluator` rather than `eval` so the built-in eval() is not shadowed
# for the rest of the kernel session.
# NOTE(review): no metric is specified, so the evaluator's default is used;
# presumably area under the ROC curve, confirm against the PySpark docs.
evaluator = BinaryClassificationEvaluator()
# Last expression of the cell: the score on the test predictions is displayed.
evaluator.evaluate(predictions)

0.9779310344827586