# Simple data transformation

In [1]:
def print_table(data):
    """Render a list of dict records as an HTML table in the notebook.

    The header row is built from the keys of the first record; every record
    then contributes one row built from its own values, in the record's
    iteration order.

    ``HTML`` and ``display`` (from ``IPython.display``) are looked up in the
    notebook namespace at call time, so they must be imported before the
    first call.

    Args:
        data: list of dicts to show. An empty list renders an empty table
            instead of failing on ``data[0]``.
    """
    from html import escape

    def cells(values, template):
        # Escape every value so characters such as "<" or "&" are shown
        # literally rather than being interpreted as markup.
        return ''.join(template.format(escape(str(v), quote=False)) for v in values)

    head = ''
    if len(data) > 0:
        # Wrap the header cells in their own row.
        head = '<tr>{}</tr>'.format(cells(data[0].keys(), '<td><b>{}</b></td>'))
    rows = ''.join('<tr>{}</tr>'.format(cells(row.values(), '<td>{}</td>')) for row in data)

    display(HTML('<table>{}</table>'.format(head + rows)))

In [2]:
from IPython.display import HTML, display

# Sample records: one dict per entry, with the date and the amount both kept as strings.
inp = [
    {"date": day, "amount": amount}
    for day, amount in (
        ("2017-10-26", "104.52"),
        ("2017-10-22", "24.52"),
        ("2017-10-21", "0"),
        ("2017-10-10", "11.11"),
        ("2017-09-23", "1078.12"),
        ("2017-09-01", "5.9"),
        ("2017-08-10", "26.12"),
        ("2017-08-01", "19.54"),
        ("2017-07-11", "66.06"),
    )
]

print_table(inp)

0,1
2017-10-26,104.52
2017-10-22,24.52
2017-10-21,0.0
2017-10-10,11.11
2017-09-23,1078.12
2017-09-01,5.9
2017-08-10,26.12
2017-08-01,19.54
2017-07-11,66.06


In [3]:
from datarefinery.tuple.TupleOperations import substitution, compose, wrap, append
from datarefinery.FieldOperations import type_enforcer, date_parser, match_dict, column_category
from datarefinery.Tr import Tr

# Two-letter weekday labels keyed by the index that day_of_week() returns (0 is Monday).
week_days = dict(enumerate(["Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"]))

def day_of_week(dat):
    """Return the weekday index of ``dat``, as reported by its ``weekday()`` method."""
    weekday_index = dat.weekday()
    return weekday_index

# Field operation that converts a raw value with float().
float_enforcer = type_enforcer(lambda raw: float(raw))

# Steps applied to the "date" field, in order: parse it with the %Y-%m-%d
# format, take the weekday index, look the index up in week_days, and hand the
# label to column_category over the weekday labels (presumably yielding one
# 0/1 column per label, as the displayed result suggests).
weekday_steps = compose(
    date_parser(["%Y-%m-%d"]),
    wrap(day_of_week),
    match_dict(week_days),
    column_category(week_days.values()),
)

# Full transformation: append the weekday columns derived from "date", then
# replace "amount" with its float value.
operation = (
    Tr(append(["date"], weekday_steps))
    .then(substitution(["amount"], float_enforcer))
    .apply()
)

In [4]:
# Run the transformation over every input record; element [1] of what
# operation() returns is the transformed record.
results = [dict(operation(record)[1]) for record in inp]

print_table(results)

0,1,2,3,4,5,6,7
0,0,0,1,0,0,0,104.52
0,0,0,0,0,0,1,24.52
0,0,0,0,0,1,0,0.0
0,1,0,0,0,0,0,11.11
0,0,0,0,0,1,0,1078.12
0,0,0,0,1,0,0,5.9
0,0,0,1,0,0,0,26.12
0,1,0,0,0,0,0,19.54
0,1,0,0,0,0,0,66.06


# SciKit encode

In [5]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Example labels to encode.
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
values = array(data)

# Step 1: map each distinct label to an integer code.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

# Step 2: one-hot encode the codes. The encoder wants a 2-D input, so the
# codes are reshaped into a single column first.
integer_encoded = integer_encoded.reshape(-1, 1)

# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so keep the default sparse result and
# densify it explicitly; this runs unchanged on old and new versions.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)

[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 0.  1.  0.]]


# Keras encode

In [6]:
from numpy import array
from numpy import argmax
from keras.utils import to_categorical
# Integer index assigned to each label.
index_data = {'cold': 0, 'hot': 1, 'warm': 2}
raw_data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']

# Translate every label to its index and pack the indices into a numpy array.
data = array([index_data[label] for label in raw_data])

encoded = to_categorical(data)
print(encoded)

Using TensorFlow backend.


[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 0.  1.  0.]]


# Data Refinery encode

In [7]:
from datarefinery.tuple.TupleOperations import append
from datarefinery.FieldOperations import column_category
from datarefinery.Tr import Tr

raw_data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
# Wrap each label in a single-field record, the input shape the transformation works on.
data = [{'category': label} for label in raw_data]

# Append the columns that column_category derives from the "category" field,
# using the three known labels.
category_columns = column_category(['cold', 'hot', 'warm'])
enc = Tr(append(['category'], category_columns)).apply()

# Element [1] of what enc() returns is the transformed record.
res = [dict(enc(record)[1]) for record in data]

print_table(res)

0,1,2
1,0,0
1,0,0
0,0,1
1,0,0
0,1,0
0,1,0
0,0,1
1,0,0
0,0,1
0,1,0


# Notebook example

In [8]:
from datarefinery.tuple.TupleOperations import keep
from datarefinery.Tr import Tr

# Transformation built from keep() on the "who" field only.
who_field = ["who"]
keep_people = Tr(keep(who_field)).apply()

In [9]:
# The call returns a 3-tuple (presumably input, result, error). The first item
# is bound to its own name so that `inp`, the sample dataset defined at the top
# of the notebook, is not overwritten and the earlier cells stay re-runnable.
(kept_inp, res, err) = keep_people({"greet": "hello", "who": "world"})

In [10]:
# Show the middle element of the tuple returned by keep_people in the previous cell.
print(res)

{'who': 'world'}


# PySpark example

In [11]:
import pyspark
# getOrCreate() reuses a SparkContext that is already running, so re-running
# this cell does not fail the way constructing a second SparkContext would.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

ModuleNotFoundError: No module named 'pyspark'

In [None]:
from datarefinery.tuple.TupleOperations import keep
from datarefinery.Tr import Tr

# Same "who"-only transformation as in the notebook example, rebuilt here so
# the PySpark section can be read on its own.
who_field = ["who"]
keep_people = Tr(keep(who_field)).apply()

In [None]:
# Distribute two sample greeting records.
greets = sc.parallelize([
    {"greet": "hello", "who": "world"},
    {"greet": "hello", "who": "Tom"},
])

# Apply the transformation to every record, then pull the "who" value out of
# element [1] of each returned tuple.
transformed = greets.map(keep_people)
people = transformed.map(lambda outcome: outcome[1]['who'])

In [None]:
# collect() gathers the mapped "who" values so they can be printed here.
print(people.collect())