# Exercise 02: Homework Assignment 1.2

_Disclaimer:_ This is not necessarily the best implementation, just one simple example of how to "get the job done."  We use this exercise to familiarize ourselves with some basic features and modules in Python that will be valuable over the next couple of weeks.

## Task

Create a Python class that offers the following functionality:

- Read a configuration from a file and save a configuration to a file. The implementation should work for both .yaml and .json files.
- Print the configuration.
- Return the configuration as a suitable data structure.
- Build a unique identifier for the configuration.  No two configurations should have the same identifier unless their content is identical.

We can use the example config `exercises/e01-config.yaml`.

In [1]:
import os
import yaml
import json
import dict_hash
import hashlib
import copy

import numpy as np

In [2]:
# the (absolute) path to which you cloned the repository
PATH_REPO = os.path.expanduser("~/repos/mlim")

## Some first steps

### Read config (from yaml)

Use packages `yaml` and `json` to read config files.

In [3]:
# yaml ("YAML Ain't Markup Language", originally "Yet Another Markup Language")
def read_yaml(file):
    """Read a yaml file and return its parsed content.

    Args:
        file: Path of the yaml file to read.

    Returns:
        The object produced by `yaml.safe_load` for the document.
    """
    # decode explicitly as UTF-8 so that the result does not depend on the
    # locale encoding of the machine that runs the notebook
    with open(file, "r", encoding="utf-8") as con:
        return yaml.safe_load(con)

In [4]:
# json (JavaScript Object Notation)
def read_json(file):
    """Read a json file and return its parsed content.

    Args:
        file: Path of the json file to read.

    Returns:
        The object produced by `json.load` for the document.
    """
    # json documents are UTF-8; do not fall back to the locale encoding
    with open(file, "r", encoding="utf-8") as con:
        return json.load(con)

In [5]:
config = read_yaml(f"{PATH_REPO}/exercises/e01-config.yaml")

In [6]:
config.keys()

dict_keys(['project', 'author', 'version', 'pipeline', 'config'])

In [7]:
config["author"]

'sebastian'

In [8]:
config["version"]

1.2

In [9]:
config["config"].keys()

dict_keys(['data', 'p2v', 'tsne'])

In [10]:
# every stage named in `config["pipeline"]` must have its own entry in `config["config"]`
assert all(stage in config["config"] for stage in config["pipeline"])
config["pipeline"]

['data', 'p2v', 'tsne']

In [11]:
# let's just look at one entry in the config, otherwise the output is too long
config["config"]["data"]

{'variable_basket': 'basket_hash',
 'variable_values': ['j'],
 'batch_size': 1000,
 'shuffle': True,
 'n_negative_samples': 20,
 'power': 0.75}

### Create hash for config

In [12]:
dict_hash.sha256(config)

'35d57bc216f4cf81dce7d2636d29cfdf000f01dd542386a27d9f2640be943026'

In [13]:
config_new_author = copy.deepcopy(config)
config_new_author["author"] = "name of new author"
dict_hash.sha256(config_new_author)

'39fcfe16b9e74b076cd6b80ef1f09b373772ca1e8e4ec1ee0bf9e62f92b22343'

In [14]:
# we only care about the configs for pipeline stages so we hash `(object)["config"]`
# (excluding author, version, ...)
config_hash = dict_hash.sha256(config["config"])
assert config_hash == dict_hash.sha256(config_new_author["config"])
config_hash

'c1f513ac8fc79b14debee8b02705dd8ae0a528f5dac27bc16b333d71e7904b22'

### Write config (as json and yaml)

#### yaml

In [15]:
def write_yaml(x, file):
    """Dump `x` as yaml into `file`, without sorting the keys."""
    with open(file, mode="w") as out:
        yaml.dump(x, stream=out, sort_keys=False)

In [16]:
f"{PATH_REPO}/exercises/{config_hash}.yaml"

'/Users/sbstn/repos/mlim/exercises/c1f513ac8fc79b14debee8b02705dd8ae0a528f5dac27bc16b333d71e7904b22.yaml'

In [17]:
write_yaml(config, f"{PATH_REPO}/exercises/{config_hash}.yaml")

In [18]:
config_yaml_2 = read_yaml(f"{PATH_REPO}/exercises/{config_hash}.yaml")
config_yaml_2["config"]["data"]

{'variable_basket': 'basket_hash',
 'variable_values': ['j'],
 'batch_size': 1000,
 'shuffle': True,
 'n_negative_samples': 20,
 'power': 0.75}

#### json

In [19]:
def write_json(x, file):
    """Dump `x` as json into `file`, without sorting the keys."""
    with open(file, mode="w") as out:
        json.dump(obj=x, fp=out, sort_keys=False)

In [20]:
f"{PATH_REPO}/exercises/{config_hash}.json"

'/Users/sbstn/repos/mlim/exercises/c1f513ac8fc79b14debee8b02705dd8ae0a528f5dac27bc16b333d71e7904b22.json'

In [21]:
write_json(config, f"{PATH_REPO}/exercises/{config_hash}.json")

In [22]:
config_json_2 = read_json(f"{PATH_REPO}/exercises/{config_hash}.json")
config_json_2["config"]["data"]

{'variable_basket': 'basket_hash',
 'variable_values': ['j'],
 'batch_size': 1000,
 'shuffle': True,
 'n_negative_samples': 20,
 'power': 0.75}

### Print config

#### yaml

In [23]:
print(yaml.dump(config["config"]["tsne"]))

tsne_data_kwargs:
  batch: 3000
  epoch: 4
  l2norm: true
  path_results: ./results/p2v-map-example
  pca: null
tsne_kwargs:
  angle: 0.5
  init: pca
  n_components: 2
  n_iter: 4000
  perplexity: 15
  random_state: 1
  verbose: 0



In [24]:
print(yaml.dump(config["config"]["tsne"], indent=4))

tsne_data_kwargs:
    batch: 3000
    epoch: 4
    l2norm: true
    path_results: ./results/p2v-map-example
    pca: null
tsne_kwargs:
    angle: 0.5
    init: pca
    n_components: 2
    n_iter: 4000
    perplexity: 15
    random_state: 1
    verbose: 0



#### json

In [25]:
print(json.dumps(config["config"]["tsne"]))

{"tsne_data_kwargs": {"epoch": 4, "batch": 3000, "l2norm": true, "pca": null, "path_results": "./results/p2v-map-example"}, "tsne_kwargs": {"random_state": 1, "n_components": 2, "n_iter": 4000, "perplexity": 15, "init": "pca", "angle": 0.5, "verbose": 0}}


In [26]:
print(json.dumps(config["config"]["tsne"], indent=4, sort_keys=True))

{
    "tsne_data_kwargs": {
        "batch": 3000,
        "epoch": 4,
        "l2norm": true,
        "path_results": "./results/p2v-map-example",
        "pca": null
    },
    "tsne_kwargs": {
        "angle": 0.5,
        "init": "pca",
        "n_components": 2,
        "n_iter": 4000,
        "perplexity": 15,
        "random_state": 1,
        "verbose": 0
    }
}


In [27]:
# not sorting keys will keep original order, probably easier for human users
print(json.dumps(config["config"]["tsne"], indent=4))

{
    "tsne_data_kwargs": {
        "epoch": 4,
        "batch": 3000,
        "l2norm": true,
        "pca": null,
        "path_results": "./results/p2v-map-example"
    },
    "tsne_kwargs": {
        "random_state": 1,
        "n_components": 2,
        "n_iter": 4000,
        "perplexity": 15,
        "init": "pca",
        "angle": 0.5,
        "verbose": 0
    }
}


## Class for managing configuration files

Let's put it all together

In [28]:
class Config():
    """Manage a project configuration that is stored as yaml or json.

    The configuration itself is a dict.  `author`, `project` and `version`
    are read from its top level; the settings are expected under the key
    `config`.  The fingerprint (`hash`) is computed with `dict_hash.sha256`
    from `config["config"]` only, so it does not change with author,
    version, etc.
    """

    def __init__(self, config=None, file=None, method="yaml", indent=4):
        """Create the object from a dict or from a config file.

        Args:
            config: Configuration as a dict (mutually exclusive with `file`).
            file: Path of the config file to read (mutually exclusive with
                `config`).
            method: File format, `"yaml"` or `"json"`.
            indent: Indentation used by `print_config`.

        Raises:
            ValueError: If both or neither of `config` and `file` are given.
            KeyError: If `method` is unknown or a required key is missing.
            AssertionError: If a stage listed in `pipeline` of a config file
                has no entry in `config`.
        """
        # check the arguments first so that a wrong call fails with a clear
        # message instead of a missing attribute further down
        if config is not None and file is not None:
            raise ValueError("Either provide `config` or `file`")
        if config is None and file is None:
            raise ValueError("Provide one of `config` or `file`")

        self.method = method
        self.indent = indent
        self.reader = self._reader()
        self.writer = self._writer()
        self.printer = self._printer()
        self.file = file

        if file is not None:
            self.read_config()
            missing = [
                x for x in self.config["pipeline"]
                if x not in self.config["config"]
            ]
            if missing:
                # raised explicitly (not via `assert`) so the check also runs
                # under `python -O`; the exception type stays the same
                raise AssertionError(
                    f"Pipeline stages without config entry: {missing}"
                )
        else:
            self.config = config
            # the fingerprint was previously only set when reading from a
            # file, so `hash` and `write_config` failed for dict input
            self.hash = (
                dict_hash.sha256(config["config"])
                if "config" in config else None
            )

        self.author = self.config["author"]
        self.project = self.config["project"]
        self.version = self.config["version"]

    def __getitem__(self, x):
        """Return the top-level config entry stored under key `x`."""
        return self.config[x]

    def return_config(self):
        """Return the full configuration as a dict (no copy is made)."""
        return self.config

    def read_config(self):
        """Read `self.file` with the reader for `self.method`; update `hash`."""
        # explicit encoding: do not depend on the locale of the machine
        with open(self.file, "r", encoding="utf-8") as con:
            config = self.reader(con)
        self.config = config
        self.hash = dict_hash.sha256(self.config["config"])

    def write_config(self, path=""):
        """Write the configuration to `{path}{hash}.{method}`.

        Args:
            path: Prefix that is prepended to the file name as is; a
                directory has to end with a path separator.

        Raises:
            ValueError: If no fingerprint is available for the file name.
        """
        if getattr(self, "hash", None) is None:
            raise ValueError("Cannot write config without a config hash")
        target = f"{path}{self.hash}.{self.method}"
        with open(target, "w", encoding="utf-8") as con:
            self.writer(self.config, con, sort_keys=False)

    def print_config(self):
        """Print project, author and version, followed by the settings."""
        print(f"Project = {self.project}")
        print(f"Author = {self.author}")
        print(f"Config version = {self.version}")
        print("\nConfig content:")
        print(
            self.printer(self.config["config"], indent=self.indent, sort_keys=False)
        )

    def _reader(self):
        """Return the function that parses an open file for `self.method`."""
        reader_methods = {
            "yaml": yaml.safe_load,
            "json": json.load,
        }
        return reader_methods[self.method]

    def _writer(self):
        """Return the function that dumps to an open file for `self.method`."""
        writer_methods = {
            "yaml": yaml.dump,
            "json": json.dump,
        }
        return writer_methods[self.method]

    def _printer(self):
        """Return the function that dumps to a string for `self.method`."""
        printer_methods = {
            "yaml": yaml.dump,
            "json": json.dumps,
        }
        return printer_methods[self.method]

Some additional ideas:
- detect the file type from the config file name
- implement parent class for base functionality, child classes for `yaml` and `json`
- ... 

## Read config

In [29]:
config = Config(file=f"{PATH_REPO}/exercises/e01-config.yaml")

In [30]:
config.config

{'project': 'P2V-MAP',
 'author': 'sebastian',
 'version': 1.2,
 'pipeline': ['data', 'p2v', 'tsne'],
 'config': {'data': {'variable_basket': 'basket_hash',
   'variable_values': ['j'],
   'batch_size': 1000,
   'shuffle': True,
   'n_negative_samples': 20,
   'power': 0.75},
  'p2v': {'p2v_kwargs': {'size': 15,
    'bias_negative_sampling': True,
    'product_bias_negative_sampling': False,
    'normalise_weights': False,
    'regularisation': None,
    'use_covariates': False,
    'optimizer': {'method': 'adam',
     'control': {'beta1': 0.9, 'beta2': 0.999, 'epsilon': '1e-08'}},
    'path_results': './results/p2v-map-example',
    'n_batch_save': 1000,
    'n_batch_validation': 1000000,
    'n_batch_print': 1000,
    'n_products': 150,
    'verbose': 0,
    'train_streamer': None,
    'validation_streamer': None,
    'test_streamer': None},
   'p2v_train_kwargs': {'n_epoch': 5, 'learning_rate': 0.0005}},
  'tsne': {'tsne_data_kwargs': {'epoch': 4,
    'batch': 3000,
    'l2norm': Tr

## Get method

In [31]:
config["author"]

'sebastian'

In [32]:
config["pipeline"]

['data', 'p2v', 'tsne']

In [33]:
# print the name of every pipeline stage, followed by its settings as json
for stage in config["pipeline"]:
    print(
        stage,
        json.dumps(config["config"][stage], indent=4, sort_keys=False),
        sep="\n",
    )

data
{
    "variable_basket": "basket_hash",
    "variable_values": [
        "j"
    ],
    "batch_size": 1000,
    "shuffle": true,
    "n_negative_samples": 20,
    "power": 0.75
}
p2v
{
    "p2v_kwargs": {
        "size": 15,
        "bias_negative_sampling": true,
        "product_bias_negative_sampling": false,
        "normalise_weights": false,
        "regularisation": null,
        "use_covariates": false,
        "optimizer": {
            "method": "adam",
            "control": {
                "beta1": 0.9,
                "beta2": 0.999,
                "epsilon": "1e-08"
            }
        },
        "path_results": "./results/p2v-map-example",
        "n_batch_save": 1000,
        "n_batch_validation": 1000000,
        "n_batch_print": 1000,
        "n_products": 150,
        "verbose": 0,
        "train_streamer": null,
        "validation_streamer": null,
        "test_streamer": null
    },
    "p2v_train_kwargs": {
        "n_epoch": 5,
        "learning_rate":

## Return config as `dict`

In [34]:
config.return_config()

{'project': 'P2V-MAP',
 'author': 'sebastian',
 'version': 1.2,
 'pipeline': ['data', 'p2v', 'tsne'],
 'config': {'data': {'variable_basket': 'basket_hash',
   'variable_values': ['j'],
   'batch_size': 1000,
   'shuffle': True,
   'n_negative_samples': 20,
   'power': 0.75},
  'p2v': {'p2v_kwargs': {'size': 15,
    'bias_negative_sampling': True,
    'product_bias_negative_sampling': False,
    'normalise_weights': False,
    'regularisation': None,
    'use_covariates': False,
    'optimizer': {'method': 'adam',
     'control': {'beta1': 0.9, 'beta2': 0.999, 'epsilon': '1e-08'}},
    'path_results': './results/p2v-map-example',
    'n_batch_save': 1000,
    'n_batch_validation': 1000000,
    'n_batch_print': 1000,
    'n_products': 150,
    'verbose': 0,
    'train_streamer': None,
    'validation_streamer': None,
    'test_streamer': None},
   'p2v_train_kwargs': {'n_epoch': 5, 'learning_rate': 0.0005}},
  'tsne': {'tsne_data_kwargs': {'epoch': 4,
    'batch': 3000,
    'l2norm': Tr

## Print configuration

In [35]:
config.print_config()

Project = P2V-MAP
Author = sebastian
Config version = 1.2

Config content:
data:
    variable_basket: basket_hash
    variable_values:
    - j
    batch_size: 1000
    shuffle: true
    n_negative_samples: 20
    power: 0.75
p2v:
    p2v_kwargs:
        size: 15
        bias_negative_sampling: true
        product_bias_negative_sampling: false
        normalise_weights: false
        regularisation: null
        use_covariates: false
        optimizer:
            method: adam
            control:
                beta1: 0.9
                beta2: 0.999
                epsilon: 1e-08
        path_results: ./results/p2v-map-example
        n_batch_save: 1000
        n_batch_validation: 1000000
        n_batch_print: 1000
        n_products: 150
        verbose: 0
        train_streamer: null
        validation_streamer: null
        test_streamer: null
    p2v_train_kwargs:
        n_epoch: 5
        learning_rate: 0.0005
tsne:
    tsne_data_kwargs:
        epoch: 4
        batch: 3000
 

## Configuration hash (fingerprint)

In [36]:
config.hash

'c1f513ac8fc79b14debee8b02705dd8ae0a528f5dac27bc16b333d71e7904b22'

## Write configuration

In [37]:
config.write_config(path=f"{PATH_REPO}/exercises/")

In [38]:
def get_md5(file):
    """Return the md5 hex digest of the content of `file`.

    Args:
        file: Path of the file to hash.

    Returns:
        The digest as a string of 32 hexadecimal characters.
    """
    digest = hashlib.md5()
    # `with` closes the file handle again; reading in chunks avoids loading
    # the whole file into memory
    with open(file, "rb") as con:
        for chunk in iter(lambda: con.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()

In [39]:
# the file written by `write_config` must match the original config byte for byte
assert get_md5(f"{PATH_REPO}/exercises/e01-config.yaml") == get_md5(
    f"{PATH_REPO}/exercises/c1f513ac8fc79b14debee8b02705dd8ae0a528f5dac27bc16b333d71e7904b22.yaml"
)

## Initialize from `dict`

In [40]:
config_2 = Config(config=config.config)

In [41]:
config_2.config

{'project': 'P2V-MAP',
 'author': 'sebastian',
 'version': 1.2,
 'pipeline': ['data', 'p2v', 'tsne'],
 'config': {'data': {'variable_basket': 'basket_hash',
   'variable_values': ['j'],
   'batch_size': 1000,
   'shuffle': True,
   'n_negative_samples': 20,
   'power': 0.75},
  'p2v': {'p2v_kwargs': {'size': 15,
    'bias_negative_sampling': True,
    'product_bias_negative_sampling': False,
    'normalise_weights': False,
    'regularisation': None,
    'use_covariates': False,
    'optimizer': {'method': 'adam',
     'control': {'beta1': 0.9, 'beta2': 0.999, 'epsilon': '1e-08'}},
    'path_results': './results/p2v-map-example',
    'n_batch_save': 1000,
    'n_batch_validation': 1000000,
    'n_batch_print': 1000,
    'n_products': 150,
    'verbose': 0,
    'train_streamer': None,
    'validation_streamer': None,
    'test_streamer': None},
   'p2v_train_kwargs': {'n_epoch': 5, 'learning_rate': 0.0005}},
  'tsne': {'tsne_data_kwargs': {'epoch': 4,
    'batch': 3000,
    'l2norm': Tr

&mdash; <br>
Dr. Sebastian Gabel <br>
Machine Learning in Marketing &ndash; Exercise 2 <br>
2020 <br>