# Notebook for generating the audit, model and task JSON files

### *Use only when the model already works correctly as a standalone 'code.py' script*

# Initialize the functions below

In [1]:
def parse_np_matrix_to_json(a):
    """Turn a ``(labels, counts)`` pair into a JSON-friendly dict.

    ``a[0]`` holds the labels and ``a[1]`` the matching counts (the shape
    returned by ``np.unique(..., return_counts=True)``). Labels become
    string keys and counts become plain ``int`` values.
    """
    return {
        str(label): int(a[1][position])
        for position, label in enumerate(a[0])
    }

def summarize_categorical_variable(values, name, model):
    """Store a summary of a categorical column in ``model['preprocessing'][name]``.

    ``values`` must offer ``dropna()`` and ``isna()`` (e.g. a pandas Series).
    Missing entries are counted separately and excluded from the level
    frequencies; every numeric statistic is left as ``None``.
    """
    present = values.dropna()
    levels, counts = np.unique(present, return_counts=True)
    summary = {
        "name": name,
        "type": "categorical",
        "number_of_unique_values": len(levels),
        "number_of_missing_values": int(values.isna().sum()),
        "cat_frequencies": parse_np_matrix_to_json((levels, counts)),
        "num_minimum": None,
        "num_1qu": None,
        "num_median": None,
        "num_mean": None,
        "num_3qu": None,
        "num_maximum": None
    }
    model['preprocessing'][name] = summary
    
def summarize_numerical_variable(values, name, model):
    """Store a summary of a numerical column in ``model['preprocessing'][name]``.

    ``values`` must offer ``dropna()`` and ``isna()`` (e.g. a pandas Series).
    The entry uses the same keys as the categorical summary: ``num_minimum``,
    ``num_1qu``, ``num_median``, ``num_mean``, ``num_3qu`` and ``num_maximum``
    are computed on the non-missing values, while ``cat_frequencies`` is
    ``None``. When no non-missing value exists, every statistic is ``None``
    instead of raising.
    """
    observed = values.dropna()
    has_data = len(observed) > 0

    def stat(func, *args):
        # Plain float keeps the value JSON-serialisable; None marks "no data".
        return float(func(observed, *args)) if has_data else None

    model['preprocessing'][name] = {
        "name": name,
        "type": "numerical",
        "number_of_unique_values": len(np.unique(observed)),
        "number_of_missing_values": int(values.isna().sum()),
        "cat_frequencies": None,
        "num_minimum": stat(np.min),
        "num_1qu": stat(np.percentile, 25),
        "num_median": stat(np.percentile, 50),
        "num_mean": stat(np.mean),
        "num_3qu": stat(np.percentile, 75),
        "num_maximum": stat(np.max)
    }

# Dependencies

In [93]:
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from matplotlib import pyplot as plt
from platform import python_version

import openml
import pandas as pd
import numpy as np
import hashlib
import pkg_resources
import datetime
import json
import os

# Metadata

In [11]:
# Who submits the task, and when (day-month-year).
added_by = "Siemashko"
date = datetime.datetime.now().strftime("%d-%m-%Y")

# Dataset and task identification; the task id is "<type>_<target>".
dataset_id = "openml_kc1-numeric"
task_type = "regression"
task_target = "NUMDEFECTS"
task_id = f'{task_type}_{task_target}'

task = dict(
    id=task_id,
    added_by=added_by,
    date=date,
    dataset_id=dataset_id,
    type=task_type,
    target=task_target,
)

# task.json holds a one-element list containing the task description.
with open('task.json', 'w') as fp:
    fp.write(json.dumps([task], indent=4))

# Load the data for your model

In [216]:
# Seed NumPy's global random generator before anything else runs.
np.random.seed(42)

datasetOpenmlId = 1070
dataset = openml.datasets.get_dataset(datasetOpenmlId)

# Ask for the feature matrix, the target, the per-column categorical flags
# and the column names in a single call.
X, y, categorical, names = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True,
    return_attribute_names=True,
    include_ignore_attributes=True,
)

# One DataFrame column per feature, plus the target under its own name.
vals = {name: X[:, i] for i, name in enumerate(names)}
vals[dataset.default_target_attribute] = y
df = pd.DataFrame(vals)

# Split features from the target, then hold out 15% of the rows for testing.
X = df.drop(columns='NUMDEFECTS')
y = df['NUMDEFECTS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Model and md5 hash - remember to keep the 'categorical' variable in the correct order

'MLPRegressor'

In [217]:
# Preprocessing: the pipeline is fitted on the training split here and later
# applied (transform only) to the test split.
transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

regressor = MLPRegressor(hidden_layer_sizes=[50, 15], random_state=42, max_iter=400, alpha=0.0006)

regressor.fit(X_train, y_train)

# The model id is the md5 of the estimator's string representation.
md5 = hashlib.md5(str(regressor).encode('utf-8')).hexdigest()

model = {
    "id": md5,
    "added_by": added_by,
    "date": date,
    "library": "scikit",
    "model_name": regressor.__class__.__name__,
    "task_id": task_id,
    "dataset_id": dataset_id,
    "parameters": regressor.get_params(),
    "preprocessing": {}
}

print(f'md5 hash: {md5}')

# Summarise every feature column; `categorical` is expected to be aligned
# with `names` (same order, one flag per column).
# NOTE(review): X_train has already been transformed by the pipeline at this
# point, so the summaries describe the transformed values - confirm intended.
for column, is_categorical in zip(names, categorical):
    if is_categorical:
        summarize_categorical_variable(X_train.loc[:, column], column, model)
    else:
        summarize_numerical_variable(X_train.loc[:, column], column, model)

model_dir = f'/home/siemashko/Desktop/2019L-WarsztatyBadawcze/models/{dataset_id}/{task_id}/{md5}'
try:
    # makedirs also creates missing parent folders; only "already exists" is
    # tolerated, any other failure (e.g. permissions) is raised.
    os.makedirs(model_dir)
except FileExistsError:
    print(f'Directory {md5} already exists')

with open(f'{model_dir}/model.json', 'w') as fp:
    json.dump([model], fp, indent=4)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


md5 hash: e5fd45d2ed0455daebc29e48d9f68d7f
Directory e5fd45d2ed0455daebc29e48d9f68d7f already exists


# Audit

In [218]:
# Predict on the held-out split, using the already fitted preprocessing pipeline.
y_pred = regressor.predict(transform_pipeline.transform(X_test))

audit = dict(
    id=f'audit_{md5}',
    date=datetime.datetime.now().strftime("%d-%m-%Y"),
    added_by=added_by,
    model_id=md5,
    task_id=task_id,
    dataset_id=dataset_id,
)

# MSE is computed once; RMSE is its square root.
mse = mean_squared_error(y_test, y_pred)
audit['performance'] = dict(
    MSE=mse,
    RMSE=np.sqrt(mse),
    MAE=mean_absolute_error(y_test, y_pred),
    R2=r2_score(y_test, y_pred),
)

# audit.json is stored next to model.json, in the folder named after the model hash.
audit_path = f'/home/siemashko/Desktop/2019L-WarsztatyBadawcze/models/{dataset_id}/{task_id}/{md5}/audit.json'
with open(audit_path, 'w') as fp:
    json.dump([audit], fp, indent=4)

  Xt = transform.transform(Xt)


In [167]:
# Standalone script stored next to model.json / audit.json as code.py.
# It must reproduce the model trained in the model cell, so the MLPRegressor
# hyperparameters and the "#:# hash" value below have to be kept in sync with
# that cell (alpha=0.0006, md5 e5fd45d2ed0455daebc29e48d9f68d7f). The previous
# text still carried alpha=0.0009 and the hash of that older model.
code_py = """#:# libraries
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from matplotlib import pyplot as plt
from platform import python_version

import openml
import pandas as pd
import numpy as np
import hashlib
import pkg_resources
import datetime
import json
import os

#:# config

np.random.seed(42)

#:# data

datasetId = 1070

dataset = openml.datasets.get_dataset(datasetId)
(X, y, categorical, names) = dataset.get_data(
    target=dataset.default_target_attribute,
    return_categorical_indicator=True,
    return_attribute_names=True,
    include_ignore_attributes=True
)

vals = {}
for i, name in enumerate(names):
    vals[name] = X[:, i]
vals[dataset.default_target_attribute] = y
df = pd.DataFrame(vals)

X = df.drop('NUMDEFECTS', axis=1)
y = df.loc[:, 'NUMDEFECTS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train), columns=X_train.columns)

#:# model

regressor = MLPRegressor(hidden_layer_sizes=[50, 15], random_state=42, max_iter=400, alpha=0.0006)
regressor.fit(X_train, y_train)

#:# hash
#:# e5fd45d2ed0455daebc29e48d9f68d7f
md5 = hashlib.md5(str(regressor).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = regressor.predict(transform_pipeline.transform(X_test))

print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')

#:# session info

# Dodaj wersję pythona w session info

sessionInfo = {
    "python_version": python_version(),
    "library_versions":[str(d) for d in pkg_resources.working_set]
}
with open('sessionInfo.txt', 'w') as f:
    json.dump(sessionInfo, f, indent=4)
"""
# The script is written into the folder named after the current model hash.
with open(f'/home/siemashko/Desktop/2019L-WarsztatyBadawcze/models/{dataset_id}/{task_id}/{md5}/code.py', 'w') as fp:
    fp.write(code_py)