In [1]:
! pip install git+https://github.com/FireFlyTy/datrics-model-json

Collecting git+https://github.com/FireFlyTy/datrics-model-json
  Cloning https://github.com/FireFlyTy/datrics-model-json to /private/var/folders/vd/c6p08wcd38l_mvcjcy4g9h200000gq/T/pip-req-build-1078ux78
  Running command git clone -q https://github.com/FireFlyTy/datrics-model-json /private/var/folders/vd/c6p08wcd38l_mvcjcy4g9h200000gq/T/pip-req-build-1078ux78
Building wheels for collected packages: datrics-json
  Building wheel for datrics-json (setup.py) ... [?25ldone
[?25h  Created wheel for datrics-json: filename=datrics_json-0.1.0-py3-none-any.whl size=15888 sha256=cc2bd389faa0942ff943ebea55aaa310256442236e73457e4926d0a19fb6a846
  Stored in directory: /private/var/folders/vd/c6p08wcd38l_mvcjcy4g9h200000gq/T/pip-ephem-wheel-cache-gl6wt6eg/wheels/c0/fb/8f/40343db0c6c0743e11aebc583059741b45c1de051eca324e76
Successfully built datrics-json
Installing collected packages: datrics-json
Successfully installed datrics-json-0.1.0


In [2]:
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest, ExtraTreesRegressor, RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.tree import ExtraTreeRegressor, BaseDecisionTree
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge, Lasso

from dask_ml.preprocessing import OneHotEncoder, Categorizer, LabelEncoder, MinMaxScaler
import dask.dataframe as dd

In [3]:
import lightgbm as lgbm
import json
import numpy as np
import warnings

# Install a global "ignore" filter: every warning raised after this line is suppressed.
# NOTE(review): presumably done to keep cell outputs short; it also hides convergence and
# deprecation warnings emitted while fitting the models below — confirm that is intended.
warnings.filterwarnings('ignore')

In [4]:
import datrics_json as datjson

In [5]:
def validate_model(model, deserealized_model, X):
    """Tell whether a restored estimator reproduces the original's output on ``X``.

    The way the output is obtained depends on the type of ``model``:

    * ``DBSCAN``: ``fit_predict(X)``
    * ``LabelEncoder``: ``transform(X)`` flattened
    * ``OneHotEncoder`` / ``MinMaxScaler``: ``transform(X).compute().values``
    * anything else: ``predict(X)``

    Both outputs are converted with ``tolist()`` and compared for exact equality.

    Parameters
    ----------
    model : the original fitted estimator; its type selects the comparison.
    deserealized_model : the estimator restored from its serialized form.
    X : input handed unchanged to both estimators.

    Returns
    -------
    bool
        ``True`` when the two output lists are equal, ``False`` otherwise.
    """
    if isinstance(model, DBSCAN):
        def as_list(estimator):
            return estimator.fit_predict(X).tolist()
    elif isinstance(model, LabelEncoder):
        def as_list(estimator):
            return estimator.transform(X).flatten().tolist()
    elif isinstance(model, (OneHotEncoder, MinMaxScaler)):
        # Both transformers go through the same transform -> compute -> values chain.
        def as_list(estimator):
            return estimator.transform(X).compute().values.tolist()
    else:
        def as_list(estimator):
            return estimator.predict(X).tolist()

    # Evaluate the original first, then the restored copy.
    expected = as_list(model)
    restored = as_list(deserealized_model)
    return expected == restored

In [6]:
import pandas as pd
# --- Load the raw tables -----------------------------------------------------
data_classification = pd.read_csv('iris.csv')
data_regression = pd.read_csv('Downloads/online_retail_II.csv')

# --- Classification inputs ---------------------------------------------------
# Four numeric feature columns, the multiclass target, and a binary
# "is Iris-setosa" target derived from it.
X_class = data_classification[['sepal length', 'sepal width', 'petal length', 'petal width']]
y_class = data_classification['target']
y_class_1 = data_classification['target'] == 'Iris-setosa'

# --- Regression inputs -------------------------------------------------------
# Keep only the stock codes that occur in more than 1000 rows.
stock_code_counts = data_regression['StockCode'].value_counts()
frequent_stock_codes = stock_code_counts[stock_code_counts > 1000].index

# .copy() detaches the filtered rows from the raw frame, so the column
# assignments below write to an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning / chained-assignment ambiguity).
data_regression = data_regression[data_regression['StockCode'].isin(frequent_stock_codes)].copy()

# Calendar features taken from the invoice timestamp.
data_regression['InvoiceDate'] = data_regression['InvoiceDate'].astype('datetime64[ns]')
data_regression['date_month'] = data_regression['InvoiceDate'].dt.month
data_regression['week_day'] = data_regression['InvoiceDate'].dt.day_of_week

# Restrict to the modelling columns and one-hot encode the stock code.
data_regression = data_regression[['StockCode', 'Quantity', 'Price', 'date_month', 'week_day']]
data_regression = pd.get_dummies(data_regression, columns=['StockCode'])

# Quantity is the regression target; every other column is a feature.
X_regr = data_regression.drop('Quantity', axis=1)
y_regr = data_regression['Quantity']

# Modelling

## SKLearn

In [7]:
# Linear models: one classifier on the iris features, four regressors on the retail features.
model_logreg = LogisticRegression().fit(X_class, y_class)
model_linear = LinearRegression().fit(X_regr, y_regr)
model_elastic = ElasticNet().fit(X_regr, y_regr)
model_ridge = Ridge().fit(X_regr, y_regr)
model_lasso = Lasso().fit(X_regr, y_regr)

# Unsupervised models on the iris features.
# KMeans and IsolationForest are randomized, so they get a fixed random_state:
# a fresh "Restart & Run All" then fits (and serializes) the same models every time.
model_kmeans = KMeans(n_clusters=5, random_state=42).fit(X_class)
model_dbscan = DBSCAN().fit(X_class)
model_iforest = IsolationForest(random_state=42).fit(X_class)

## Dask-ML

In [8]:
# Dask views of the data, two partitions each:
#   yy - the target as a one-column frame
#   xx - the first feature column of X_class as a one-column frame
yy = dd.from_pandas(y_class.to_frame(), npartitions=2)
xx = dd.from_pandas(X_class.iloc[:, 0:1], npartitions=2)

# Both encoders are fitted on the same categorized target column.
target_categorized = yy[['target']].categorize()
encoder_label = LabelEncoder().fit(target_categorized)
encoder_onehot = OneHotEncoder().fit(target_categorized)

# Left as the cell's final expression so the fitted scaler is echoed as output.
scaler_min_max = MinMaxScaler()
scaler_min_max.fit(xx)

MinMaxScaler()

## LGBM

### Multiclass Classifier

In [9]:
# Gradient-boosted trees on the multiclass iris target.
model_init_parameters = {
    'boosting': 'gbdt',
    'objective': 'multiclass',     # fixed: multiclass classification
    'importance_type': 'gain',     # fixed: importance type is 'gain'
    'learning_rate': 0.1,
    'num_iterations': 100,
    'num_leaves': 31,
}

model_lgbm_m_classifier = (
    lgbm.LGBMClassifier(**model_init_parameters)
    .fit(X_class, y_class)
)

### Binary Classification

In [10]:
# Gradient-boosted trees on the binary "is Iris-setosa" target.
model_init_parameters = {
    'boosting': 'gbdt',
    'objective': 'binary',         # fixed: binary classification
    'importance_type': 'gain',     # fixed: importance type is 'gain'
    'learning_rate': 0.1,
    'num_iterations': 100,
    'num_leaves': 31,
}

model_lgbm_b_classifier = (
    lgbm.LGBMClassifier(**model_init_parameters)
    .fit(X_class, y_class_1)
)

### Regression

In [11]:
# Gradient-boosted trees on the regression target, with L1/L2 regularisation.
model_init_parameters = {
    'boosting': 'gbdt',
    'objective': 'regression',     # fixed: regression
    'importance_type': 'gain',     # fixed: importance type is 'gain'
    'learning_rate': 0.1,
    'num_iterations': 100,
    'num_leaves': 31,
    # regularisation terms
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
}

model_lgbm_regression = (
    lgbm.LGBMRegressor(**model_init_parameters)
    .fit(X_regr, y_regr)
)

## RF

### Multiclass Classifier

In [12]:
# LightGBM in random-forest mode ('rf') on the multiclass iris target.
model_init_parameters = {
    'boosting': 'rf',
    'objective': 'multiclass',     # fixed: multiclass classification
    'importance_type': 'gain',     # fixed: importance type is 'gain'
    'learning_rate': 0.1,
    'num_iterations': 100,
    'num_leaves': 31,
    # bagging settings supplied for the 'rf' runs
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
}

model_rf_m_classifier = (
    lgbm.LGBMClassifier(**model_init_parameters)
    .fit(X_class, y_class)
)


### Binary Classification

In [13]:
# LightGBM in random-forest mode ('rf') on the binary "is Iris-setosa" target.
model_init_parameters = {
    'boosting': 'rf',
    'objective': 'binary',         # fixed: binary classification
    'importance_type': 'gain',     # fixed: importance type is 'gain'
    'learning_rate': 0.1,
    'num_iterations': 100,
    'num_leaves': 31,
    # bagging settings supplied for the 'rf' runs
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
}

model_rf_b_classifier = (
    lgbm.LGBMClassifier(**model_init_parameters)
    .fit(X_class, y_class_1)
)

### Regression

In [14]:
# LightGBM in random-forest mode ('rf') on the regression target.
model_init_parameters = {
    'boosting': 'rf',
    'objective': 'regression',     # fixed: regression
    'importance_type': 'gain',     # fixed: importance type is 'gain'
    'learning_rate': 0.1,
    'num_iterations': 100,
    'num_leaves': 31,
    # regularisation terms
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
    # bagging settings supplied for the 'rf' runs
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
}

model_rf_regression = (
    lgbm.LGBMRegressor(**model_init_parameters)
    .fit(X_regr, y_regr)
)

# Serialized

In [15]:
# Write every fitted estimator to its own JSON file, in the order listed.
for estimator, json_path in [
    # scikit-learn linear models
    (model_logreg, 'model_logreg.json'),
    (model_linear, 'model_linear.json'),
    (model_elastic, 'model_elastic.json'),
    (model_ridge, 'model_ridge.json'),
    (model_lasso, 'model_lasso.json'),
    # scikit-learn clustering
    (model_kmeans, 'model_kmeans.json'),
    (model_dbscan, 'model_dbscan.json'),
    # LightGBM classifiers (gbdt, then rf)
    (model_lgbm_m_classifier, 'model_lgbm_m_classifier.json'),
    (model_lgbm_b_classifier, 'model_lgbm_b_classifier.json'),
    (model_rf_m_classifier, 'model_rf_m_classifier.json'),
    (model_rf_b_classifier, 'model_rf_b_classifier.json'),
    # LightGBM regressors
    (model_rf_regression, 'model_rf_regression.json'),
    (model_lgbm_regression, 'model_lgbm_regression.json'),
    # isolation forest
    (model_iforest, 'model_iforest.json'),
    # Dask-ML transformers
    (encoder_label, 'encoder_label.json'),
    (encoder_onehot, 'encoder_onehot.json'),
    (scaler_min_max, 'scaler_min_max.json'),
]:
    datjson.to_json(estimator, json_path)

In [16]:
# Validation table: one [fitted estimator, JSON file name, input data] entry per model.
# The entries are plain lists and are read positionally (index 0, 1, 2) by the
# validation loop, so the element order inside each entry must not change.
models = [[model_logreg, 'model_logreg.json', X_class],
          [model_linear, 'model_linear.json', X_regr],
          [model_elastic, 'model_elastic.json', X_regr],
          [model_ridge, 'model_ridge.json', X_regr],
          [model_lasso, 'model_lasso.json', X_regr],
          [model_kmeans, 'model_kmeans.json', X_class],
          [model_dbscan, 'model_dbscan.json', X_class],
          [model_lgbm_m_classifier, 'model_lgbm_m_classifier.json', X_class],
          [model_lgbm_b_classifier, 'model_lgbm_b_classifier.json', X_class],
          [model_rf_m_classifier, 'model_rf_m_classifier.json', X_class],
          [model_rf_b_classifier, 'model_rf_b_classifier.json', X_class],
          [model_rf_regression, 'model_rf_regression.json', X_regr],
          [model_lgbm_regression, 'model_lgbm_regression.json', X_regr],
          [model_iforest, 'model_iforest.json', X_class],
          # The Dask-ML transformers take dask frames rather than the pandas feature tables.
          [encoder_label, 'encoder_label.json', yy[['target']].categorize()],
          [encoder_onehot, 'encoder_onehot.json', yy[['target']].categorize()],
          [scaler_min_max, 'scaler_min_max.json', xx[['sepal length']]]]

In [17]:
# Reload each estimator from its JSON file and report whether it reproduces
# the original estimator's output on the matching input data.
for fitted_model, json_path, inputs in models:
    deserialized_model = datjson.from_json(json_path)
    label = json_path.split('.')[0]  # text before the first dot of the file name
    outputs_match = validate_model(fitted_model, deserialized_model, inputs)
    print(f"{label} :  {outputs_match}")

model_logreg :  True
model_linear :  True
model_elastic :  True
model_ridge :  True
model_lasso :  True
model_kmeans :  True
model_dbscan :  True
Finished loading model, total used 100 iterations
model_lgbm_m_classifier :  True
Finished loading model, total used 100 iterations
model_lgbm_b_classifier :  True
Finished loading model, total used 100 iterations
model_rf_m_classifier :  True
Finished loading model, total used 100 iterations
model_rf_b_classifier :  True
Finished loading model, total used 100 iterations
model_rf_regression :  True
Finished loading model, total used 100 iterations
model_lgbm_regression :  True
model_iforest :  True
encoder_label :  True
encoder_onehot :  True
scaler_min_max :  True
