In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import os

# Make modules under the local 'src' sub-directory importable from this notebook
src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(src_dir)

# Classification model

This is a toy example using a simple classification model. The data is created using three numerical variables.

\begin{align}
z & = 3 x_1 - x_2 + \frac{x_3}{2} + 0.1 n\\
y & = \left\{
\begin{array}{ll}
\text{'low'} & z < -3 \\
\text{'mid'} & -3 \le z < 3 \\
\text{'high'} & \text{otherwise} \\
\end{array}
\right.
\end{align}


where:
- x1 : Normally distributed N(0, 1)
- x2 : Normally distributed N(2, 3)
- x3 : Normally distributed N(-3, 5)
- n : Random noise, Normally distributed N(0, 1)

Here is the code for creating the dataset:

In [None]:
def to_class(c):
    """
    Map a numeric score to one of the class labels 'low', 'mid' or 'high'.

    The score is compared against the upper bounds -3 and 3, in that order:
    values below -3 are 'low', remaining values below 3 are 'mid', and
    everything else (including values for which both comparisons are false)
    is 'high'.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order
    thresholds = ((-3, 'low'), (3, 'mid'))
    for upper_bound, label in thresholds:
        if c < upper_bound:
            return label
    return 'high'


# Create dataset
def create_dataset(num=2000, seed=None, path='class3.csv'):
    """
    Create the synthetic three-class dataset and save it as a CSV file.

    Inputs are drawn as x1 ~ N(0, 1), x2 ~ N(2, 3) and x3 ~ N(-3, 5)
    (mean, standard deviation). The score

        z = 3 * x1 - x2 + 0.5 * x3 + 0.1 * n,    n ~ N(0, 1)

    is computed from the complete inputs and mapped to a label with
    to_class(). Afterwards roughly 1% of the values of each of x1, x2, x3
    and y are independently replaced by a missing marker (NaN for the
    inputs, an empty string for the label).

    Parameters
    ----------
    num : int, default 2000
        Number of samples (rows) to generate.
    seed : int or None, default None
        Seed for a private random generator, making the dataset
        reproducible. When None, NumPy's global random state is used.
    path : str, default 'class3.csv'
        Destination of the CSV file (written without the index column).

    Returns
    -------
    pandas.DataFrame
        The generated data with columns 'x1', 'x2', 'x3' and 'y'.
    """
    # With a seed, draw from a private generator so the global NumPy state is
    # left untouched; without one, fall back to the module-level functions.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Inputs: x1, x2, x3
    x1 = rng.normal(0, 1, num)
    x2 = rng.normal(2, 3, num)
    x3 = rng.normal(-3, 5, num)
    # Noise
    n = rng.normal(0, 1, num)
    # Output (computed before any value is blanked out, so it is never NaN)
    y = 3. * x1 - 1. * x2 + 0.5 * x3 + 0.1 * n
    # Categorical output; the explicit string dtype keeps the array usable
    # for the '' assignment below even when there are no samples
    y_str = np.array([to_class(c) for c in y], dtype=str)
    # Add missing data: each value is dropped with probability 0.01
    x1_na = (rng.rand(num) < 0.01)
    x2_na = (rng.rand(num) < 0.01)
    x3_na = (rng.rand(num) < 0.01)
    x1[x1_na] = np.nan
    x2[x2_na] = np.nan
    x3[x3_na] = np.nan
    y_na = (rng.rand(num) < 0.01)
    y_str[y_na] = ''
    # Create dataFrame and save it
    df = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'y': y_str})
    df.to_csv(path, index=False)
    return df

# LogMl

We create a LogMl object and then run it.

In [None]:
# Force-remove data/class3/class3.pkl and logml_plots/class3 (recursively, verbose)
# NOTE(review): presumably leftovers of a previous LogMl run -- confirm
!rm -rvf data/class3/class3.pkl logml_plots/class3

In [None]:
from logml import *

# Build the LogMl object from the 'config/class3.yaml' configuration file
# (verbose=True presumably enables detailed progress output -- confirm in LogMl)
ml = LogMl('config/class3.yaml', verbose=True)
# Calling the object runs it
ml()