# MERCS 101 - Lecture 03: Mix Classification & Regression

This is the third part of the tutorial, which combines classification and regression.

## Preliminaries

### External Imports

In [1]:
import numpy as np
import os
import sys
from sklearn.metrics import (mean_absolute_error,
                             mean_squared_error,
                             mean_squared_log_error)
import pandas as pd

### MERCS imports

In [2]:
# Make the `src` package importable from the notebook's working directory.
sys.path.insert(0, '..') # We add the parent dir to the path
from src.mercs.core import MERCS
# NOTE(review): wildcard import -- prefer naming the helpers that are actually
# needed, so that readers can see where each name comes from.
from src.mercs.utils import *

import src.datasets as datasets

  from numpy.core.umath_tests import inner1d


## Induction

### Importing Data

First, we import the fertility dataset.

In [3]:
# Load the fertility dataset as separate train and test splits.
train, test = datasets.load_fertility()

load_example_dataset is loading fname: ../resc/data/fertility_train.csv

load_example_dataset is loading fname: ../resc/data/fertility_test.csv



This is a fully numerical dataset: every attribute is stored as a number, although several of them take only a few distinct values.

In [4]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,season,age,child_diseases,accident,surgical_intervention,high_fever,alco,smoking,h_seating,diagnosis
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,1
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,0
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,1
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,1
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,0


In [10]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,season,age,child_diseases,accident,surgical_intervention,high_fever,alco,smoking,h_seating,diagnosis
0,-0.33,0.5,1,1,0,-1,0.8,0,0.88,0
1,0.33,0.69,1,0,0,1,1.0,-1,0.31,1
2,1.0,0.56,1,0,0,1,0.6,0,0.5,1
3,-1.0,0.5,1,0,0,1,0.8,-1,0.44,1
4,-1.0,0.53,1,0,0,1,0.8,-1,0.63,1


In [14]:
# Number of attributes = number of columns in the test split.
nb_atts = len(test.columns)
nb_atts

10

### Training

In [6]:
# Create a MERCS model; no constructor arguments are passed.
model = MERCS()

In [8]:
# Keyword settings forwarded to model.fit: the `ind_*` group and the `sel_*` group.
ind_parameters = dict(ind_type='RF',
                      ind_n_estimators=10,
                      ind_max_depth=4)

sel_parameters = dict(sel_type='Base',
                      sel_its=8,
                      sel_param=1)

In [21]:
# Fit on the training split, forwarding both settings dicts as keyword arguments.
# NOTE(review): no random seed is set anywhere visible in this notebook; if the
# underlying learners are stochastic, results may differ between runs -- confirm.
model.fit(train, **ind_parameters, **sel_parameters)

## Introspection

### Identification of types

MERCS makes some decisions regarding the attribute types automatically.

In [36]:
# Per-attribute labels stored in the fitted model's metadata: the output shows
# ['numeric'] for some attributes and an array of distinct values for the others.
model.s['metadata']['clf_labels']

[['numeric'],
 ['numeric'],
 array([0., 1.]),
 array([0., 1.]),
 array([0., 1.]),
 array([-1.,  0.,  1.]),
 ['numeric'],
 array([-1.,  0.,  1.]),
 ['numeric'],
 array([0., 1.])]

In [37]:
# Show only the first ten rows instead of dumping the whole frame; this keeps
# the stored output short while still allowing a comparison with the labels above.
train.head(10)

Unnamed: 0,season,age,child_diseases,accident,surgical_intervention,high_fever,alco,smoking,h_seating,diagnosis
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,1
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,0
2,-0.33,0.50,1,0,0,0,1.0,-1,0.50,1
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,1
4,-0.33,0.67,1,1,0,0,0.8,-1,0.50,0
5,-0.33,0.67,1,0,1,0,0.8,0,0.50,1
6,-0.33,0.67,0,0,0,-1,0.8,-1,0.44,1
7,-0.33,1.00,1,1,1,0,0.6,-1,0.38,1
8,1.00,0.64,0,0,1,0,0.8,-1,0.25,1
9,1.00,0.61,1,0,0,0,1.0,-1,0.25,1


In [35]:
# Number of distinct values per column of the training split.
train.nunique()

season                    3
age                      14
child_diseases            2
accident                  2
surgical_intervention     2
high_fever                3
alco                      5
smoking                   3
h_seating                13
diagnosis                 2
dtype: int64

## Inference

### Prediction

In [15]:
# Query code with one entry per attribute; only the last entry is set to 1.
code = [0 for _ in range(nb_atts)]
code[-1] = 1
print(code)

# The columns whose code entry equals 1 provide the ground truth.
target_boolean = np.array(code) == 1
y_true = test.loc[:, target_boolean].values

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [16]:
# Display the ground-truth values selected from the test split.
y_true

array([[0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [17]:
# Keyword settings forwarded to model.predict.
pred_parameters = dict(pred_type='IT',
                       pred_param=0.1,
                       pred_its=4)

In [18]:
# Predict on the test split for the query code, using the prediction settings.
y_pred = model.predict(test, **pred_parameters, qry_code=code)

SETTINGS.PY: I AM READING A SINGLE QUERY CODE, I.E: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
Predicting q_code: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [19]:
# Display the predictions returned by model.predict.
y_pred

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

### Evaluation 

In [20]:
# Ground truth for scoring: the test columns whose query-code entry equals 1.
y_true = test.loc[:, np.array(code) == 1].values

In [15]:
# Score the predictions with three error metrics, in this order:
# mean absolute error, mean squared error, mean squared log error.
obs = [
    score(y_true, y_pred)
    for score in (mean_absolute_error,
                  mean_squared_error,
                  mean_squared_log_error)
]
obs_1, obs_2, obs_3 = obs

# Sanity check: every score must be a plain number and non-negative.
for o in obs:
    assert isinstance(o, (int, float))
    assert 0 <= o

In [16]:
# Display the mean squared log error computed above.
# NOTE(review): the execution counts in this notebook are out of order, so the
# stored value below may come from an earlier run -- restart the kernel and
# run all cells to refresh it.
obs_3

0.005933170080519997