# Dummy Model


In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [5]:
import util
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier

# Number of cross-validation folds; passed as `cv=` to every cross_validate
# call in the evaluation cells of this notebook.
NUM_FOLDS = 10


## Read the data


### Train Data


In [6]:
# Load the training set through the project helper. X and y are later passed
# to DummyClassifier.fit / cross_validate as the features and the labels.
# NOTE(review): scaleNumericalFeatures=True presumably scales numeric columns
# inside util.get_train_data -- confirm against util.py.
X, y = util.get_train_data(
    path='../data/train.csv',
    scaleNumericalFeatures=True,
)

## Dummy Model

The DummyClassifier class in scikit-learn provides several strategies for a baseline method, such as predicting the most frequent class label, predicting a random class label, or predicting based on the class distribution of the training set.

**Strategy to use to generate predictions:**

1. most_frequent:
   - The predict method always returns the most frequent class label in the observed y argument passed to fit.
   - The predict_proba method returns the matching one-hot encoded vector.
2. prior:
   - The predict method always returns the most frequent class label in the observed y argument passed to fit (like most_frequent).
   - The predict_proba method always returns the empirical class distribution of y, also known as the empirical class prior distribution.

3. stratified:
   - The predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities.
   - The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

4. uniform:
   - Generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

5. constant:
   - Always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.


In [7]:
# Stratified random baseline (this is NOT ZeroR): every prediction is sampled
# at random according to the class distribution observed in y.
# The strategy is stochastic, so the sampler is seeded; without random_state
# the scores printed below change on every run and cannot be reproduced.
# The variable name `zeroR` is kept for consistency with the other cells.
zeroR = DummyClassifier(strategy='stratified', random_state=42)
zeroR.fit(X, y)

# Score the baseline with NUM_FOLDS-fold cross-validation on three metrics.
cv_results = cross_validate(
    zeroR, X, y,
    cv=NUM_FOLDS,
    scoring=['f1_macro', 'accuracy', 'f1_micro'],
)

# Report each metric averaged over the test folds.
print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.81959
f1_macro:  0.49969086512792005
f1_micro:  0.81959


In [8]:
# ZeroR baseline: always predict the most frequent class label seen in y.
# fit() returns the estimator itself, so construction and fitting are chained.
zeroR = DummyClassifier(strategy='most_frequent').fit(X, y)

# Evaluate with NUM_FOLDS-fold cross-validation on three metrics.
cv_results = cross_validate(
    zeroR, X, y,
    cv=NUM_FOLDS,
    scoring=['f1_macro', 'accuracy', 'f1_micro'],
)

# Print the fold-averaged value of each metric, one per line.
for label, key in (
    ('accuracy: ', 'test_accuracy'),
    ('f1_macro: ', 'test_f1_macro'),
    ('f1_micro: ', 'test_f1_micro'),
):
    print(label, cv_results[key].mean())


accuracy:  0.8995099999999999
f1_macro:  0.4735484413817992
f1_micro:  0.8995099999999999


In [9]:
# 'prior' baseline: predict() returns the most frequent class label (like
# most_frequent); only predict_proba differs, returning the empirical class
# distribution. The label-based scores below therefore match most_frequent.
zeroR = DummyClassifier(strategy='prior').fit(X, y)

# Evaluate with NUM_FOLDS-fold cross-validation on three metrics.
cv_results = cross_validate(
    zeroR, X, y,
    cv=NUM_FOLDS,
    scoring=['f1_macro', 'accuracy', 'f1_micro'],
)

# Print the fold-averaged value of each metric, one per line.
for label, key in (
    ('accuracy: ', 'test_accuracy'),
    ('f1_macro: ', 'test_f1_macro'),
    ('f1_micro: ', 'test_f1_micro'),
):
    print(label, cv_results[key].mean())


accuracy:  0.8995099999999999
f1_macro:  0.4735484413817992
f1_micro:  0.8995099999999999


In [10]:
# Uniform random baseline (this is NOT ZeroR): every prediction is drawn
# uniformly at random from the unique classes observed in y.
# The strategy is stochastic, so the sampler is seeded; without random_state
# the scores printed below change on every run and cannot be reproduced.
# The variable name `zeroR` is kept for consistency with the other cells.
zeroR = DummyClassifier(strategy='uniform', random_state=42)
zeroR.fit(X, y)

# Score the baseline with NUM_FOLDS-fold cross-validation on three metrics.
cv_results = cross_validate(
    zeroR, X, y,
    cv=NUM_FOLDS,
    scoring=['f1_macro', 'accuracy', 'f1_micro'],
)

# Report each metric averaged over the test folds.
print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.50054
f1_macro:  0.40538418730498493
f1_micro:  0.50054


In [12]:
# Constant baseline: always predict the user-supplied label 0.
# NOTE(review): the recorded scores equal the most_frequent run, which
# suggests 0 is the majority class in y -- confirm against the data.
zeroR = DummyClassifier(strategy='constant', constant=0).fit(X, y)

# Evaluate with NUM_FOLDS-fold cross-validation on three metrics.
cv_results = cross_validate(
    zeroR, X, y,
    cv=NUM_FOLDS,
    scoring=['f1_macro', 'accuracy', 'f1_micro'],
)

# Print the fold-averaged value of each metric, one per line.
for label, key in (
    ('accuracy: ', 'test_accuracy'),
    ('f1_macro: ', 'test_f1_macro'),
    ('f1_micro: ', 'test_f1_micro'),
):
    print(label, cv_results[key].mean())


accuracy:  0.8995099999999999
f1_macro:  0.4735484413817992
f1_micro:  0.8995099999999999
