# Dummy Model


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from util import *
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix ,f1_score
from sklearn.dummy import DummyClassifier

NUM_FOLDS = 10


## Read the data


### Train Data


In [4]:
X, X_test, y, y_test = get_data(path='../data/train_pca_20.csv')
# standardize the data
X = standardize_features(X)
X_test = standardize_features(X_test)

## Dummy Model

The DummyClassifier class in scikit-learn provides several strategies for a baseline method, such as predicting the most frequent class label, predicting a random class label, or predicting based on the class distribution of the training set.

**Strategy to use to generate predictions:**

1. most_frequent:
   - The predict method always returns the most frequent class label in the observed y argument passed to fit.
   - The predict_proba method returns the matching one-hot encoded vector.
2. prior:

- The predict method always returns the most frequent class label in the observed y argument passed to fit (like most_frequent).
- Predict_proba always returns the empirical class distribution of y also known as the empirical class prior distribution.

3. stratified:

- The predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities.
- The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

4. uniform:

- Generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

5. constant:

- Always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.


In [5]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='stratified')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.81929
f1_macro:  0.49861507940412125
f1_micro:  0.81929


In [6]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.82144
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     89951
           1       0.11      0.10      0.10     10049

    accuracy                           0.82    100000
   macro avg       0.50      0.50      0.50    100000
weighted avg       0.82      0.82      0.82    100000



In [7]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='most_frequent')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8995099999999999
f1_macro:  0.47354844130885043
f1_micro:  0.8995099999999999


In [8]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.89951
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     89951
           1       0.00      0.00      0.00     10049

    accuracy                           0.90    100000
   macro avg       0.45      0.50      0.47    100000
weighted avg       0.81      0.90      0.85    100000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='prior')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8995099999999999
f1_macro:  0.47354844130885043
f1_micro:  0.8995099999999999


In [10]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.89951
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     89951
           1       0.00      0.00      0.00     10049

    accuracy                           0.90    100000
   macro avg       0.45      0.50      0.47    100000
weighted avg       0.81      0.90      0.85    100000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='uniform')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())

accuracy:  0.49940999999999997
f1_macro:  0.40474650863298994
f1_micro:  0.49940999999999997


In [12]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.50116
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.50      0.64     89951
           1       0.10      0.50      0.17     10049

    accuracy                           0.50    100000
   macro avg       0.50      0.50      0.41    100000
weighted avg       0.82      0.50      0.60    100000



In [13]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='constant', constant=0)
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8995099999999999
f1_macro:  0.47354844130885043
f1_micro:  0.8995099999999999


In [14]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.89951
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     89951
           1       0.00      0.00      0.00     10049

    accuracy                           0.90    100000
   macro avg       0.45      0.50      0.47    100000
weighted avg       0.81      0.90      0.85    100000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
