# Dummy Model


In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from util import *
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix ,f1_score
from sklearn.dummy import DummyClassifier

NUM_FOLDS = 10


## Read the data


### Train Data


In [4]:
X, y = get_train_data(path='../data/train.csv')

In [5]:
# standardize the data
X = standardize_features(X)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Dummy Model

The DummyClassifier class in scikit-learn provides several strategies for a baseline method, such as predicting the most frequent class label, predicting a random class label, or predicting based on the class distribution of the training set.

**Strategy to use to generate predictions:**

1. most_frequent:
   - The predict method always returns the most frequent class label in the observed y argument passed to fit.
   - The predict_proba method returns the matching one-hot encoded vector.
2. prior:

- The predict method always returns the most frequent class label in the observed y argument passed to fit (like most_frequent).
- Predict_proba always returns the empirical class distribution of y also known as the empirical class prior distribution.

3. stratified:

- The predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities.
- The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

4. uniform:

- Generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

5. constant:

- Always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.


In [7]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='stratified')
zeroR.fit(X_train, y_train)

cv_results = cross_validate(zeroR, X_train, y_train, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8215
f1_macro:  0.4987028437537974
f1_micro:  0.8215


In [8]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.826
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90      4495
           1       0.13      0.12      0.12       505

    accuracy                           0.83      5000
   macro avg       0.51      0.51      0.51      5000
weighted avg       0.82      0.83      0.82      5000



In [9]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='most_frequent')
zeroR.fit(X_train, y_train)

cv_results = cross_validate(zeroR, X_train, y_train, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.90195
f1_macro:  0.47422381964761345
f1_micro:  0.90195


In [10]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.899
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4495
           1       0.00      0.00      0.00       505

    accuracy                           0.90      5000
   macro avg       0.45      0.50      0.47      5000
weighted avg       0.81      0.90      0.85      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='prior')
zeroR.fit(X_train, y_train)

cv_results = cross_validate(zeroR, X_train, y_train, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.90195
f1_macro:  0.47422381964761345
f1_micro:  0.90195


In [12]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.899
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4495
           1       0.00      0.00      0.00       505

    accuracy                           0.90      5000
   macro avg       0.45      0.50      0.47      5000
weighted avg       0.81      0.90      0.85      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='uniform')
zeroR.fit(X_train, y_train)

cv_results = cross_validate(zeroR, X_train, y_train, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.4998
f1_macro:  0.4022572401968524
f1_micro:  0.4998


In [14]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.4876
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.49      0.63      4495
           1       0.10      0.48      0.16       505

    accuracy                           0.49      5000
   macro avg       0.49      0.48      0.40      5000
weighted avg       0.81      0.49      0.58      5000



In [15]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='constant', constant=0)
zeroR.fit(X_train, y_train)

cv_results = cross_validate(zeroR, X_train, y_train, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.90195
f1_macro:  0.47422381964761345
f1_micro:  0.90195


In [16]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.899
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      4495
           1       0.00      0.00      0.00       505

    accuracy                           0.90      5000
   macro avg       0.45      0.50      0.47      5000
weighted avg       0.81      0.90      0.85      5000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
