# Dummy Model


In [17]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
%autoreload 2

In [19]:
from util import *
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix ,f1_score
from sklearn.dummy import DummyClassifier

NUM_FOLDS = 10


## Read the data


### Train Data


In [20]:
X, y = get_train_data(path='../data/train_pca_10.csv')

In [21]:
# standardize the data
X = standardize_features(X)

Mean values of each feature: 
 pca0    5.286438e-17
pca1    1.454244e-17
pca2   -8.715991e-18
pca3    2.086627e-17
pca4    1.006602e-17
pca5   -5.779081e-18
pca6    1.288451e-17
pca7    1.141605e-17
pca8   -3.031649e-17
pca9   -1.702934e-17
dtype: float64
Std values of each feature: 
 pca0    1.0
pca1    1.0
pca2    1.0
pca3    1.0
pca4    1.0
pca5    1.0
pca6    1.0
pca7    1.0
pca8    1.0
pca9    1.0
dtype: float64


In [22]:
# read the test data
X_test,y_test = get_test_data(path='../data/train_pca_10.csv')

## Dummy Model

The DummyClassifier class in scikit-learn provides several strategies for a baseline method, such as predicting the most frequent class label, predicting a random class label, or predicting based on the class distribution of the training set.

**Strategy to use to generate predictions:**

1. most_frequent:
   - The predict method always returns the most frequent class label in the observed y argument passed to fit.
   - The predict_proba method returns the matching one-hot encoded vector.
2. prior:

- The predict method always returns the most frequent class label in the observed y argument passed to fit (like most_frequent).
- Predict_proba always returns the empirical class distribution of y also known as the empirical class prior distribution.

3. stratified:

- The predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities.
- The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

4. uniform:

- Generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

5. constant:

- Always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.


In [23]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='stratified')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.81988
f1_macro:  0.500278279037511
f1_micro:  0.81988


In [24]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.27161541248021964
Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.90      0.35     44936
           1       0.79      0.10      0.18    164868

    accuracy                           0.27    209804
   macro avg       0.50      0.50      0.26    209804
weighted avg       0.67      0.27      0.21    209804



In [25]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='most_frequent')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8997733333333334
f1_macro:  0.4736214143289648
f1_micro:  0.8997733333333334


In [26]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.21418085451183008
Classification Report:
              precision    recall  f1-score   support

           0       0.21      1.00      0.35     44936
           1       0.00      0.00      0.00    164868

    accuracy                           0.21    209804
   macro avg       0.11      0.50      0.18    209804
weighted avg       0.05      0.21      0.08    209804



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='prior')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8997733333333334
f1_macro:  0.4736214143289648
f1_micro:  0.8997733333333334


In [28]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.21418085451183008
Classification Report:
              precision    recall  f1-score   support

           0       0.21      1.00      0.35     44936
           1       0.00      0.00      0.00    164868

    accuracy                           0.21    209804
   macro avg       0.11      0.50      0.18    209804
weighted avg       0.05      0.21      0.08    209804



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='uniform')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())

accuracy:  0.49948000000000004
f1_macro:  0.4037058278264281
f1_micro:  0.49948000000000004


In [30]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.5022973823187356
Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.50      0.30     44936
           1       0.79      0.50      0.61    164868

    accuracy                           0.50    209804
   macro avg       0.50      0.50      0.46    209804
weighted avg       0.66      0.50      0.55    209804



In [31]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='constant', constant=0)
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8997733333333334
f1_macro:  0.4736214143289648
f1_micro:  0.8997733333333334


In [32]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.21418085451183008
Classification Report:
              precision    recall  f1-score   support

           0       0.21      1.00      0.35     44936
           1       0.00      0.00      0.00    164868

    accuracy                           0.21    209804
   macro avg       0.11      0.50      0.18    209804
weighted avg       0.05      0.21      0.08    209804



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
