# Dummy Model


In [17]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
%autoreload 2

In [19]:
from util import *
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix ,f1_score
from sklearn.dummy import DummyClassifier

NUM_FOLDS = 10


## Read the data


### Train Data


In [20]:
X, y = get_train_data(path='../data/train_pca_20.csv',training_size=100000)

In [21]:
# standardize the data
X = standardize_features(X)

Mean values of each feature: 
 pca0    -2.660983e-17
pca1    -6.039613e-18
pca2    -8.668621e-18
pca3     3.272049e-17
pca4    -1.250555e-17
pca5     2.970069e-17
pca6    -3.268497e-17
pca7    -4.121148e-18
pca8    -1.243450e-17
pca9    -8.384404e-18
pca10   -6.821210e-18
pca11    8.668621e-18
pca12   -6.892265e-18
pca13   -1.865175e-17
pca14    1.008971e-17
pca15    4.760636e-18
pca16   -1.989520e-17
pca17    2.145839e-17
pca18   -6.714629e-18
pca19   -1.577405e-17
dtype: float64
Std values of each feature: 
 pca0     1.0
pca1     1.0
pca2     1.0
pca3     1.0
pca4     1.0
pca5     1.0
pca6     1.0
pca7     1.0
pca8     1.0
pca9     1.0
pca10    1.0
pca11    1.0
pca12    1.0
pca13    1.0
pca14    1.0
pca15    1.0
pca16    1.0
pca17    1.0
pca18    1.0
pca19    1.0
dtype: float64


In [22]:
# read the test data
X_test,y_test = get_test_data(path='../data/train_pca_20.csv',test_size=100000)

## Dummy Model

The DummyClassifier class in scikit-learn provides several strategies for a baseline method, such as predicting the most frequent class label, predicting a random class label, or predicting based on the class distribution of the training set.

**Strategy to use to generate predictions:**

1. most_frequent:
   - The predict method always returns the most frequent class label in the observed y argument passed to fit.
   - The predict_proba method returns the matching one-hot encoded vector.
2. prior:

- The predict method always returns the most frequent class label in the observed y argument passed to fit (like most_frequent).
- Predict_proba always returns the empirical class distribution of y also known as the empirical class prior distribution.

3. stratified:

- The predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities.
- The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

4. uniform:

- Generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

5. constant:

- Always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.


In [23]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='stratified')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8184699999999999
f1_macro:  0.4977632366599372
f1_micro:  0.8184699999999999


In [24]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.81861
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     89917
           1       0.10      0.10      0.10     10083

    accuracy                           0.82    100000
   macro avg       0.50      0.50      0.50    100000
weighted avg       0.82      0.82      0.82    100000



In [25]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='most_frequent')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8998500000000001
f1_macro:  0.47364265563458696
f1_micro:  0.8998500000000001


In [26]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.89917
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     89917
           1       0.00      0.00      0.00     10083

    accuracy                           0.90    100000
   macro avg       0.45      0.50      0.47    100000
weighted avg       0.81      0.90      0.85    100000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='prior')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8998500000000001
f1_macro:  0.47364265563458696
f1_micro:  0.8998500000000001


In [28]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.89917
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     89917
           1       0.00      0.00      0.00     10083

    accuracy                           0.90    100000
   macro avg       0.45      0.50      0.47    100000
weighted avg       0.81      0.90      0.85    100000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='uniform')
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())

accuracy:  0.5008999999999999
f1_macro:  0.4056152067337889
f1_micro:  0.5008999999999999


In [30]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.49843
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.50      0.64     89917
           1       0.10      0.49      0.17     10083

    accuracy                           0.50    100000
   macro avg       0.50      0.50      0.40    100000
weighted avg       0.82      0.50      0.59    100000



In [31]:
# Train ZeroR on the training set
zeroR = DummyClassifier(strategy='constant', constant=0)
zeroR.fit(X, y)

cv_results = cross_validate(zeroR, X, y, cv=NUM_FOLDS, scoring=[
                            'f1_macro', 'accuracy', 'f1_micro'])

print('accuracy: ', cv_results['test_accuracy'].mean())
print('f1_macro: ', cv_results['test_f1_macro'].mean())
print('f1_micro: ', cv_results['test_f1_micro'].mean())


accuracy:  0.8998500000000001
f1_macro:  0.47364265563458696
f1_micro:  0.8998500000000001


In [32]:
# Make predictions
zero_r_pred = zeroR.predict(X_test)

zero_r_accuracy = accuracy_score(y_test, zero_r_pred)
print("ZeroR Accuracy:", zero_r_accuracy)

print("Classification Report:")
print(classification_report(y_test, zero_r_pred))

ZeroR Accuracy: 0.89917
Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     89917
           1       0.00      0.00      0.00     10083

    accuracy                           0.90    100000
   macro avg       0.45      0.50      0.47    100000
weighted avg       0.81      0.90      0.85    100000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
