In [None]:
%pip install numpy pandas scikit-learn

from contextlib import contextmanager
from json import dumps, load
from pathlib import Path
from timeit import default_timer
from typing import Dict

from pandas import DataFrame

from numpy import append, array, float64, ndarray, reshape, uint8
from numpy.typing import NDArray

from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

Note: you may need to restart the kernel to use updated packages.


In [None]:
@contextmanager
def timer():
  '''
  wall-clock timer usable as a context manager (adapted from the given starter code)

  the object bound by ``with timer() as t`` is a zero-argument callable:
  while the block is still running it returns the seconds elapsed so far,
  once the block has exited it keeps returning the total duration of the block

  :return: generator consumed by ``contextmanager``; yields the callable described above
  '''

  start = default_timer()
  end = None  # stays None for as long as the timed block is running

  def elapsed() -> float:
    # live reading while running, frozen reading once `end` has been recorded
    return (default_timer() if end is None else end) - start

  try:
    yield elapsed
  finally:
    # record the end time even when the body raises, so the reading freezes
    # instead of silently continuing to grow after a failed block
    end = default_timer()


def time(name: str, model: OneVsRestClassifier | SGDClassifier | SVC, X: NDArray[float64], y: NDArray[uint8]) -> None:
  '''
  time fitting models, caching the measurement in ``times/<name>.txt``

  if the cache file already exists its content is printed and the model is
  NOT fitted; otherwise the model is fitted, the elapsed time is printed and
  written to the cache file
  :param name: the name of the model (used as the cache file name)
  :type name: str
  :param model: the model to fit
  :type model: OneVsRestClassifier | SGDClassifier | SVC
  :param X: the training feature set
  :type X: NDArray[float64]
  :param y: the training label set
  :type y: NDArray[uint8]
  '''

  cache: Path = Path(f'times/{name}.txt')
  if cache.is_file():
    # a previous measurement exists: report it and skip the expensive fit
    print(f'\ntraining time: {cache.read_text()}')
    return

  # create the cache directory before fitting, otherwise a missing `times/`
  # folder would only surface after the (possibly very long) training run
  cache.parent.mkdir(parents=True, exist_ok=True)

  with timer() as fit_timer:
    model.fit(X, y)

  # the timer is frozen once the with-block has exited
  elapsed: float = fit_timer()
  print(f'training time: {elapsed}')
  cache.write_text(str(elapsed))

def crop_time(name: str, X: NDArray[float64], model: OneVsRestClassifier | SGDClassifier | SVC, y_train: NDArray[uint8], y_test: NDArray[uint8]) -> None:
  '''
  time fitting cropped models, caching the result in ``times/<name>.json``

  every row of ``X`` is treated as a flattened 28x28 image; on iteration ``i``
  a border of ``i`` pixels is removed from each side, the model is re-fitted
  on the cropped training rows and scored (micro f1) on the cropped test rows;
  the search stops as soon as the score drops below the best score kept so
  far, and the last crop that did not lose performance is saved

  if the cache file already exists its content is printed and nothing is fitted
  :param name: the name of the model (used as the cache file name)
  :type name: str
  :param X: the feature set (training rows first, then testing rows)
  :type X: NDArray[float64]
  :param model: the model to train
  :type model: OneVsRestClassifier | SGDClassifier | SVC
  :param y_train: the training labels
  :type y_train: NDArray[uint8]
  :param y_test: the testing labels
  :type y_test: NDArray[uint8]
  '''

  side: int = 28  # each row is reshaped to side x side before cropping
  cache: Path = Path(f'times/{name}.json')
  info: Dict[str, int | float] = {'dim': side, 'time': float('inf'), 'perf': 0}

  if cache.is_file():
    # read back the very file whose existence was just checked
    with open(cache, 'r') as json_file:
      info = load(json_file)
    # double-quoted keys: reusing the outer quote inside an f-string is a
    # SyntaxError before Python 3.12
    print(f'\ndim.: {info["dim"]}\ntraining time: {info["time"]}\nf1-score: {info["perf"]}')
    return

  # make sure the result can be saved before spending time on training
  cache.parent.mkdir(parents=True, exist_ok=True)

  # rows [0, n_train) belong to the training labels, the rest to the test labels
  n_train: int = len(y_train)
  images: NDArray[float64] = X.reshape(-1, side, side)

  for i in range(0, side // 2):
    # removing i pixels from every border leaves a (side - 2i)-pixel square
    curr_dim: int = side - 2 * i
    X_crop: NDArray[float64] = images[:, i:side - i, i:side - i].reshape(len(images), -1)
    X_crop_train: NDArray[float64]
    X_crop_test: NDArray[float64]
    # split the CROPPED features, not the original ones
    X_crop_train, X_crop_test = X_crop[:n_train], X_crop[n_train:]

    with timer() as fit_timer:
      model.fit(X_crop_train, y_train)
    curr_time: float = fit_timer()
    score: float = f1_score(y_test, model.predict(X_crop_test), average='micro')
    print(f'\ndim.: {curr_dim}\ntraining time: {curr_time}\nf1-score: {score}')

    # stop at the first crop that performs worse than the best one kept so far
    if score < info['perf']:
      break
    info = {'dim': curr_dim, 'time': curr_time, 'perf': score}

  with open(cache, 'w') as json_file:
    json_file.write(dumps(info, indent=2))

In [None]:
# fetch_openml returns a dict-like Bunch, not a DataFrame; as_frame=True makes
# sure 'data' and 'target' are pandas objects so .astype(...).to_numpy() works
mnist: Dict = fetch_openml('mnist_784', version=1, as_frame=True)

X: NDArray[float64] = mnist['data'].astype(float64).to_numpy()
y: NDArray[uint8] = mnist['target'].astype(uint8).to_numpy()

# the first TRAIN_SIZE rows are used for training, the remaining rows for testing
TRAIN_SIZE: int = 60000

X_train: NDArray[float64] = X[:TRAIN_SIZE]

y_train: NDArray[uint8]
y_test: NDArray[uint8]
y_train, y_test = y[:TRAIN_SIZE], y[TRAIN_SIZE:]

# standardise the features using statistics computed on the training rows only
scaler: StandardScaler = StandardScaler()
X_train_scale: NDArray[float64] = scaler.fit_transform(X_train)

In [90]:
# time an SGD classifier on the whole scaled training set
time(
  name='sgd',
  model=SGDClassifier(random_state=42),
  X=X_train_scale,
  y=y_train,
)

runtime: 225.77199569999993


In [None]:
# time an SVC on the first 17500 scaled training samples
time(
  name='svc_17500',
  model=SVC(gamma='auto', random_state=42),
  X=X_train_scale[:17500],
  y=y_train[:17500],
)

runtime: 0.12480860000050598


In [None]:
# time an SVC on the first 35000 scaled training samples
time(
  name='svc_35000',
  model=SVC(gamma='auto', random_state=42),
  X=X_train_scale[:35000],
  y=y_train[:35000],
)

runtime: 0.3617899999999281


In [None]:
# time an SVC on the whole scaled training set
# NOTE(review): the label says 70000, but X_train_scale is built from
# X[:60000] — confirm which sample count is intended
time(
  name='svc_70000',
  model=SVC(gamma='auto', random_state=42),
  X=X_train_scale,
  y=y_train,
)

runtime: 1.2012546000005386


In [None]:
# time a one-vs-rest SVC on the first 17500 scaled training samples
time(
  name='ovr_17500',
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42)),
  X=X_train_scale[:17500],
  y=y_train[:17500],
)

runtime: 0.3732375999998112


In [None]:
# time a one-vs-rest SVC on the first 35000 scaled training samples
time(
  name='ovr_35000',
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42)),
  X=X_train_scale[:35000],
  y=y_train[:35000],
)

runtime: 1.6762154000007286


In [None]:
# time a one-vs-rest SVC on the whole scaled training set
# NOTE(review): the label says 70000, but X_train_scale is built from
# X[:60000] — confirm which sample count is intended
time(
  name='ovr_70000',
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42)),
  X=X_train_scale,
  y=y_train,
)

runtime: 7.157275699999445


In [None]:
# crop search with an SGD classifier; keyword arguments guard against the
# parameter order, which differs from time()
# NOTE(review): this passes the raw X, not the standardised features — confirm
crop_time(
  name='sgd_crop',
  X=X,
  model=SGDClassifier(random_state=42),
  y_train=y_train,
  y_test=y_test,
)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 0.  0. 30. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ... 96.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  4.  0.  0.]]
[[ 49. 238. 253. ...  78.   

In [None]:
# crop search with an SVC; keyword arguments guard against the parameter
# order, which differs from time()
# NOTE(review): this passes the raw X, not the standardised features — confirm
crop_time(
  name='svc_crop',
  X=X,
  model=SVC(gamma='auto', random_state=42),
  y_train=y_train,
  y_test=y_test,
)

In [None]:
# crop search with a one-vs-rest SVC; keyword arguments guard against the
# parameter order, which differs from time()
# NOTE(review): this passes the raw X, not the standardised features — confirm
crop_time(
  name='ovr_crop',
  X=X,
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42)),
  y_train=y_train,
  y_test=y_test,
)