In [64]:
%pip install numpy pandas scikit-learn

from contextlib import contextmanager
from json import dumps, load
from pathlib import Path
from timeit import default_timer
from typing import Callable, Dict

from numpy import append, array, float64, ndarray, reshape, uint8, unique
from numpy.ma import masked_array, masked_less_equal
from numpy.typing import NDArray

from pandas import DataFrame

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import Bunch

Note: you may need to restart the kernel to use updated packages.


In [None]:
@contextmanager
def timer():
  '''
  context manager measuring elapsed time with `timeit.default_timer`
  (adapted from given starter code)

  yields a zero-argument callable: while the `with` block is running it returns the
  seconds elapsed so far, once the block has finished it keeps returning the frozen total

  :return: generator used by `contextmanager`; the `as` target is the elapsed-time callable
  '''

  start = default_timer()
  end = None  # stays None while the `with` block is still running

  def elapsed() -> float:
    # `end` is read through the closure, so the rebinding below is visible here
    return (default_timer() if end is None else end) - start

  try:
    yield elapsed
  finally:
    # runs on normal exit and when the block raises, so the reading is always frozen
    end = default_timer()


def time(name: str, model: OneVsRestClassifier | SGDClassifier | SVC, X: NDArray[float64], y: NDArray[uint8]) -> None:
  '''
  time fitting models, caching the measurement on disk

  the duration of `model.fit(X, y)` is printed and stored in `times/{name}.txt`; when that
  file already exists its contents are printed instead and `model` is NOT fitted

  :param name: the name of the model (also the stem of the cache file)
  :type name: str
  :param model: the model to fit
  :type model: OneVsRestClassifier | SGDClassifier | SVC
  :param X: the training feature set
  :type X: NDArray[float64]
  :param y: the training label set
  :type y: NDArray[uint8]
  '''

  cache: Path = Path(f'times/{name}.txt')
  if cache.is_file():
    with open(cache, 'r') as cached:
      print(f'\ntraining time: {cached.read()}')
    return

  # create the cache directory before the (possibly long) fit, so a missing
  # `times/` folder cannot throw the measurement away afterwards
  cache.parent.mkdir(parents=True, exist_ok=True)

  with timer() as fit_timer:
    model.fit(X, y)

  # the timer is frozen once the `with` block has exited
  curr_time: float = fit_timer()
  print(f'training time: {curr_time}')
  with open(cache, 'w') as out:
    out.write(str(curr_time))

def crop_time(name: str, X_train: NDArray[float64], X_test: NDArray[float64], model: OneVsRestClassifier | SGDClassifier | SVC, y_train: NDArray[uint8], y_test: NDArray[uint8]) -> None:
  '''
  time fitting models on progressively cropped images, caching the result on disk

  every row of `X_train` / `X_test` is treated as a flattened 28x28 image; for border widths
  1..13 the outer `border` rows and columns are removed on each side, `model` is refitted and
  its micro-averaged f1-score on the test set is measured; the search stops at the first crop
  that scores lower than the previously accepted one, and the last accepted result is written
  to `times/{name}.json`; when that file already exists its contents are printed instead and
  nothing is fitted

  note: the stored `dim` is `28 - border` (the exclusive end index of the slice); the cropped
  image itself has `28 - 2 * border` pixels per side

  :param name: the name of the model (also the stem of the cache file)
  :type name: str
  :param X_train: the feature set to train with
  :type X_train: NDArray[float64]
  :param X_test: the feature set to test with
  :type X_test: NDArray[float64]
  :param model: the model to train
  :type model: OneVsRestClassifier | SGDClassifier | SVC
  :param y_train: the training labels
  :type y_train: NDArray[uint8]
  :param y_test: the testing labels
  :type y_test: NDArray[uint8]
  '''

  def crop(X: NDArray[float64], lo: int, hi: int) -> NDArray[float64]:
    '''keep rows and columns lo..hi-1 of every flattened 28x28 image in X, flattened again'''
    window = X.reshape(-1, 28, 28)[:, lo:hi, lo:hi]
    return window.reshape(window.shape[0], window.shape[1] * window.shape[2])

  info: Dict[str, int | float] = {'dim': 28, 'time': float('inf'), 'perf': 0}
  cache: Path = Path(f'times/{name}.json')

  if cache.is_file():
    with open(cache, 'r') as json_file:
      info = load(json_file)
    print(f'\ndim.: {info["dim"]}\ntraining time: {info["time"]}\nf1-score: {info["perf"]}')
    return

  # make sure the result can be saved before spending time on training
  cache.parent.mkdir(parents=True, exist_ok=True)

  for border in range(1, 14):
    curr_dim: int = 28 - border
    X_train_crop: NDArray[float64] = crop(X_train, border, curr_dim)
    X_test_crop: NDArray[float64] = crop(X_test, border, curr_dim)

    with timer() as fit_timer:
      model.fit(X_train_crop, y_train)
    score: float = f1_score(y_test, model.predict(X_test_crop), average='micro')
    # the timer is frozen on leaving the `with` block, so scoring time is not included
    curr_time: float = fit_timer()
    print(f'\ndim.: {curr_dim}\ntraining time: {curr_time}\nf1-score: {score}')

    # stop at the first crop that hurts performance; keep the previous result
    if score < info['perf']:
      break
    info = {'dim': curr_dim, 'time': curr_time, 'perf': score}

  with open(cache, 'w') as json_file:
    json_file.write(dumps(info, indent=2))

def importance_time(name: str, importances_unique: NDArray[float64], importances: NDArray[float64], X: NDArray[float64], model: OneVsRestClassifier | SGDClassifier | SVC, y_train: NDArray[uint8], y_test: NDArray[uint8]) -> None:
  '''
  time fitting models on the most important features only, caching the result on disk

  for every threshold in `importances_unique` (expected in ascending order) the features
  whose importance is less than or equal to the threshold are dropped, `model` is refitted
  and its micro-averaged f1-score on the test rows is measured; the search stops at the
  first threshold that scores lower than the previously accepted one (or that would leave
  no feature at all), and the last accepted result is written to `times/{name}.json`; when
  that file already exists its contents are printed instead and nothing is fitted

  :param name: the name of the model (also the stem of the cache file)
  :type name: str
  :param importances_unique: the unique importance values
  :type importances_unique: NDArray[float64]
  :param importances: the importances of each feature (one value per column of `X`)
  :type importances: NDArray[float64]
  :param X: the feature set, training rows first, followed by the testing rows
  :type X: NDArray[float64]
  :param model: the model to train
  :type model: OneVsRestClassifier | SGDClassifier | SVC
  :param y_train: the training labels; their count decides where `X` is split
  :type y_train: NDArray[uint8]
  :param y_test: the testing labels
  :type y_test: NDArray[uint8]
  '''

  info: Dict[str, float] = {'thresh': 0, 'time': float('inf'), 'perf': 0}
  cache: Path = Path(f'times/{name}.json')

  if cache.is_file():
    with open(cache, 'r') as json_file:
      info = load(json_file)
    print(f'\nthresh.: {info["thresh"]}\ntraining time: {info["time"]}\nf1-score: {info["perf"]}')
    return

  # make sure the result can be saved before spending time on training
  cache.parent.mkdir(parents=True, exist_ok=True)

  # the first len(y_train) rows of X belong to the training set
  n_train: int = len(y_train)

  for thresh in importances_unique:
    # keep only the columns that are strictly more important than the threshold
    keep: NDArray[bool] = importances > thresh
    if not keep.any():
      # nothing left to train on; keep the last accepted result
      break
    X_mask: NDArray[float64] = X[:, keep]
    X_train_mask: NDArray[float64] = X_mask[:n_train]
    X_test_mask: NDArray[float64] = X_mask[n_train:]

    with timer() as fit_timer:
      model.fit(X_train_mask, y_train)
    score: float = f1_score(y_test, model.predict(X_test_mask), average='micro')
    # the timer is frozen on leaving the `with` block, so scoring time is not included
    curr_time: float = fit_timer()
    print(f'\nthresh.: {thresh}\ntraining time: {curr_time}\nf1-score: {score}')

    # stop at the first threshold that hurts performance; keep the previous result
    if score < info['perf']:
      break
    info = {'thresh': thresh, 'time': curr_time, 'perf': score}

  with open(cache, 'w') as json_file:
    json_file.write(dumps(info, indent=2))

In [66]:
# fetch_openml returns a Bunch (dict-like container), not a DataFrame; as_frame=True makes
# explicit that 'data' / 'target' are pandas objects, which .astype(...).to_numpy() relies on
mnist: Bunch = fetch_openml('mnist_784', version=1, as_frame=True)

X: NDArray[float64] = mnist['data'].astype(float64).to_numpy()
y: NDArray[uint8] = mnist['target'].astype(uint8).to_numpy()

# the first TRAIN_SIZE rows are used for training, the remaining rows for testing
TRAIN_SIZE: int = 60000

X_train: NDArray[float64] = X[:TRAIN_SIZE]
X_test: NDArray[float64] = X[TRAIN_SIZE:]

y_train: NDArray[uint8] = y[:TRAIN_SIZE]
y_test: NDArray[uint8] = y[TRAIN_SIZE:]

# standardise the training features; the scaler is fitted on the training rows only
scaler: StandardScaler = StandardScaler()
X_train_scale: NDArray[float64] = scaler.fit_transform(X_train)

In [67]:
# time an SGD classifier on the full standardised training set
time(
  name='sgd',
  model=SGDClassifier(random_state=42, n_jobs=-1),
  X=X_train_scale,
  y=y_train,
)


training time: 62.35176499999943


In [68]:
# time an SVC (gamma='auto') on the first 1000 standardised training samples
time(
  name='svc_1000',
  model=SVC(gamma='auto', random_state=42),
  X=X_train_scale[:1000],
  y=y_train[:1000],
)


training time: 0.14785229999688454


In [69]:
# time an SVC (gamma='auto') on the first 2000 standardised training samples
time(
  name='svc_2000',
  model=SVC(gamma='auto', random_state=42),
  X=X_train_scale[:2000],
  y=y_train[:2000],
)


training time: 0.4460183000046527


In [70]:
# time an SVC (gamma='auto') on the first 4000 standardised training samples
time(
  name='svc_4000',
  model=SVC(gamma='auto', random_state=42),
  X=X_train_scale[:4000],
  y=y_train[:4000],
)


training time: 1.270653300001868


In [71]:
# time a one-vs-rest wrapper around the same SVC on the first 1000 standardised samples
time(
  name='ovr_1000',
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42), n_jobs=-1),
  X=X_train_scale[:1000],
  y=y_train[:1000],
)


training time: 6.220961700004409


In [72]:
# time a one-vs-rest wrapper around the same SVC on the first 2000 standardised samples
time(
  name='ovr_2000',
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42), n_jobs=-1),
  X=X_train_scale[:2000],
  y=y_train[:2000],
)


training time: 3.7742016999982297


In [73]:
# time a one-vs-rest wrapper around the same SVC on the first 4000 standardised samples
time(
  name='ovr_4000',
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42), n_jobs=-1),
  X=X_train_scale[:4000],
  y=y_train[:4000],
)


training time: 1.4222129999980098


In [74]:
# crop search with an SGD classifier on the raw (unscaled) train / test split
crop_time(
  name='sgd_crop',
  X_train=X_train,
  X_test=X_test,
  model=SGDClassifier(random_state=42, n_jobs=-1),
  y_train=y_train,
  y_test=y_test,
)


dim.: 15
training time: 34.4232697999978
f1-score: 0.874


In [75]:
# crop search with an SVC trained on the first 4000 raw (unscaled) samples, scored on the full test set
crop_time(
  name='svc_crop',
  X_train=X_train[:4000],
  X_test=X_test,
  model=SVC(gamma='auto', random_state=42),
  y_train=y_train[:4000],
  y_test=y_test,
)


dim.: 15
training time: 0.9878120999928797
f1-score: 0.2346


In [76]:
# crop search with a one-vs-rest SVC trained on the first 4000 raw (unscaled) samples
crop_time(
  name='ovr_crop',
  X_train=X_train[:4000],
  X_test=X_test,
  model=OneVsRestClassifier(SVC(gamma='auto', random_state=42), n_jobs=-1),
  y_train=y_train[:4000],
  y_test=y_test,
)


dim.: 16
training time: 0.9502957999939099
f1-score: 0.1883


In [None]:
# fit a 500-tree random forest on the raw training set to obtain per-feature importances
forest: RandomForestClassifier = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
forest.fit(X_train, y_train)
importances: NDArray[float64] = forest.feature_importances_
# numpy.unique already returns its values sorted in ascending order, so no extra sort is needed
importances_unique: NDArray[float64] = unique(importances)

In [None]:
# importance-threshold search with an SGD classifier on the full raw feature matrix
importance_time(
  name='sgd_important',
  importances_unique=importances_unique,
  importances=importances,
  X=X,
  model=SGDClassifier(random_state=42, n_jobs=-1),
  y_train=y_train,
  y_test=y_test,
)