In [None]:

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_openml
import pandas


from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [None]:
# MNIST as published on OpenML (dataset id 554):
# https://www.openml.org/search?type=data&sort=runs&id=554&status=active
# https://www.openml.org/d/554
# return_X_y=True yields a (features, labels) pair; as_frame=False asks for
# array outputs instead of pandas objects.
X, y = fetch_openml(
    "mnist_784",
    version=1,
    return_X_y=True,
    as_frame=False,
)

In [None]:
# Dimensions of the feature array returned by fetch_openml.
X.shape


In [None]:
# Shape of a single row of X (one sample).
X[0].shape


In [None]:
# Render one sample as an image. The row is reshaped to 28x28 for display and
# the title carries the sample's label so the figure can be read on its own.
fig, ax = plt.subplots()
ax.imshow(X[1].reshape(28, 28))
ax.set_title(f"Sample 1 (label: {y[1]})")
plt.show()


In [None]:
# Class distribution over the full dataset, one bar per label (sorted by label).
ax = pandas.Series(y).value_counts().sort_index().plot(kind='bar')
ax.set_title("Label counts: full dataset")
ax.set_xlabel("label")
ax.set_ylabel("number of samples")
plt.show()



In [None]:
# Row positions of the two classes used for the binary task.
# Labels are compared as strings ('7' and '1').
index_sevens = np.flatnonzero(y == '7')
index_ones = np.flatnonzero(y == '1')


In [None]:
# Downsample class '1' to a handful of rows to build an imbalanced dataset.
# A seeded Generator makes the draw repeatable across kernel restarts, and
# replace=False guarantees 10 distinct rows (sampling with replacement, the
# default, can return the same index more than once).
rng = np.random.default_rng(42)
index_ones_sample = rng.choice(index_ones, size=10, replace=False)

In [None]:
# Balanced set: the same number of rows from each class, capped by the size
# of the smaller class.
sample_size = min(len(index_ones), len(index_sevens))
balanced_idx = np.concatenate((index_sevens[:sample_size], index_ones[:sample_size]))
Xb, yb = X[balanced_idx], y[balanced_idx]

# Imbalanced set: every '7' row plus only the downsampled '1' rows.
imbalanced_idx = np.concatenate((index_sevens, index_ones_sample))
Xn, yn = X[imbalanced_idx], y[imbalanced_idx]

In [None]:
# Compare the class distributions of the two datasets. Each figure is titled
# so the balanced and imbalanced plots can be told apart at a glance.
for labels, title in (
    (yb, "Label counts: balanced set"),
    (yn, "Label counts: imbalanced set"),
):
    ax = pandas.Series(labels).value_counts().sort_index().plot(kind='bar')
    ax.set_title(title)
    ax.set_xlabel("label")
    ax.set_ylabel("number of samples")
    plt.show()


In [None]:
# Balanced model: a depth-1 decision tree fitted on a stratified 75/25 split
# of the balanced dataset; prints accuracy on the held-out quarter.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    Xb,
    yb,
    test_size=0.25,
    random_state=42,
    stratify=yb,
)
clf_b = DecisionTreeClassifier(max_depth=1, random_state=42).fit(X_train_b, y_train_b)
y_pred_b = clf_b.predict(X_test_b)
print(accuracy_score(y_test_b, y_pred_b))

In [None]:
# Class distribution of the balanced hold-out labels, one bar per label.
pandas.Series(y_test_b).value_counts().sort_index().plot.bar()


In [None]:
# Imbalanced model: the same depth-1 decision tree, fitted on all '7' rows plus
# the few sampled '1' rows; prints accuracy on the held-out quarter.
# stratify=yn mirrors the balanced split: with so few '1' rows, an unstratified
# split can leave the minority class nearly or entirely absent from one side.
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(
    Xn, yn, test_size=0.25, random_state=42, stratify=yn
)
clf_n = DecisionTreeClassifier(max_depth=1, random_state=42)
clf_n.fit(X_train_n, y_train_n)
y_pred_n = clf_n.predict(X_test_n)
print(accuracy_score(y_test_n, y_pred_n))

In [None]:
# Score the imbalanced-trained tree on the balanced hold-out set.
# NOTE(review): Xb and Xn are both built from index_sevens, so rows in
# X_test_b may also have been part of clf_n's training data; confirm this
# overlap is acceptable for the comparison.
y_pred = clf_n.predict(X_test_b)
print(accuracy_score(y_true=y_test_b, y_pred=y_pred))

In [None]:
# Text report comparing the imbalanced-trained tree's predictions against the
# balanced hold-out labels.
report = classification_report(y_test_b, y_pred)
print(report)
