In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import numpy as np

In [2]:
import torchvision as tv
import time

In [3]:
BATCH_SIZE=256

In [4]:
train_dataset = tv.datasets.FashionMNIST('.', train=True, transform=tv.transforms.ToTensor(), download=True)
test_dataset = tv.datasets.FashionMNIST('.', train=False, transform=tv.transforms.ToTensor(), download=True)
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [5]:
train_dataset[0][0].shape

torch.Size([1, 28, 28])

In [6]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 10)
)

In [7]:
loss = torch.nn.CrossEntropyLoss()
trainer = torch.optim.SGD(model.parameters(), lr=.01)
num_epochs = 10

In [8]:
def train_model():
    for ep in range(num_epochs):
        train_iters, train_passed  = 0, 0
        train_loss, train_acc = 0., 0.
        start=time.time()
    
        model.train()
        for X, y in train:
            trainer.zero_grad()
            y_pred = model(X)
            l = loss(y_pred, y)
            l.backward()
            trainer.step()
            train_loss += l.item()
            train_acc += (y_pred.argmax(dim=1) == y).sum().item()
            train_iters += 1
            train_passed += len(X)
    
        test_iters, test_passed  = 0, 0
        test_loss, test_acc = 0., 0.
        model.eval()
        for X, y in test:
            y_pred = model(X)
            l = loss(y_pred, y)
            test_loss += l.item()
            test_acc += (y_pred.argmax(dim=1) == y).sum().item()
            test_iters += 1
            test_passed += len(X)
        
        print("ep: {}, taked: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss / train_iters, train_acc / train_passed,
            test_loss / test_iters, test_acc / test_passed)
        )

In [10]:
train_model()

ep: 0, taked: 7.245, train_loss: 1.7601350500228556, train_acc: 0.55265, test_loss: 1.2999123871326446, test_acc: 0.6515
ep: 1, taked: 7.307, train_loss: 1.0789860649311795, train_acc: 0.6722333333333333, test_loss: 0.9449721798300743, test_acc: 0.6762
ep: 2, taked: 7.467, train_loss: 0.863167611588823, train_acc: 0.7067333333333333, test_loss: 0.8167662739753723, test_acc: 0.7094
ep: 3, taked: 7.517, train_loss: 0.7685404703972187, train_acc: 0.73855, test_loss: 0.7466320350766182, test_acc: 0.7354
ep: 4, taked: 7.513, train_loss: 0.7104806555078385, train_acc: 0.7587333333333334, test_loss: 0.6989318527281284, test_acc: 0.7548
ep: 5, taked: 8.195, train_loss: 0.6686997966563448, train_acc: 0.7739833333333334, test_loss: 0.663357450067997, test_acc: 0.7683
ep: 6, taked: 7.545, train_loss: 0.6364650584281759, train_acc: 0.7869, test_loss: 0.6356029383838177, test_acc: 0.7789
ep: 7, taked: 7.470, train_loss: 0.6106998621149266, train_acc: 0.7962666666666667, test_loss: 0.613337654620409

In [11]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 7.924, train_loss: 0.587893165933325, train_acc: 0.8109666666666666, test_loss: 0.4391354352235794, test_acc: 0.8439
ep: 1, taked: 7.693, train_loss: 0.3846598372814503, train_acc: 0.8601333333333333, test_loss: 0.4147147431969643, test_acc: 0.8501
ep: 2, taked: 7.795, train_loss: 0.35326170122369804, train_acc: 0.8705666666666667, test_loss: 0.3897495724260807, test_acc: 0.8611
ep: 3, taked: 7.855, train_loss: 0.3323100947953285, train_acc: 0.8774166666666666, test_loss: 0.37809828817844393, test_acc: 0.8651
ep: 4, taked: 7.861, train_loss: 0.31795938534939544, train_acc: 0.8820166666666667, test_loss: 0.409425650909543, test_acc: 0.8578
ep: 5, taked: 7.945, train_loss: 0.3078056680395248, train_acc: 0.8858833333333334, test_loss: 0.3740345239639282, test_acc: 0.8683
ep: 6, taked: 7.800, train_loss: 0.29441306952466356, train_acc: 0.8896333333333334, test_loss: 0.3830126073211432, test_acc: 0.8687
ep: 7, taked: 7.727, train_loss: 0.2880462475913636, train_acc: 0.89303333

In [12]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10)
)

In [13]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 9.239, train_loss: 0.5796468372040606, train_acc: 0.78615, test_loss: 0.4751006603240967, test_acc: 0.8228
ep: 1, taked: 8.900, train_loss: 0.39172746558138666, train_acc: 0.85625, test_loss: 0.4041179705411196, test_acc: 0.8591
ep: 2, taked: 8.877, train_loss: 0.35894011354192773, train_acc: 0.8677, test_loss: 0.4030242063105106, test_acc: 0.8621
ep: 3, taked: 9.734, train_loss: 0.33736372482269367, train_acc: 0.8757, test_loss: 0.36005138978362083, test_acc: 0.8682
ep: 4, taked: 9.627, train_loss: 0.31821161629037653, train_acc: 0.8836333333333334, test_loss: 0.38276813067495824, test_acc: 0.8572
ep: 5, taked: 9.720, train_loss: 0.3124156154216604, train_acc: 0.8847, test_loss: 0.3977614603936672, test_acc: 0.8589
ep: 6, taked: 9.798, train_loss: 0.29894380639208123, train_acc: 0.8893, test_loss: 0.3727261498570442, test_acc: 0.8699
ep: 7, taked: 9.691, train_loss: 0.2987341870652868, train_acc: 0.8895, test_loss: 0.40575418025255205, test_acc: 0.8668
ep: 8, taked: 9.68

In [14]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [15]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 9.618, train_loss: 0.47627001290625715, train_acc: 0.82325, test_loss: 0.4551668807864189, test_acc: 0.829
ep: 1, taked: 11.698, train_loss: 0.37274380871590146, train_acc: 0.8638666666666667, test_loss: 0.4183744054287672, test_acc: 0.8501
ep: 2, taked: 9.800, train_loss: 0.3368694584420387, train_acc: 0.8759666666666667, test_loss: 0.382206367701292, test_acc: 0.8653
ep: 3, taked: 9.577, train_loss: 0.3111954168436375, train_acc: 0.8855, test_loss: 0.38737270701676607, test_acc: 0.8645
ep: 4, taked: 9.721, train_loss: 0.29096772512222857, train_acc: 0.8923833333333333, test_loss: 0.38593940418213607, test_acc: 0.8662
ep: 5, taked: 9.459, train_loss: 0.2785996473215996, train_acc: 0.8956, test_loss: 0.3617354419082403, test_acc: 0.8678
ep: 6, taked: 9.714, train_loss: 0.26541429276162004, train_acc: 0.90105, test_loss: 0.36743786111474036, test_acc: 0.8703
ep: 7, taked: 9.695, train_loss: 0.2555426927957129, train_acc: 0.905, test_loss: 0.3914118766784668, test_acc: 0.86

In [16]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 2560),
    torch.nn.ReLU(),
    torch.nn.Dropout(0,5),
    torch.nn.Linear(2560, 1280),
    torch.nn.ReLU(),
    torch.nn.Dropout(0,5),
    torch.nn.Linear(1280, 640),
    torch.nn.ReLU(),
    torch.nn.Dropout(0,5),
    torch.nn.Linear(640, 10)
)

In [17]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 29.867, train_loss: 1.204211634524325, train_acc: 0.76635, test_loss: 0.47525575906038287, test_acc: 0.8249
ep: 1, taked: 30.029, train_loss: 0.4079807149603012, train_acc: 0.8504833333333334, test_loss: 0.4303401470184326, test_acc: 0.8485
ep: 2, taked: 30.790, train_loss: 0.3764487272247355, train_acc: 0.8627, test_loss: 0.43754935935139655, test_acc: 0.8493
ep: 3, taked: 38.728, train_loss: 0.37185923926373743, train_acc: 0.86385, test_loss: 0.4115158535540104, test_acc: 0.8577
ep: 4, taked: 42.269, train_loss: 0.35209068288194373, train_acc: 0.8705333333333334, test_loss: 0.3933768939226866, test_acc: 0.8613
ep: 5, taked: 40.765, train_loss: 0.33385013119971496, train_acc: 0.8781, test_loss: 0.4096515320241451, test_acc: 0.856
ep: 6, taked: 37.845, train_loss: 0.32761769776648664, train_acc: 0.87765, test_loss: 0.40887233018875124, test_acc: 0.8538
ep: 7, taked: 37.922, train_loss: 0.33163515830293616, train_acc: 0.87745, test_loss: 0.4142324265092611, test_acc: 0.858

In [20]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [21]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 8.251, train_loss: 0.4644355477170741, train_acc: 0.8303666666666667, test_loss: 0.4350229226052761, test_acc: 0.8396
ep: 1, taked: 7.841, train_loss: 0.3661520337804835, train_acc: 0.8664666666666667, test_loss: 0.40292198173701765, test_acc: 0.8529
ep: 2, taked: 7.789, train_loss: 0.3361668385723804, train_acc: 0.8771666666666667, test_loss: 0.3995236875489354, test_acc: 0.8553
ep: 3, taked: 7.894, train_loss: 0.3135426676019709, train_acc: 0.8852166666666667, test_loss: 0.4074003094807267, test_acc: 0.8486
ep: 4, taked: 7.917, train_loss: 0.2955461283313467, train_acc: 0.8906833333333334, test_loss: 0.4066134406253695, test_acc: 0.8585
ep: 5, taked: 7.927, train_loss: 0.28052983987838664, train_acc: 0.8959166666666667, test_loss: 0.41030587246641514, test_acc: 0.8504
ep: 6, taked: 7.933, train_loss: 0.26484038823462547, train_acc: 0.9004, test_loss: 0.4403573803603649, test_acc: 0.8629
ep: 7, taked: 7.950, train_loss: 0.2568029174145232, train_acc: 0.9036833333333333, 

In [24]:
trainer = torch.optim.RMSprop(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 7.787, train_loss: 0.33952321422860976, train_acc: 0.8802333333333333, test_loss: 0.5620562620460987, test_acc: 0.8065
ep: 1, taked: 8.028, train_loss: 0.25805019121220774, train_acc: 0.9038166666666667, test_loss: 0.5188459839671851, test_acc: 0.8221
ep: 2, taked: 8.027, train_loss: 0.24104834325770114, train_acc: 0.9079833333333334, test_loss: 0.5447124138474464, test_acc: 0.833
ep: 3, taked: 7.834, train_loss: 0.22624286102487687, train_acc: 0.91405, test_loss: 0.458222671598196, test_acc: 0.8695
ep: 4, taked: 7.726, train_loss: 0.21904112512760973, train_acc: 0.9176666666666666, test_loss: 0.515175261721015, test_acc: 0.8383
ep: 5, taked: 7.751, train_loss: 0.211131962182674, train_acc: 0.9197333333333333, test_loss: 0.5323040628805756, test_acc: 0.8457
ep: 6, taked: 7.746, train_loss: 0.20312167425104913, train_acc: 0.9217666666666666, test_loss: 0.5364961341023445, test_acc: 0.8512
ep: 7, taked: 7.786, train_loss: 0.19139417400385472, train_acc: 0.9273666666666667, 

In [25]:
trainer = torch.optim.SGD(model.parameters(), lr=.01)
train_model()

ep: 0, taked: 7.861, train_loss: 0.18784283531155993, train_acc: 0.9284833333333333, test_loss: 0.46616086848080157, test_acc: 0.8792
ep: 1, taked: 7.679, train_loss: 0.17732520502932528, train_acc: 0.9325166666666667, test_loss: 0.4581877153366804, test_acc: 0.8808
ep: 2, taked: 7.778, train_loss: 0.17235297186577575, train_acc: 0.9344, test_loss: 0.4530596468597651, test_acc: 0.8826
ep: 3, taked: 7.719, train_loss: 0.1689288723500485, train_acc: 0.9356333333333333, test_loss: 0.44932129234075546, test_acc: 0.8828
ep: 4, taked: 7.611, train_loss: 0.16631260125878009, train_acc: 0.9366, test_loss: 0.4464335225522518, test_acc: 0.8842
ep: 5, taked: 7.575, train_loss: 0.16419577138855102, train_acc: 0.93735, test_loss: 0.4440814480185509, test_acc: 0.8845
ep: 6, taked: 7.832, train_loss: 0.16243436672586076, train_acc: 0.9381, test_loss: 0.4421222865581512, test_acc: 0.8845
ep: 7, taked: 7.875, train_loss: 0.16092378045333192, train_acc: 0.93865, test_loss: 0.4404309745877981, test_acc: 

Лучшей моделью оказалась модель со средним количеством слоев, SGD и BatchNorm. Удалось выйти на результат выше 88% как на обучающей, так и на тестовой выборке. При данных параметрах на обучающей выборке потери показывают лучший результат. На тестовой средний результат, однако переобучения вроде бы нет, так как результат с каждой эпохой падал.