In [13]:
import sys
sys.path.append('../python')

import numpy as np
import pytest
import needle as ndl
from needle import backend_ndarray as nd
import needle.nn as nn
import needle.optim as optim
from needle import data as ndldata
import needle.init as init
import needle.ops as ops
import time
from tqdm import tqdm

np.random.seed(4)

# Matmul test - CPU VS GPU

TODO: GPU memory is not released.

In [2]:
# Benchmark shape: (m, n) @ (n, p).
matmul_dims = (5000, 5000, 5000)

m, n, p = matmul_dims

# Random operands; reproducible because the NumPy seed is fixed in the import cell.
_A = np.random.randn(m, n)
_B = np.random.randn(n, p)

# perf_counter is the monotonic, high-resolution clock meant for timing short
# intervals; time.time() follows the wall clock and can jump.
st = time.perf_counter()
_S = _A @ _B
ed = time.perf_counter()
total = ed - st

print("time of cpu:", total)

time of cpu: 0.8920676708221436


In [3]:
# Build the operands on the CUDA device before the timer starts, so the
# construction of A and B is not part of the measurement.
A = nd.array(_A, device = nd.cuda())
B = nd.array(_B, device = nd.cuda())

# perf_counter is the monotonic, high-resolution clock meant for timing short
# intervals; time.time() follows the wall clock and can jump.
# NOTE(review): the recorded ~0.0007 s is implausibly small for a
# 5000x5000x5000 product. If the backend launches the kernel asynchronously,
# this interval only covers the launch -- TODO confirm and synchronize with the
# device before reading the second timestamp.
st = time.perf_counter()
S = A @ B
ed = time.perf_counter()
total = ed - st

print("time of gpu:", total) # 0.0013

time of gpu:Matmul in cuda
 0.0007078647613525391


# Training MNIST On CPU

In [14]:
# Mini-batch size handed to both MNIST data loaders built in this cell.
BATCH_SIZE = 128
# Width passed as `hidden_dim` when the MLPNet models are constructed.
HIDDEN_DIM = 1024

# model
def MLPNet(dim, hidden_dim=100, num_classes=10, device=ndl.cpu_numpy()):
    """Assemble a seven-layer fully connected classifier.

    The stack is Linear layers separated by ReLU activations with widths
    ``dim -> hidden_dim (x4) -> 512 -> 256 -> num_classes``; no activation
    follows the final Linear layer.

    Args:
        dim: Number of input features per sample.
        hidden_dim: Output width of the first four Linear layers.
        num_classes: Output width of the last Linear layer.
        device: Device forwarded to every ``nn.Linear``. The default object is
            created once, when this function is defined.

    Returns:
        An ``nn.Sequential`` holding the layers in forward order.
    """
    # Feature width at every layer boundary, input first and output last.
    widths = [dim] + [hidden_dim] * 4 + [512, 256, num_classes]

    modules = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        # A ReLU goes between consecutive Linear layers, never after the last.
        if modules:
            modules.append(nn.ReLU())
        modules.append(
            nn.Linear(in_features=fan_in, out_features=fan_out, device=device)
        )
    return nn.Sequential(*modules)

# Training split, served in shuffled mini-batches.
mnist_train_dataset = ndldata.MNISTDataset(
    "../data/train-images-idx3-ubyte.gz",
    "../data/train-labels-idx1-ubyte.gz",
)
mnist_train_dataloader = ndldata.DataLoader(
    dataset=mnist_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Test split, served in order (shuffle disabled).
mnist_test_dataset = ndldata.MNISTDataset(
    "../data/t10k-images-idx3-ubyte.gz",
    "../data/t10k-labels-idx1-ubyte.gz",
)
mnist_test_dataloader = ndldata.DataLoader(
    dataset=mnist_test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)



In [5]:
def train_CPU(n_epochs, optimizer, model, loss_fn, train_loader):
    """Train ``model`` on the loader's tensors and print per-epoch statistics.

    For every batch the inputs are flattened to ``(batch, -1)``, the loss is
    computed, and one optimizer step is taken. After each epoch the mean batch
    loss and the running accuracy over all samples seen in that epoch are
    printed.

    Args:
        n_epochs: Number of full passes over ``train_loader``.
        optimizer: Object providing ``reset_grad()`` and ``step()``; this is
            the optimizer that gets stepped (no global is consulted).
        model: Callable module providing ``train()``.
        loss_fn: Callable ``(logits, labels) -> loss`` whose result provides
            ``backward()``.
        train_loader: Iterable of batches; ``batch[0]`` holds the inputs and
            ``batch[1]`` the labels.

    Returns:
        None. If the loader yields no batches, ``nan`` is printed for both
        statistics instead of raising.
    """
    model.train()
    for epoch in range(1, n_epochs + 1):
        n_sample, correct = 0, 0
        loss_list = []
        for _, batch in tqdm(enumerate(train_loader)):
            x, y = batch[0], batch[1]
            x = x.reshape((x.shape[0], -1))
            y_hat = model(x)

            loss = loss_fn(y_hat, y)
            loss_list.append(loss.cached_data.numpy())

            # Compare predictions and labels directly as NumPy arrays; no need
            # to wrap the argmax in a backend array just to test equality.
            pred = np.argmax(y_hat.cached_data.numpy(), axis=1)
            correct += int(np.sum(pred == y.cached_data.numpy()))
            n_sample += x.shape[0]

            optimizer.reset_grad()
            loss.backward()
            optimizer.step()

        # Epoch summary is computed once, after the loop, under its own names
        # so the per-batch `loss` tensor is never overwritten mid-iteration.
        if n_sample:
            epoch_acc = correct / n_sample
            epoch_loss = np.mean(loss_list)
        else:
            epoch_acc = epoch_loss = float('nan')
        print('Epoch {} - loss {} - acc {}'.format(epoch, epoch_loss, epoch_acc))


# CPU baseline: model on MLPNet's default device, softmax loss, Adam optimizer.
# 784 input features -- presumably 28x28 MNIST images flattened; TODO confirm.
model = MLPNet(dim=784, hidden_dim=HIDDEN_DIM)
loss_fn = nn.SoftmaxLoss()
opt = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=0.001,
)

In [6]:
# Five epochs of CPU training on the shuffled MNIST training loader.
train_CPU(
    n_epochs=5,
    optimizer=opt,
    model=model,
    loss_fn=loss_fn,
    train_loader=mnist_train_dataloader,
)

600it [00:05, 107.71it/s]


Epoch 1 - loss 0.3711097538471222 - acc 0.8967666666666667


600it [00:05, 104.80it/s]


Epoch 2 - loss 0.18391425907611847 - acc 0.94865


600it [00:05, 109.75it/s]


Epoch 3 - loss 0.14476287364959717 - acc 0.9604833333333334


600it [00:05, 106.82it/s]


Epoch 4 - loss 0.12549129128456116 - acc 0.9660666666666666


600it [00:05, 109.27it/s]

Epoch 5 - loss 0.1112636849284172 - acc 0.9701166666666666





# Training MNIST On GPU

In [15]:
def train_GPU(n_epochs, optimizer, model, loss_fn, train_loader):
    """Train ``model`` on CUDA copies of each batch and print per-epoch statistics.

    Every batch is moved with ``.cuda()``, the inputs are flattened to
    ``(batch, -1)``, the loss is computed, and one optimizer step is taken.
    After each epoch the mean batch loss and the running accuracy over all
    samples seen in that epoch are printed.

    Args:
        n_epochs: Number of full passes over ``train_loader``.
        optimizer: Object providing ``reset_grad()`` and ``step()``; this is
            the optimizer that gets stepped (no global is consulted).
        model: Callable module providing ``train()``.
        loss_fn: Callable ``(logits, labels) -> loss`` whose result provides
            ``backward()``.
        train_loader: Iterable of batches; ``batch[0]`` holds the inputs and
            ``batch[1]`` the labels, both providing ``.cuda()``.

    Returns:
        None. If the loader yields no batches, ``nan`` is printed for both
        statistics instead of raising.
    """
    model.train()
    for epoch in range(1, n_epochs + 1):
        n_sample, correct = 0, 0
        loss_list = []
        for _, batch in tqdm(enumerate(train_loader)):
            x, y = batch[0], batch[1]
            gx = x.cuda()
            gy = y.cuda()
            gx = gx.reshape((gx.shape[0], -1))
            y_hat = model(gx)

            loss = loss_fn(y_hat, gy)
            loss_list.append(loss.cached_data.numpy())

            # The logits are already pulled to NumPy for the argmax, so the
            # comparison is done there too instead of shipping the predictions
            # back to the device for an element-wise equality.
            pred = np.argmax(y_hat.cached_data.numpy(), axis=1)
            correct += int(np.sum(pred == gy.cached_data.numpy()))
            n_sample += gx.shape[0]

            optimizer.reset_grad()
            loss.backward()
            optimizer.step()

        # Epoch summary is computed once, after the loop, under its own names
        # so the per-batch `loss` tensor is never overwritten mid-iteration.
        if n_sample:
            epoch_acc = correct / n_sample
            epoch_loss = np.mean(loss_list)
        else:
            epoch_acc = epoch_loss = float('nan')
        print('Epoch {} - loss {} - acc {}'.format(epoch, epoch_loss, epoch_acc))

In [16]:
# Training
gpu_device = ndl.cuda()
# Presumably sets up the cuBLAS state used by the CUDA backend -- TODO confirm.
gpu_device.init_cublas()
try:
    gmodel = MLPNet(784, HIDDEN_DIM, device=gpu_device)
    loss_fn = nn.SoftmaxLoss()
    opt = optim.Adam(gmodel.parameters(), lr=0.001, weight_decay=0.001)

    train_GPU(n_epochs = 5, optimizer = opt, model = gmodel, loss_fn = loss_fn, train_loader = mnist_train_dataloader)
finally:
    # Always pair init_cublas() with clean_cublas(), even when training raises
    # or the cell is interrupted.
    gpu_device.clean_cublas()

469it [00:43, 10.86it/s]


Epoch 1 - loss 0.23252712190151215 - acc 0.9286166666666666


469it [00:42, 10.93it/s]


Epoch 2 - loss 0.12316399812698364 - acc 0.9639333333333333


469it [00:43, 10.89it/s]


Epoch 3 - loss 0.10738688707351685 - acc 0.9689333333333333


469it [00:42, 10.91it/s]


Epoch 4 - loss 0.09373391419649124 - acc 0.9727166666666667


469it [00:43, 10.90it/s]

Epoch 5 - loss 0.07917285710573196 - acc 0.9769666666666666





In [18]:
# Save a checkpoint of gmodel, then restore it into a freshly built model.
# Step 1: write gmodel's state dict to disk.
ndl.save('../weights/gmodel1024l7_5epoch.pkl', gmodel.state_dict())

# Step 2: build a new model with the same constructor arguments on the CUDA
# device, read the checkpoint back with the CUDA device as target, and copy it
# into the new model.
# NOTE(review): the .pkl extension suggests pickle-based serialization -- if
# so, only load checkpoint files from trusted sources; TODO confirm.
gmodel_new = MLPNet(784, HIDDEN_DIM, device=ndl.cuda())
gmodel_ckpt = ndl.load('../weights/gmodel1024l7_5epoch.pkl', device=ndl.cuda())
gmodel_new.load_state_dict(gmodel_ckpt)

In [7]:
# Transfer training: continue optimizing the model restored from the checkpoint.
loss_fn = nn.SoftmaxLoss()
opt = optim.Adam(gmodel_new.parameters(), lr=0.001, weight_decay=0.001)

# Training
gpu_device = ndl.cuda()
# Presumably sets up the cuBLAS state used by the CUDA backend -- TODO confirm.
gpu_device.init_cublas()
try:
    train_GPU(n_epochs = 5, optimizer = opt, model = gmodel_new, loss_fn = loss_fn, train_loader = mnist_train_dataloader)
finally:
    # Always pair init_cublas() with clean_cublas(), even when training raises
    # or the cell is interrupted.
    gpu_device.clean_cublas()

# Reached only when training finished without an exception.
# NOTE(review): this file name says "284l4" while the checkpoint this model was
# restored from is named "1024l7" -- confirm the intended name.
ndl.save('../weights/gmodel284l4_5epoch.pkl', gmodel_new.state_dict())

468it [00:05, 91.42it/s]


Epoch 1 - loss 0.06674222648143768 - acc 0.9794895777659006


469it [00:05, 93.50it/s]


Epoch 2 - loss 0.06045741215348244 - acc 0.9820666666666666


469it [00:05, 92.85it/s]


Epoch 3 - loss 0.05804314836859703 - acc 0.9824666666666667


469it [00:05, 92.63it/s]


Epoch 4 - loss 0.054068706929683685 - acc 0.9840666666666666


469it [00:05, 89.57it/s]

Epoch 5 - loss 0.05034878849983215 - acc 0.9853666666666666





# Evaluation

In [19]:
def eval_GPU(model, test_loader):
    """Measure classification accuracy of ``model`` on CUDA copies of each batch.

    Every batch is moved with ``.cuda()``, the inputs are flattened to
    ``(batch, -1)``, and the arg-max of the model output is compared with the
    labels. The overall accuracy is printed and returned.

    Args:
        model: Callable module providing ``eval()``.
        test_loader: Iterable of batches; ``batch[0]`` holds the inputs and
            ``batch[1]`` the labels, both providing ``.cuda()``.

    Returns:
        Fraction of correctly classified samples, or ``nan`` when the loader
        yields no samples.
    """
    model.eval()
    n_sample, correct = 0, 0
    for _, batch in tqdm(enumerate(test_loader)):
        x, y = batch[0], batch[1]
        gx = x.cuda()
        gy = y.cuda()
        gx = gx.reshape((gx.shape[0], -1))
        y_hat = model(gx)

        # The logits are already pulled to NumPy for the argmax, so the
        # comparison is done there too instead of shipping the predictions
        # back to the device for an element-wise equality.
        pred = np.argmax(y_hat.cached_data.numpy(), axis=1)
        correct += int(np.sum(pred == gy.cached_data.numpy()))
        n_sample += gx.shape[0]

    # Guard the division so an empty loader reports nan instead of raising.
    acc = correct / n_sample if n_sample else float('nan')
    print('acc {}'.format(acc))
    return acc

In [23]:
gpu_device = ndl.cuda()
# Presumably sets up the cuBLAS state used by the CUDA backend -- TODO confirm.
gpu_device.init_cublas()
try:
    # perf_counter is the monotonic, high-resolution clock meant for timing
    # short intervals.
    # NOTE(review): this evaluates `gmodel`, not the fine-tuned `gmodel_new`
    # -- confirm which model is meant to be reported.
    st = time.perf_counter()
    eval_GPU(gmodel, test_loader=mnist_test_dataloader)
    print(time.perf_counter() - st)
finally:
    # Always pair init_cublas() with clean_cublas(), even when evaluation raises.
    gpu_device.clean_cublas()

79it [00:00, 371.05it/s]

acc 0.9646
0.2165372371673584



