# Pipeline de Dados

Baixando, transformando e salvando os dados.


In [1]:
import numpy as np
import gzip
import urllib.request
import os

In [2]:
def download_file(url, source_path):
    """Download ``url`` and store it under ``source_path``.

    The output file name is the last ``/``-separated component of ``url``,
    appended directly to ``source_path``. No path separator is inserted, so
    ``source_path`` is expected to already end with one.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    source_path : str
        Prefix (normally a directory ending in ``/``) for the output file.

    Returns
    -------
    None
        Any exception raised while downloading or writing is caught and
        printed; nothing is re-raised.
    """
    out_file = source_path + url.split("/")[-1]

    try:
        # Open the URL exactly once, inside a context manager, so the
        # connection is always closed and the file is not requested twice.
        with urllib.request.urlopen(url) as response:
            payload = response.read()

        # Only touch the destination once the full payload is in memory, so
        # a failed download does not leave an empty or truncated file behind.
        with open(out_file, 'wb') as f:
            f.write(payload)
    except Exception as e:
        # Best-effort download: report the problem and let the caller go on.
        print(e)

In [3]:
# Directories: raw downloads are read from source_path, processed arrays are
# written to data_path.
source_path, data_path = '../data/raw/', '../data/processed/'

In [4]:
# Download the data: the four gzip-compressed MNIST archives (images and
# labels for the 'train' and 't10k' splits).
urls = [
    'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
    'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
]

# Each archive is saved under source_path using the last component of its URL
# as the file name.
for url in urls:
    download_file(url,source_path)

In [5]:
def load_mnist(path, kind='train'):
    """Read the gzip-compressed MNIST label and image files for one split.

    The files are looked up at ``<path>/<kind>-labels-idx1-ubyte.gz`` and
    ``<path>/<kind>-images-idx3-ubyte.gz``.

    Returns a tuple ``(images, labels)`` of flat ``uint8`` arrays: the image
    bytes that follow a 16-byte header and the label bytes that follow an
    8-byte header.
    """
    label_file = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    image_file = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)

    # Labels: skip the 8-byte header, keep the rest as raw bytes.
    with gzip.open(label_file, 'rb') as handle:
        raw_labels = handle.read()
    labels = np.frombuffer(raw_labels, dtype=np.uint8, offset=8)

    # Images: skip the 16-byte header, keep the pixel bytes unshaped.
    with gzip.open(image_file, 'rb') as handle:
        raw_images = handle.read()
    images = np.frombuffer(raw_images, dtype=np.uint8, offset=16)

    return images, labels

In [6]:
# Load each split, reshape the flat pixel buffer into one 28x28 image per
# label, and save images and labels as .npy files under data_path.

# Training split
train_images, train_labels = load_mnist(path=source_path, kind='train')
train_images = np.reshape(train_images, (len(train_labels), 28, 28))
np.save(data_path + "train_images.npy", train_images)
np.save(data_path + "train_labels.npy", train_labels)

# Test split (stored upstream under the 't10k' prefix)
test_images, test_labels = load_mnist(path=source_path, kind='t10k')
test_images = np.reshape(test_images, (len(test_labels), 28, 28))
np.save(data_path + "test_images.npy", test_images)
np.save(data_path + "test_labels.npy", test_labels)


In [7]:
# Data structure: print the shape of every array, one line per array.
for caption, array in (
    ("train_images:", train_images),
    ("train_labels:", train_labels),
    ("test_images:", test_images),
    ("test_labels:", test_labels),
):
    print(caption, array.shape)

train_images: (60000, 28, 28)
train_labels: (60000,)
test_images: (10000, 28, 28)
test_labels: (10000,)
