# Workshop Amazon SageMaker

In [None]:
import urllib.request
import tarfile
import os
import csv
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Settings shared by the cells below.
# S3 key prefix; presumably consumed by the upload step (not used in this section).
prefix = 'raw'
# Placeholder: must be replaced with the bucket name shown in the CloudFormation output.
bucket_name = 'TROQUE PELO NOME DO BUCKET CRIADO, VERIFIQUE O OUTPUT DO CLOUDFORMATION'
# Metadata CSV that is loaded into the DataFrame further down.
file_name = 'Data_Entry_2017_v2020.csv'
# Working directory of the notebook kernel; archives and CSV files are looked up here.
dir_path = os.getcwd()
s3_client = boto3.client('s3')

## Efetuando o download do dataset

In [None]:
# Image archives: https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345
# Every archive of the dataset is kept as a real list so the amount of data to
# fetch is controlled by the slice below instead of by editing string literals.
all_links = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
    'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
    'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
    'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
    'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
    'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
    'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
    'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
    'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
    'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
    'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
    'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz',
]

# Only the first archive is downloaded by default; use `links = all_links`
# to fetch the complete dataset.
links = all_links[:1]

for idx, link in enumerate(links):
    # Archives are numbered from 1: images_01.tar.gz, images_02.tar.gz, ...
    fn = 'images_%02d.tar.gz' % (idx+1)
    print('downloading ' + fn + '...')
    urllib.request.urlretrieve(link, fn)

print("Download complete. Please check the checksums")

In [None]:
# Descompactando
def untar(fname):
    """Extract every member of the tar archive *fname* into the current directory.

    Args:
        fname: Path of the archive. It is opened with the default read mode
            of ``tarfile.open``, which detects the compression transparently.

    Raises:
        tarfile.TarError / OSError: propagated from ``tarfile`` when the
            archive cannot be opened or extracted.
    """
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(fname) as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        tar.extractall()
    print(f'Arquivos extraidos: {fname}')

In [None]:
# Extract every entry of the working directory whose name ends in "tar.gz".
for file in os.listdir(dir_path):
    if not file.endswith("tar.gz"):
        continue
    untar(file)

## Explorando o dataset

In [None]:
# Load the metadata CSV from the working directory into a DataFrame.
df = pd.read_csv(os.path.join(dir_path, file_name))

In [None]:
# Quick overview of the dataset: dimensions, column types and the first rows.
print('Tamanho do dataset:', df.shape, sep='\n')
print('\nColunas:', df.dtypes, sep='\n')
print('\nExemplo do dado:')
# Kept as the last bare expression so the notebook renders the table.
df.head()

In [None]:
# Collect the distinct patient IDs in random order; the later split slices
# this shuffled array.
patient_ids = df['Patient ID']
uniq_pids = np.random.permutation(np.unique(patient_ids))
total_ids = uniq_pids.shape[0]
print(f'Número de pacientes únicos no dataset: {total_ids}')

In [None]:
# Count how many rows carry each distinct 'Finding Labels' value; the counts
# are the last expression of the cell, so the notebook displays them.
print('Rótulos de doenças no dataset')
df['Finding Labels'].value_counts()

![diseases](diseases.png)

In [None]:
# Number of images per patient gender, drawn as a horizontal bar chart.
df['Patient Gender'].value_counts().plot(kind='barh')

In [None]:
# Number of images per patient age, as a histogram with 10 bins.
plt.hist(df['Patient Age'], bins = 10)
plt.show()

## Preparando o dataset para a etapa de treinamento

In [None]:
# Split the patients into training, validation and test sets.
# trainset / valset / testset are cumulative cut-off positions into the
# shuffled patient-ID array:
#   [0, trainset)        -> training   (70 % of the patients)
#   [trainset, valset)   -> validation (10 % of the patients)
#   [valset, testset)    -> test       (all remaining patients)
trainper = 0.7
valper = 0.1

trainset = int(trainper*total_ids)
valset = trainset+int(valper*total_ids)
# The test set takes every remaining patient, so its cut-off is the end of
# the array (adding the two cumulative indices would exceed the array length).
testset = total_ids

In [None]:
# Slice the shuffled patient IDs at the cut-offs. Slice ends are exclusive,
# so each slice starts exactly where the previous one stopped; starting at
# `cutoff + 1` would silently drop one patient at every boundary.
train = uniq_pids[:trainset]
val = uniq_pids[trainset:valset]
test = uniq_pids[valset:]
print('Número de pacientes: treinamento: %d, validação: %d, teste: %d'%(len(train), len(val), len(test)))

In [None]:
# Select, for each split, the rows whose patient belongs to that split.
pid_column = df['Patient ID']
traindata = df[pid_column.isin(train)]
valdata = df[pid_column.isin(val)]
testdata = df[pid_column.isin(test)]

In [None]:
# Write each split to its own CSV file, without header row or index column.
for split_frame, split_path in (
    (traindata, 'traindata.csv'),
    (valdata, 'valdata.csv'),
    (testdata, 'testdata.csv'),
):
    split_frame.to_csv(split_path, sep=',', header=False, index=False)

In [None]:
# Função para separar a coluna de doenças
def gen_set(csvfile, outputfile):
    """Convert a header-less split CSV into a tab-separated ``.lst`` file.

    For every CSV row one line is written to *outputfile*, made of:
    the 0-based row index, one ``0``/``1`` flag per entry of ``disease_list``
    (in that order), and ``images/<image name>``, all separated by tabs.

    Args:
        csvfile: Path of the input CSV. Column 0 holds the image file name
            and column 1 the disease names joined by ``|``.
        outputfile: Path of the ``.lst`` file to create (overwritten if it
            already exists).

    Raises:
        OSError: if *csvfile* cannot be read or *outputfile* cannot be written.
        IndexError: if a row has fewer than two columns.
    """
    disease_list = ['Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema',
                    'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening', 'Cardiomegaly', 'Nodule', 'Mass',
                    'Hernia']
    # The input is opened first so a missing CSV does not leave an empty
    # output file behind; newline='' is what the csv module expects for
    # correct handling of quoted fields.
    with open(csvfile, 'r', newline='') as cfile, open(outputfile, 'w') as fp:
        for index, row in enumerate(csv.reader(cfile, delimiter=',')):
            findings = set(row[1].split('|'))
            # Labels not present in disease_list (e.g. a "no finding" value)
            # simply yield an all-zero flag vector.
            flags = ['1' if disease in findings else '0' for disease in disease_list]
            fp.write('\t'.join([str(index), *flags, 'images/%s' % row[0]]) + '\n')

In [None]:
# Build one .lst file per split CSV.
for source_csv, target_lst in (
    ('traindata.csv', 'chestxraytrain.lst'),
    ('valdata.csv', 'chestxrayval.lst'),
    ('testdata.csv', 'chestxraytest.lst'),
):
    gen_set(source_csv, target_lst)

In [None]:
# Fetch the script used to convert the images to the RecordIO format.
im2rec_url = 'https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py'
urllib.request.urlretrieve(im2rec_url, 'im2rec.py')
print('Download concluído')

In [None]:
# Run the downloaded im2rec.py once per .lst file, with the current directory
# as the image root. NOTE(review): --pack-label presumably stores the whole
# multi-column label vector of each .lst row; confirm with `python im2rec.py --help`.
!python im2rec.py --pack-label chestxraytrain.lst .
!python im2rec.py --pack-label chestxrayval.lst .
!python im2rec.py --pack-label chestxraytest.lst . 

## Efetuando o upload dos arquivos para a etapa de treinamento