In [5]:
import import_ipynb
from AlexNetModel import AlexNet_v1,AlexNet_v2,AlexNet_v1_2gpu
import tensorflow as tf
import os
import time
import glob
import random

importing Jupyter notebook from AlexNetModel.ipynb


In [6]:
# Number the GPUs in PCI bus order (starting at 0) and expose devices 0 and 1
# to this process.
os.environ.update({
    'CUDA_DEVICE_ORDER': "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': '0,1',
})

In [7]:
# Ask TensorFlow which GPUs it can see.
# NOTE(review): despite its name, `logical_gpus` is filled by a second
# physical-device query, so it holds PhysicalDevice objects.
gpus = tf.config.experimental.list_physical_devices('GPU')
logical_gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Turn on memory growth for every GPU; an empty list just skips the loop.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the configuration failure and stop.
    print(err)
    exit(-1)
logical_gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [8]:
# The datasets folder sits two directory levels above the working directory.
data_root = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "datasets"))
# Flower dataset root (with trailing slash) and its train / val sub-folders.
image_path = "{}/flower_data/".format(data_root)
train_dir, validation_dir = image_path + 'train', image_path + 'val'

In [9]:
# Input image size (square) used when resizing, plus the training schedule.
im_height = im_width = 224
batch_size, epochs = 32, 10

In [10]:
# class dict
data_class = [cla for cla in os.listdir(train_dir) if '.txt' not in cla]
class_num = len(data_class)
class_dict = dict((value,index) for index,value in enumerate(data_class))
inverse_dict = dict((value,key) for key,value in class_dict.items())
inverse_dict

{0: 'dandelion', 1: 'daisy', 2: 'roses', 3: 'tulips', 4: 'sunflowers'}

In [11]:
# load train images list
train_image_list = glob.glob(train_dir+"/*/*.jpg")
random.shuffle(train_image_list)
train_num = len(train_image_list)
train_label_list = [class_dict[path.split(os.path.sep)[-2]] for path in train_image_list]

In [12]:
# load validation images list
val_image_list = glob.glob(validation_dir+"/*/*.jpg")
random.shuffle(val_image_list)
val_num = len(val_image_list)
val_label_list = [class_dict[path.split(os.path.sep)[-2]] for path in val_image_list]

In [13]:
def process_path(img_path,label):
    """Load one JPEG file and pair it with its one-hot encoded label.

    Args:
        img_path: Path of the JPEG file to read.
        label: Integer class index of the image.

    Returns:
        A tuple ``(image, label)``: ``image`` is a float32 tensor with 3
        channels resized to ``[im_height, im_width]``; ``label`` is a one-hot
        vector of length ``class_num``.
    """
    label = tf.one_hot(label, depth=class_num)
    raw = tf.io.read_file(img_path)
    # Always decode to 3 channels: without this a grayscale JPEG would give a
    # 1-channel tensor that cannot be batched together with RGB images.
    image = tf.image.decode_jpeg(raw, channels=3)
    # Converting the integer pixels to float32 also rescales them to [0, 1].
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize(image, [im_height, im_width])
    return image, label

In [14]:
# Sentinel passed to map()/prefetch() below so the tf.data runtime tunes the
# parallelism and buffer size dynamically instead of using a fixed number.
AUTOTUNE = tf.data.experimental.AUTOTUNE
AUTOTUNE

-1

In [None]:
# Distribution strategy; the model is later created and compiled inside
# strategy.scope().
strategy = tf.distribute.MirroredStrategy()  
# Disabled alternative kept for reference: derive a global batch size and a
# loader buffer size from the number of replicas in the strategy.
# batch_size_per_replica = 32
# # Global batch size
# GLOBAL_BATCH_SIZE = batch_size_per_replica * strategy.num_replicas_in_sync
# # Buffer size for data loader
# BUFFER_SIZE = batch_size_per_replica * strategy.num_replicas_in_sync * 16

In [15]:
# load train dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_list,train_label_list))
"""
通过‘’tf.data.Dataset.prefetch‘转换，tf.data’ API提供了一个软件流水线操作机制，可以用来解耦数据产生的时间和数据消耗的时间。
特别地，转换使用一个后台线程和一个内部缓冲区，以便在请求输入数据集的元素之前预取它们。
预取元素的数量应该等于(或者可能大于)单个训练步骤所消耗的批数。您可以手动调整这个值，
或者将其设置为tf.data.experimental.AUTOTUNE,它将提示tf.data runtime在运行时动态地调整值。
"""
train_dataset = train_dataset.shuffle(buffer_size=train_num).map(process_path,num_parallel_calls=AUTOTUNE)\
                                .repeat().batch(batch_size).prefetch(AUTOTUNE)

In [16]:
# Validation input pipeline: same decoding as for training, no shuffle step,
# repeated indefinitely and grouped into batches.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_image_list, val_label_list))
    .map(process_path, num_parallel_calls=AUTOTUNE)
    .repeat()
    .batch(batch_size)
)

In [17]:

# Create and compile the model inside the distribution strategy's scope.
with strategy.scope():
    # Pass the class count derived from the training folder (instead of a
    # hard-coded 5) so the model is built for the same number of classes as
    # the one-hot labels produced by process_path.
    model = AlexNet_v1_2gpu(im_height=im_height, im_width=im_width, class_num=class_num)
    model.summary()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        # from_logits=False because the model output is expected to be
        # softmax probabilities already. AlexNet_v1_2gpu is defined in another
        # notebook -- TODO confirm its last layer applies softmax.
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        metrics=["accuracy"],
    )

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
zero_padding2d (ZeroPadding2 (None, 227, 227, 3)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 55, 55, 96)        34944     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 27, 27, 96)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 27, 27, 256)       614656    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 256)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 13, 13, 384)       885120

In [19]:
# Train the model. Both datasets repeat indefinitely, so the number of batches
# per epoch is passed explicitly (whole batches only, via integer division).
history = model.fit(
    train_dataset,
    epochs=epochs,
    steps_per_epoch=train_num // batch_size,
    validation_data=val_dataset,
    validation_steps=val_num // batch_size,
)

Train for 103 steps, validate for 11 steps
Epoch 1/10
INFO:tensorflow:batch_all_reduce: 16 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
