## 获取数据

下载数据集，项目数据集来自Kaggle，[Dogs vs. Cats Redux](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data)

使用 kaggle-api 下载：`kaggle competitions download -c dogs-vs-cats-redux-kernels-edition`

In [None]:
import os
import shutil

import zipfile

# Extract each Kaggle archive into the working directory, but only when its
# target folder does not exist yet. The standard-library zipfile module is
# used instead of shelling out to `unzip`, which is not available on every
# platform.
for archive_stem in ('train', 'test'):
    if os.path.isdir(archive_stem):
        continue
    archive_path = archive_stem + '.zip'
    if os.path.isfile(archive_path):
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall()
    else:
        print('FILES ({}) NOT FOUND, DOWNLOAD FIRST'.format(archive_path))

In [None]:
import numpy as np

# Sorted so that the file order (and therefore the seeded train/validation
# split computed from these lists later on) does not depend on the
# OS-specific order returned by os.listdir.
train_files = sorted(os.listdir('train'))

# The class is the part of the file name before the first dot
# (e.g. 'cat.10029.jpg', 'dog.2422.jpg'). Files with any other prefix are
# ignored instead of being silently counted as dogs.
# Plain lists are kept (rather than arrays grown with np.append) so that the
# collections stay cheap to build and support list operations such as remove().
cats = [name for name in train_files if name.split(sep='.')[0] == 'cat']
dogs = [name for name in train_files if name.split(sep='.')[0] == 'dog']

# Label convention: 0 = cat, 1 = dog.
cat_lable = np.zeros(len(cats))
dog_lable = np.ones(len(dogs))

### 异常值处理

根据观察训练数据集图片，发现存在以下两种情况：
1. 有一些图片的内容并非猫狗，这类图片属于错误图片，直接作为异常值处理。
2. 此外，本项目的目标是识别图片是猫还是狗，属于二分类问题；在这个前提下不应该存在猫狗共存的图片，否则会与二分类的目标相违背，也会对模型训练的准确度产生较大影响。因此个人认为，同时出现猫和狗的图片也需要作为异常值处理。

此处异常值模型使用imagenet多个模型top-50来做批量识别，当图片是第一种情况时，模型对于图片识别结果top-50中没有识别到给定标签的品种；当图片是第二种情况时，模型对于图片识别结果top-50中同时存在猫狗品种。使用多个模型去识别图片，对多个模型预测中存在以上两种异常值，分别求两种情况下的交集。考虑到模型识别的准确率问题，最后需要人为去识别图片是否需要做为异常处理。

In [None]:
import numpy as np
import os

import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
from tqdm import tqdm

##  Cat and dog labels among the ImageNet classes  ##
# Class-ID strings that the outlier check compares against the IDs returned
# by decode_predictions (presumably WordNet synset IDs of the ImageNet dog
# breeds -- TODO confirm the list is complete).
dogs_class = [ 'n02085620','n02085782','n02085936','n02086079','n02086240','n02086646','n02086910','n02087046','n02087394','n02088094',
              'n02088238','n02088364','n02088466','n02088632','n02089078','n02089867','n02089973','n02090379','n02090622','n02090721',
              'n02091032','n02091134','n02091244','n02091467','n02091635','n02091831','n02092002','n02092339','n02093256','n02093428',
              'n02093647','n02093754','n02093859','n02093991','n02094114','n02094258','n02094433','n02095314','n02095570','n02095889',
              'n02096051','n02096177','n02096294','n02096437','n02096585','n02097047','n02097130','n02097209','n02097298','n02097474',
              'n02097658','n02098105','n02098286','n02098413','n02099267','n02099429','n02099601','n02099712','n02099849','n02100236',
              'n02100583','n02100735','n02100877','n02101006','n02101388','n02101556','n02102040','n02102177','n02102318','n02102480',
              'n02102973','n02104029','n02104365','n02105056','n02105162','n02105251','n02105412','n02105505','n02105641','n02105855',
              'n02106030','n02106166','n02106382','n02106550','n02106662','n02107142','n02107312','n02107574','n02107683','n02107908',
              'n02108000','n02108089','n02108422','n02108551','n02108915','n02109047','n02109525','n02109961','n02110063','n02110185',
              'n02110341','n02110627','n02110806','n02110958','n02111129','n02111277','n02111500','n02111889','n02112018','n02112137',
              'n02112350','n02112706','n02113023','n02113186','n02113624','n02113712','n02113799','n02113978']

# Class-ID strings treated as "cat" by the outlier check (presumably the
# ImageNet cat classes -- TODO confirm).
cats_class =['n02123045','n02123159','n02123394','n02123597','n02124075','n02125311','n02127052']

# Per-architecture settings consumed by check_error_pict:
#   'name'       -> constructor of the keras.applications model
#   'shape'      -> (height, width) the images are resized to
#   'preprocess' -> function applied to the image batch before predict()
# ResNet50 / VGG16 / VGG19 need their own preprocess_input as well: feeding
# raw 0-255 RGB pixels to the ImageNet weights degrades the predictions that
# the outlier detection relies on.
model_ResNet50={'name':ResNet50,'shape':(224,224),'preprocess':resnet50.preprocess_input}
model_VGG16={'name':VGG16,'shape':(224,224),'preprocess':vgg16.preprocess_input}
model_VGG19={'name':VGG19,'shape':(224,224),'preprocess':vgg19.preprocess_input}
model_InceptionV3={'name':InceptionV3,'shape':(299,299),'preprocess':inception_v3.preprocess_input}
model_Xception={'name':Xception,'shape':(299,299),'preprocess':xception.preprocess_input}
model_InceptionResNetV2={'name':InceptionResNetV2,'shape':(299,299),'preprocess':inception_resnet_v2.preprocess_input}

def check_err_picture(model,model_shape,preprocess_input,picture_names,correct_class,image_dir='train'):
    """Flag pictures whose ImageNet top-50 predictions contradict their label.

    Args:
        model: model exposing ``predict`` and producing ImageNet class scores.
        model_shape: (height, width) every picture is resized to.
        preprocess_input: callable applied to the image batch before
            prediction, or a falsy value to skip preprocessing.
        picture_names: iterable of file names located in ``image_dir``.
        correct_class: ``'DOG'`` or ``'CAT'`` -- the label all pictures in
            ``picture_names`` are supposed to have.
        image_dir: directory that contains the pictures.

    Returns:
        ``(bad_pict, both_pict)``: file names whose top-50 contains no class
        of the expected animal, and file names whose top-50 contains both
        dog and cat classes.

    Raises:
        ValueError: if ``correct_class`` is neither ``'DOG'`` nor ``'CAT'``.
    """
    if correct_class not in ('DOG', 'CAT'):
        raise ValueError("correct_class must be 'DOG' or 'CAT', got {!r}".format(correct_class))

    # Built once: the membership tests below run for every picture.
    dog_ids = set(dogs_class)
    cat_ids = set(cats_class)

    bad_pict = []
    both_pict = []

    for picture_name in tqdm(picture_names):
        img = load_img(os.path.join(image_dir, picture_name), target_size=model_shape)
        x = np.expand_dims(img_to_array(img), axis=0)
        if preprocess_input:
            x = preprocess_input(x)

        preds = model.predict(x)
        # decode_predictions yields (class_id, class_name, score) tuples; only
        # the class IDs can be compared with dogs_class / cats_class.
        top_ids = {entry[0] for entry in densenet.decode_predictions(preds, top=50)[0]}

        has_dog = bool(top_ids & dog_ids)
        has_cat = bool(top_ids & cat_ids)

        # No class of the expected animal in the top-50: probably not that animal.
        expected_found = has_dog if correct_class == 'DOG' else has_cat
        if not expected_found:
            bad_pict.append(picture_name)
        # Dog and cat classes together: the picture may contain both animals.
        if has_dog and has_cat:
            both_pict.append(picture_name)

    return bad_pict, both_pict
        

def check_error_pict(model_name):
    """Run the outlier check with one pretrained network over all pictures.

    Args:
        model_name: settings dict with the keys ``'name'`` (model
            constructor), ``'shape'`` and ``'preprocess'``.

    Returns:
        ``(err_list, both_list)``: the label-mismatch file names and the
        cat-and-dog file names, dogs first, then cats.
    """
    network = model_name['name'](weights='imagenet')
    input_shape = model_name['shape']
    preprocess = model_name['preprocess']

    # Same network for both classes; the module-level dogs / cats lists
    # supply the file names.
    results = [
        check_err_picture(network, input_shape, preprocess, names, tag)
        for names, tag in ((dogs, 'DOG'), (cats, 'CAT'))
    ]
    (dog_err, dog_both), (cat_err, cat_both) = results
    return dog_err + cat_err, dog_both + cat_both

In [None]:
# Outlier candidates according to ResNet50: label mismatches and cat+dog pictures.
err_ResNet50,both_ResNet50=check_error_pict(model_ResNet50)

In [None]:
# Outlier candidates according to VGG16: label mismatches and cat+dog pictures.
err_VGG16,both_VGG16=check_error_pict(model_VGG16)

In [None]:
# Outlier candidates according to VGG19: label mismatches and cat+dog pictures.
err_VGG19,both_VGG19=check_error_pict(model_VGG19)

In [None]:
# Keep only the files that all three networks agree on, in sorted order.
all_err = sorted(set(err_ResNet50) & set(err_VGG16) & set(err_VGG19))
all_both = sorted(set(both_ResNet50) & set(both_VGG16) & set(both_VGG19))

# One count per line: label mismatches first, cat+dog pictures second.
print(len(all_err), len(all_both), sep='\n')

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

# Hard-coded list of flagged files; it replaces any all_err computed earlier
# (presumably a snapshot of a previous detection run so this cell can be
# executed on its own -- TODO confirm).
all_err=['cat.10029.jpg', 'cat.10365.jpg', 'cat.10536.jpg', 'cat.10636.jpg', 'cat.10700.jpg', 'cat.10712.jpg', 'cat.11194.jpg', 'cat.11297.jpg', 'cat.11701.jpg', 'cat.12227.jpg', 'cat.12272.jpg', 'cat.12424.jpg', 'cat.12476.jpg', 'cat.166.jpg', 'cat.2337.jpg', 'cat.241.jpg', 'cat.2457.jpg', 'cat.252.jpg', 'cat.2520.jpg', 'cat.2663.jpg', 'cat.2758.jpg', 'cat.2817.jpg', 'cat.3410.jpg', 'cat.3738.jpg', 'cat.4308.jpg', 'cat.4338.jpg', 'cat.4688.jpg', 'cat.4842.jpg', 'cat.5351.jpg', 'cat.5418.jpg', 'cat.5780.jpg', 'cat.5880.jpg', 'cat.6345.jpg', 'cat.6442.jpg', 'cat.7377.jpg', 'cat.7564.jpg', 'cat.7661.jpg', 'cat.7671.jpg', 'cat.7968.jpg', 'cat.8456.jpg', 'cat.8470.jpg', 'cat.9090.jpg', 'cat.9110.jpg', 'cat.9171.jpg', 'cat.9983.jpg', 'dog.2422.jpg', 'dog.2614.jpg', 'dog.5604.jpg', 'dog.6733.jpg']

# Same relative folder the file names were listed from, instead of a
# machine-specific absolute path.
train_dir = 'train'
grid_rows, grid_cols = 10, 5

plt.figure(figsize=(50,100),dpi=90)
sorted_pict=sorted(all_err)

# The grid holds at most grid_rows * grid_cols thumbnails.
for position, err_file in enumerate(sorted_pict[:grid_rows * grid_cols], start=1):
    pict = cv2.imread(os.path.join(train_dir, err_file))
    if pict is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        print('cannot read {}'.format(err_file))
        continue
    # OpenCV loads images as BGR while matplotlib expects RGB.
    pict = cv2.cvtColor(cv2.resize(pict, (300, 300)), cv2.COLOR_BGR2RGB)
    plt.subplot(grid_rows, grid_cols, position)
    plt.title(err_file)
    plt.imshow(pict)
    plt.axis('off')

plt.show()

### 移除异常值
通过仔细识别图片，确定哪些图片属于异常值，然后从预训练数据集中移除异常值

In [None]:
# File names to drop from the training data; both lists start empty
# (presumably to be filled in by hand after reviewing the flagged pictures).
outline_err=[]
outline_both=[]

In [None]:
def remove_outline(outline_set):
    """Drop the given file names from the module-level ``dogs`` and ``cats``.

    The collections are rebuilt as plain lists, so this works whether they
    are currently lists or numpy arrays. Names that are not present are
    ignored. The label arrays ``dog_lable`` / ``cat_lable`` are regenerated
    so that they keep the same length as the file lists.

    Args:
        outline_set: iterable of file names to remove.
    """
    global dogs, cats, dog_lable, cat_lable

    unwanted = set(outline_set)
    dogs = [name for name in dogs if name not in unwanted]
    cats = [name for name in cats if name not in unwanted]

    # Label convention used when the lists are built: 1 = dog, 0 = cat.
    dog_lable = np.ones(len(dogs))
    cat_lable = np.zeros(len(cats))

# Apply both outlier lists to the training file lists.
remove_outline(outline_err)
remove_outline(outline_both)

### 划分验证集

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# 80/20 train/validation split per class, seeded for reproducibility.
# The cat split uses the cat labels and gets its own result names, so it no
# longer overwrites dog_lable_train / dog_lable_valid.
dogs_train,dogs_valid,dog_lable_train,dog_lable_valid=train_test_split(dogs,dog_lable,test_size=0.2,random_state=10,shuffle=True)
cats_train,cats_valid,cat_lable_train,cat_lable_valid=train_test_split(cats,cat_lable,test_size=0.2,random_state=10,shuffle=True)

In [None]:
# Start from empty working directories on every run: remove any previous
# copy, then recreate the folder together with its per-class sub-folders.
# 'test2' has no class sub-folders, but it is recreated as well so that it
# always exists after this cell (previously it was only deleted when it was
# already present).
for root_dir, class_dirs in (
    ('train2', ('cat', 'dog')),
    ('valid', ('cat', 'dog')),
    ('test2', ()),
):
    if os.path.exists(root_dir):
        shutil.rmtree(root_dir)
    os.mkdir(root_dir)
    for class_dir in class_dirs:
        os.mkdir(os.path.join(root_dir, class_dir))

def link_image(image_name,train_valid,dog_or_cats):
    """Symlink pictures from ``train`` into the per-class split folders.

    Args:
        image_name: iterable of file names inside ``train``.
        train_valid: ``'T'`` links into ``train2``; any other value links
            into ``valid``.
        dog_or_cats: ``'CAT'`` links into the ``cat`` sub-folder; any other
            value links into the ``dog`` sub-folder.

    Raises:
        OSError: propagated from ``os.symlink`` (e.g. the link already
            exists or the target folder is missing).
    """
    split_dir = 'train2' if train_valid == 'T' else 'valid'
    class_dir = 'cat' if dog_or_cats == 'CAT' else 'dog'
    target_dir = os.path.join(split_dir, class_dir)

    for file in image_name:
        # A relative link target is resolved against the directory that
        # contains the link, so 'train/<file>' would point inside target_dir
        # and dangle. Linking to the absolute path avoids that.
        source = os.path.abspath(os.path.join('train', file))
        os.symlink(source, os.path.join(target_dir, file))

# Populate train2/ and valid/ with links for each class split.
link_image(dogs_train,'T','DOG')   
link_image(dogs_valid,'V','DOG')    
link_image(cats_train,'T','CAT')    
link_image(cats_valid,'V','CAT')    

In [None]:
# for windows
!tree
#for linux
#!tree -d

# Report how many entries each data folder contains.
print('\n')
print('statistics:')
for template, folder in (
    ('totol train pictures  :{}', 'train'),
    ('totol test  pictures  :{}', 'test'),
    ('train      set:  cats :{}', 'train2/cat'),
    ('train      set:  dogs :{}', 'train2/dog'),
    ('validation set:  cats :{}', 'valid/cat'),
    ('validation set:  dogs :{}', 'valid/dog'),
):
    print(template.format(len(os.listdir(folder))))

## 探索数据

In [None]:
import os
import cv2
import random
import matplotlib.pyplot as plt

%matplotlib inline

# Pick one random picture per class (dog first, then cat).
dog_path=r'train/'+random.choice(dogs)
cat_path=r'train/'+random.choice(cats)

# OpenCV loads images as BGR; convert to RGB so matplotlib shows true colours.
dog_pict=cv2.cvtColor(cv2.resize(cv2.imread(dog_path),(200,200)),cv2.COLOR_BGR2RGB)
cat_pict=cv2.cvtColor(cv2.resize(cv2.imread(cat_path),(200,200)),cv2.COLOR_BGR2RGB)

# Show the two thumbnails side by side, without axes.
plt.figure(figsize=(10,5),dpi=90)
p1=plt.subplot(1,2,1)
p2=plt.subplot(1,2,2)
for axis, title, picture in ((p1, "random dog", dog_pict), (p2, "random cat", cat_pict)):
    axis.set_title(title)
    axis.imshow(picture)
    axis.axis('off')
plt.show()

## 模型

In [None]:
import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *

# Inputs are created later by input_define(); the two sizes in use are
# (299, 299, 3) and (224, 224, 3).

# Per-architecture settings consumed by import_model:
#   'name'       -> constructor of the keras.applications model
#   'shape'      -> (height, width) of the expected input
#   'preprocess' -> preprocessing function wrapped in a Lambda layer
# ResNet50 / VGG16 / VGG19 get their own preprocess_input too, so the frozen
# ImageNet weights see inputs in the format they were trained on.
model_ResNet50={'name':ResNet50,'shape':(224,224),'preprocess':resnet50.preprocess_input}
model_VGG16={'name':VGG16,'shape':(224,224),'preprocess':vgg16.preprocess_input}
model_VGG19={'name':VGG19,'shape':(224,224),'preprocess':vgg19.preprocess_input}
model_InceptionV3={'name':InceptionV3,'shape':(299,299),'preprocess':inception_v3.preprocess_input}
model_Xception={'name':Xception,'shape':(299,299),'preprocess':xception.preprocess_input}

In [None]:
def import_model(model_input,model_dict):
    """Build a frozen, headless ImageNet backbone on top of ``model_input``.

    Args:
        model_input: input tensor the backbone is attached to.
        model_dict: settings dict; ``'preprocess'`` (optional callable wrapped
            in a Lambda layer) and ``'name'`` (model constructor) are used.

    Returns:
        The constructed model with every layer marked non-trainable.
    """
    preprocess = model_dict['preprocess']
    network_input = Lambda(preprocess)(model_input) if preprocess else model_input

    backbone = model_dict['name'](input_tensor=network_input, weights='imagenet', include_top=False)

    # Freeze the pretrained weights.
    for layer in backbone.layers:
        layer.trainable = False
    return backbone

def model_concatenate(input_set,model_set):
    """Build the classification head on top of one or more frozen backbones.

    Each backbone output is global-average-pooled; the pooled features are
    concatenated (when there is more than one), passed through dropout and a
    single sigmoid unit.

    Args:
        input_set: list of input tensors. With two or more entries the first
            feeds the Xception / InceptionV3 backbones and the second feeds
            all other backbones. With a single entry that tensor feeds every
            backbone (this is what input_define returns when only one input
            size is needed).
        model_set: list of settings dicts accepted by import_model.

    Returns:
        Output tensor of the sigmoid layer.

    Raises:
        IndexError: if ``input_set`` or ``model_set`` is empty.
    """
    if len(input_set) > 1:
        input_299, input_224 = input_set[0], input_set[1]
    else:
        # Only one input was defined: indexing input_set[1] would fail here.
        input_299 = input_224 = input_set[0]

    large_input_models = [model_Xception, model_InceptionV3]

    mid_out = []
    for model_dict in model_set:
        branch_input = input_299 if model_dict in large_input_models else input_224
        base_model = import_model(branch_input, model_dict)
        mid_out.append(GlobalAveragePooling2D()(base_model.output))

    # Concatenate needs at least two tensors.
    x = Concatenate(axis=-1)(mid_out) if len(mid_out) > 1 else mid_out[0]

    x = Dropout(0.5)(x)
    out = Dense(1, activation='sigmoid')(x)
    return out

def input_define(mask):
    """Create the input tensors selected by ``mask``.

    Args:
        mask: sequence of two flags; ``mask[0] == 1`` requests the
            (299, 299, 3) input and ``mask[1] == 1`` the (224, 224, 3) input.

    Returns:
        List with the requested inputs, the 299 one first.
    """
    flags = (mask[0], mask[1])
    shapes = ((299, 299, 3), (224, 224, 3))
    return [Input(shape=shape) for flag, shape in zip(flags, shapes) if flag == 1]


def img_load_mask_transfer(input_type,mask,enhance=False,batch_size=20):
    """Create the directory iterators that feed the model.

    Args:
        input_type: ``'train'`` (reads ``train2``) or ``'valid'`` (reads
            ``valid``).
        mask: sequence of two flags; ``mask[0] == 1`` requests 299x299
            batches and ``mask[1] == 1`` requests 224x224 batches.
        enhance: when true, augmentation is applied -- to the training set
            only; any other input type is loaded without augmentation.
        batch_size: number of images per batch.

    Returns:
        With both flags set: a generator yielding ``([x_299, x_224], y)``,
        the layout a two-input model expects. Otherwise: a list with the
        single requested iterator (or an empty list).
        NOTE(review): the single-iterator case still returns a list, as
        before; callers have to pass its first element to Keras.

    Raises:
        ValueError: if ``input_type`` is neither ``'train'`` nor ``'valid'``.
    """
    directories = {'train': r'train2', 'valid': r'valid'}
    if input_type not in directories:
        raise ValueError("input_type must be 'train' or 'valid', got {!r}".format(input_type))
    load_direct = directories[input_type]

    if enhance and input_type == 'train':
        # NOTE(review): featurewise_center / featurewise_std_normalization
        # normally need the generator to be fit() on sample data first --
        # confirm whether these two options are intended.
        data_gen_args = dict(featurewise_center=True,
                             featurewise_std_normalization=True,
                             rotation_range=90.,
                             width_shift_range=0.1,
                             height_shift_range=0.1,
                             horizontal_flip=True,
                             vertical_flip=True,
                             zoom_range=0.2)
    else:
        data_gen_args = dict()

    masked_load = []
    for flag, target_size in zip((mask[0], mask[1]), ((299, 299), (224, 224))):
        if flag == 1:
            generator = ImageDataGenerator(**data_gen_args)
            masked_load.append(generator.flow_from_directory(directory=load_direct,
                                                             class_mode='binary',
                                                             target_size=target_size,
                                                             batch_size=batch_size,
                                                             seed=1))

    if len(masked_load) > 1:
        def paired_batches():
            # Both iterators read the same directory with the same seed and
            # batch size, so their batches are expected to line up and one
            # set of labels is enough.
            for (images_299, labels), (images_224, _) in zip(*masked_load):
                yield [images_299, images_224], labels

        return paired_batches()
    return masked_load

## 构建基础模型

In [None]:
model_set=[model_ResNet50,model_Xception]

##   model_mask   used for mask input(train,valid,test)
# model_mask[0] is set when a 299x299 model (Xception / InceptionV3) is
# selected, model_mask[1] when any other model is selected.
model_mask=[
    int(any(entry in [model_Xception,model_InceptionV3] for entry in model_set)),
    int(any(entry not in [model_Xception,model_InceptionV3] for entry in model_set)),
]

print(model_mask)

In [None]:
# Create the input tensors requested by the mask.
input_set=input_define(model_mask)
print(input_set)

# Backbones + pooling + dropout + sigmoid output on top of those inputs.
out=model_concatenate(input_set,model_set)

model = Model(input_set, out)

# Binary classification: a single sigmoid output with cross-entropy loss.
model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Training batches, with enhance=True (third argument).
train_load=img_load_mask_transfer('train',model_mask,True)
print(train_load)
#print(train_load[0].__dict__)

# Validation batches, loaded without the enhance option.
valid_load=img_load_mask_transfer('valid',model_mask)
#print(valid_load)

In [None]:
import math

# The loaders are plain iterators rather than keras Sequence objects, so
# Keras cannot infer how many batches make up one epoch; the step counts are
# derived from the number of files on disk.
fit_batch_size = 20  # must match the batch size used by img_load_mask_transfer
train_count = sum(len(os.listdir(os.path.join('train2', name))) for name in ('cat', 'dog'))
valid_count = sum(len(os.listdir(os.path.join('valid', name))) for name in ('cat', 'dog'))

hist = model.fit_generator(train_load,
                           steps_per_epoch=math.ceil(train_count / fit_batch_size),
                           epochs=10,
                           validation_data=valid_load,
                           validation_steps=math.ceil(valid_count / fit_batch_size),
                           shuffle=True)

In [None]:
# Per-epoch metrics recorded during training.
print(hist.history)

## 模型结果可视化

In [None]:
import numpy as np  
import matplotlib.pyplot as plt  
# Use the metrics recorded by the training cell; the previous code loaded
# them from `npy_dir_path`, a name that is never defined.
history=hist.history
acc=history['acc']
loss=history['loss']
val_acc=history['val_acc']
val_loss=history['val_loss']
nb_epoach=np.size(acc)

# Loss curves.
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Model Loss Trend')
plt.plot(loss,'blue',label='Training Loss')
plt.plot(val_loss,'green',label='Validation Loss')
plt.xticks(range(0,nb_epoach))
plt.legend()
plt.show()

# Accuracy curves (axis and legend labels now say "Accuracy" instead of "Loss").
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Trend')
plt.plot(acc,'blue',label='Training Accuracy')
plt.plot(val_acc,'green',label='Validation Accuracy')
plt.xticks(range(0,nb_epoach))
plt.legend()
plt.show()

In [None]:
# Print the header once, then one "index name" line per layer.
print('Model name: {}'.format(model_set))
print('  seq        layer_name')
for i, layer in enumerate(model.layers):
    print(i, layer.name)

In [None]:
from IPython.display import SVG
# Keras 2 moved the plotting helpers to keras.utils.vis_utils and renamed
# plot() to plot_model(); the old name is kept as an alias.
from keras.utils.vis_utils import model_to_dot, plot_model as plot

# Render the model graph (with tensor shapes) as an inline SVG.
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

## 预测

In [None]:
import math

test_batch_size = 20

# flow_from_directory only discovers images inside sub-directories, while
# 'test' holds the pictures directly; expose it as 'test2/test'.
# NOTE(review): assumes 'test2' is meant to be the wrapper folder for the
# test pictures -- confirm.
os.makedirs('test2', exist_ok=True)
test_link = os.path.join('test2', 'test')
if not os.path.lexists(test_link):
    os.symlink(os.path.abspath('test'), test_link)

# One unshuffled iterator per input size requested by model_mask, in the
# same order as the model inputs (299 first, then 224).
test_iterators = [
    ImageDataGenerator().flow_from_directory(directory='test2', class_mode=None, shuffle=False,
                                             target_size=size, batch_size=test_batch_size)
    for flag, size in zip(model_mask, ((299, 299), (224, 224)))
    if flag == 1
]

filenames = test_iterators[0].filenames
test_steps = math.ceil(len(filenames) / test_batch_size)


def _test_batches():
    # Yield one list of image batches per step, one entry per model input.
    for batches in zip(*test_iterators):
        yield list(batches)


gen_test = _test_batches()

# Single prediction pass; the step count stops the endless iterators.
predict = model.predict_generator(gen_test, steps=test_steps, verbose=1)

# Clip the probabilities away from 0 and 1.
y_pred = predict.clip(min=0.005, max=0.995)