## 获取数据
项目数据集来自Kaggle，[Dogs vs. Cats Redux](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data)

In [6]:
import os
import shutil

# Prepare the Kaggle data in the current working directory:
#   1. unpack train.zip / test.zip when the matching folder is missing,
#   2. report how many entries each folder holds,
#   3. rebuild slink/cat and slink/dog as folders of symlinks to the training
#      images, so that the folder name carries the class label.
for split in ('train', 'test'):
    archive = split + '.zip'
    if os.path.isdir(split):
        continue  # already extracted
    if os.path.isfile(archive):
        # Portable replacement for shelling out to `unzip`; raises on failure
        # instead of silently continuing.
        shutil.unpack_archive(archive)
    else:
        print('FILES ({}) NOT FOUND, DOWNLOAD FIRST'.format(archive))

print('train set:{}'.format(len(os.listdir('train'))))
print('test  set:{}'.format(len(os.listdir('test'))))

if not os.path.exists('slink'):
    os.mkdir('slink')

# Start from empty class folders so that re-running the cell does not fail on
# links left over from a previous run.
for label in ('cat', 'dog'):
    class_dir = os.path.join('slink', label)
    if os.path.exists(class_dir):
        shutil.rmtree(class_dir)
    os.mkdir(class_dir)

for file in os.listdir('train'):
    # File names look like 'cat.0.jpg' / 'dog.1.jpg'; anything that is not a
    # cat is filed under dog, as before.
    label = 'cat' if file.split(sep='.')[0] == 'cat' else 'dog'
    class_dir = os.path.join('slink', label)
    # A relative symlink target is resolved from the directory that holds the
    # link, so it must be expressed relative to slink/<label>, not to the cwd.
    target = os.path.relpath(os.path.join('train', file), start=class_dir)
    os.symlink(target, os.path.join(class_dir, file))

train set:25000
test  set:12500


## 探索数据
数据训练集中，通过文件名的方式标记图片中的动物类别，如'cat.0.jpg','dog.1.jpg'等，因此分类标签需要从文件名中获取。
训练标签只包含'cat','dog'两种，因此数据模型是一个二分类任务。

In [7]:
# Report the size of the training folder and of each per-class link folder.
for template, folder in (
    ('total train number:{}', 'train'),
    ('cats number:{}', 'slink/cat'),
    ('dogs number:{}', 'slink/dog'),
):
    entry_count = len(os.listdir(folder))
    print(template.format(entry_count))

total train number:25000
cats number:12500
dogs number:12500


## 预处理
观察训练集数据可以发现，训练图片的分辨率并不统一，因此在训练之前需要进行预处理，将所有图片缩放或填充为一致的输入分辨率。

### 随机化

## 构建模型

In [None]:

from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *

import h5py

def write_gap(MODEL, image_size, lambda_func=None,
              train_dir="slink", test_dir="test", batch_size=16):
    """Export globally-average-pooled features of a pre-trained network.

    Builds ``MODEL`` without its classification head, appends a
    ``GlobalAveragePooling2D`` layer, runs it over the training and test
    images and stores the results in ``gap_<MODEL name>.h5`` with three
    datasets: ``train``, ``test`` and ``lable`` (the training class indices
    reported by the directory iterator; the key spelling is kept because the
    loading cell reads it under that name).

    Args:
        MODEL: a Keras application constructor accepting ``input_tensor``,
            ``weights`` and ``include_top``.
        image_size: ``(width, height)`` of the network input.
        lambda_func: optional preprocessing function applied to the input
            tensor through a ``Lambda`` layer.
        train_dir: directory with one sub-folder per class.
        test_dir: directory that directly contains the test images.
        batch_size: number of images per prediction batch.
    """
    width, height = image_size
    # Keras expects target_size as (height, width), the reverse of image_size.
    target_size = (height, width)

    input_tensor = Input((height, width, 3))
    x = input_tensor
    if lambda_func:
        x = Lambda(lambda_func)(x)

    base_model = MODEL(input_tensor=x, weights='imagenet', include_top=False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))

    gen = ImageDataGenerator()
    # shuffle=False keeps feature rows aligned with generator.filenames/classes.
    train_generator = gen.flow_from_directory(
        train_dir, target_size, shuffle=False, batch_size=batch_size)

    # flow_from_directory only looks inside sub-folders, so the test folder is
    # scanned from its parent with itself as the single "class".
    test_parent, test_name = os.path.split(os.path.abspath(test_dir))
    test_generator = gen.flow_from_directory(
        test_parent, target_size, classes=[test_name], class_mode=None,
        shuffle=False, batch_size=batch_size)

    def batches(generator):
        # predict_generator takes a number of batches; round up so the final,
        # partial batch is included.
        return -(-generator.samples // batch_size)

    train = model.predict_generator(train_generator, batches(train_generator))
    test = model.predict_generator(test_generator, batches(test_generator))

    # "w" truncates any file left by a previous run, so re-running is safe.
    with h5py.File("gap_%s.h5" % MODEL.__name__, "w") as h:
        h.create_dataset("train", data=train)
        h.create_dataset("test", data=test)
        h.create_dataset("lable", data=train_generator.classes)

In [None]:
# Candidate ImageNet backbones for feature extraction. Only the constructors
# are collected; instantiating them here would build (and download weights
# for) every network just to throw the result away.
# The names come from `from keras.applications import *` in the cell above.
# NOTE(review): NASNetLarge is assumed for the NASNet entry; NASNetMobile is
# the other available variant - confirm which one was intended.
CANDIDATE_MODELS = [InceptionResNetV2, DenseNet201, Xception, NASNetLarge]
[candidate.__name__ for candidate in CANDIDATE_MODELS]

In [None]:
# Run each of the three backbones once and export its feature vectors; the
# resulting gap_ResNet50.h5, gap_InceptionV3.h5 and gap_Xception.h5 files are
# the ones read back by the feature-loading cell.
# NOTE(review): ResNet50 is run without a preprocessing function - confirm
# whether resnet50.preprocess_input should be passed as well.
write_gap(ResNet50, (224, 224))
write_gap(InceptionV3, (299, 299), inception_v3.preprocess_input)
write_gap(Xception, (299, 299), xception.preprocess_input)


In [None]:
import h5py
import numpy as np
from sklearn.utils import shuffle

# Seed NumPy's global RNG before the shuffle at the end of this cell.
np.random.seed(2017)

feature_files = ["gap_ResNet50.h5", "gap_Xception.h5", "gap_InceptionV3.h5"]
train_parts, test_parts = [], []

for path in feature_files:
    with h5py.File(path, 'r') as store:
        train_parts.append(np.array(store['train']))
        test_parts.append(np.array(store['test']))
        # Re-read from every file; the value from the last file is kept.
        y_train = np.array(store['lable'])

# Put the per-model feature vectors side by side (one row per image).
X_train = np.concatenate(train_parts, axis=1)
X_test = np.concatenate(test_parts, axis=1)

# Shuffle features and labels together so the rows stay paired.
X_train, y_train = shuffle(X_train, y_train)


In [None]:
from keras.models import *
from keras.layers import *

# Train a small classifier head on the concatenated feature vectors:
# dropout followed by a single sigmoid unit.
np.random.seed(2017)

input_tensor = Input(X_train.shape[1:])
x = Dropout(0.5)(input_tensor)
x = Dense(1, activation='sigmoid')(x)

model = Model(input_tensor, x)

model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])

# The last 20% of the (already shuffled) training rows are held out for validation.
model.fit(X_train, y_train, batch_size=128, epochs=8, validation_split=0.2)

y_pred = model.predict(X_test, verbose=1)
# Keep predictions away from exactly 0 and 1; under a log-loss metric this
# bounds the penalty for a confidently wrong answer.
y_pred = y_pred.clip(min=0.005, max=0.995)

In [None]:
import pandas as pd
from keras.preprocessing.image import *

# Fill the sample submission with the predictions and save it as pred.csv.
df = pd.read_csv("sample_submission.csv")

gen = ImageDataGenerator()

# The generator is only used for its file ordering, which must match the order
# in which the test features were extracted. flow_from_directory only looks
# inside sub-folders, so "test" is scanned from its parent as a single class.
test_parent, test_name = os.path.split(os.path.abspath("test"))
test_generator = gen.flow_from_directory(
    test_parent, (224, 224), classes=[test_name], class_mode=None,
    shuffle=False, batch_size=16)

# Image ids are the numeric file stems ('test/12.jpg' -> 12); os.path handles
# both '/' and '\\' separators.
image_ids = [int(os.path.splitext(os.path.basename(fname))[0])
             for fname in test_generator.filenames]

# Row k of the submission holds image id k + 1. Assigning everything in one
# .loc call raises if the number of predictions and files disagree.
rows = [image_id - 1 for image_id in image_ids]
df.loc[rows, 'label'] = y_pred.ravel()

df.to_csv('pred.csv', index=False)

df.head(10)

https://blog.csdn.net/SusanZhang1231/article/details/73249978

![image.png](attachment:image.png)