## ライブラリインストール

In [1]:
import os
import random

import numpy as np
import pandas as pd
# import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.layers import Input, Dense, Conv2D, Activation
from tensorflow.keras.layers import MaxPooling2D, UpSampling2D, BatchNormalization, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0, ResNet50

In [2]:
def set_randvalue(value):
    # Set a seed value
    seed_value= value 
    # 1. Set `PYTHONHASHSEED` environment variable at a fixed value
    os.environ['PYTHONHASHSEED']=str(seed_value)
    # 2. Set `python` built-in pseudo-random generator at a fixed value
    random.seed(seed_value)
    # 3. Set `numpy` pseudo-random generator at a fixed value
    np.random.seed(seed_value)
    # 4. Set `tensorflow` pseudo-random generator at a fixed value
    tf.random.set_seed(seed_value)

seed_value = 42
set_randvalue(seed_value)

## CSVロード & 半教師データ作成

In [3]:
# Train001でアンサンブルし作成したcsv
ensemble_df = pd.read_csv("../data/output/Train005_ensemble_sub.csv")
ensemble_df.head(3)

Unnamed: 0,img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,img_1.jpg,0.020099,0.007584666,0.013422,0.005908,0.002738102,0.869602,0.002999901,0.008679,0.020682,0.048285
1,img_10.jpg,0.00011,3.774269e-07,1e-06,3e-06,7.94022e-07,0.999396,8.523394e-08,0.000191,0.000217,8e-05
2,img_100.jpg,0.482515,0.01307274,0.00137,0.016425,0.004297219,0.028058,0.001605076,0.015099,0.097214,0.340344


In [4]:
ensemble_values = ensemble_df.values[:,1:]

In [5]:
ensemble_max_values = np.max(ensemble_values, axis=1)
ensemble_max_indexes = np.argmax(ensemble_values, axis=1)
ensemble_df["max_pred"] = ensemble_max_values
ensemble_df["label"] = ensemble_max_indexes

In [6]:
ensemble_df.head(3)

Unnamed: 0,img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,max_pred,label
0,img_1.jpg,0.020099,0.007584666,0.013422,0.005908,0.002738102,0.869602,0.002999901,0.008679,0.020682,0.048285,0.869602,5
1,img_10.jpg,0.00011,3.774269e-07,1e-06,3e-06,7.94022e-07,0.999396,8.523394e-08,0.000191,0.000217,8e-05,0.999396,5
2,img_100.jpg,0.482515,0.01307274,0.00137,0.016425,0.004297219,0.028058,0.001605076,0.015099,0.097214,0.340344,0.482515,0


In [7]:
# 信頼値90%より値が大きい行を取得
ensemble_df_over_thresh = ensemble_df[ensemble_df.max_pred > 0.95]

In [8]:
print("確率が95％より大きな行数",len(ensemble_df_over_thresh))

確率が95％より大きな行数 41870


In [9]:
ensemble_train_df = ensemble_df_over_thresh[["img","label"]]

In [10]:
ensemble_train_df["image"] = ensemble_train_df["img"]
ensemble_train_df = ensemble_train_df.drop(["img"],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensemble_train_df["image"] = ensemble_train_df["img"]


In [11]:
# 半教師あり学習用にDataFrame作成
train_df = pd.read_csv("../data/input/csvs/train.csv")
# train_df["img"] = train_df["image"]
merge_df = pd.concat([train_df, ensemble_train_df])
merge_df = merge_df.reset_index()
merge_df["label"] = merge_df["label"].astype(str)

In [12]:
# 学習データ数
len(merge_df)

64294

In [13]:
merge_df

Unnamed: 0,index,image,label
0,0,img_100026.jpg,0
1,1,img_10003.jpg,0
2,2,img_100050.jpg,0
3,3,img_100074.jpg,0
4,4,img_10012.jpg,0
...,...,...,...
64289,79717,img_9999.jpg,3
64290,79719,img_99991.jpg,2
64291,79720,img_99993.jpg,7
64292,79722,img_99995.jpg,3


In [14]:
# ラベルごとの数
merge_df.label.value_counts()

1    9162
3    9073
4    8339
5    8290
2    7811
6    6074
7    5379
0    3995
8    3759
9    2412
Name: label, dtype: int64

## モデル作成

In [15]:
# EfficientNetB0のモデルを作成
def create_model(weight_flg=False):
    weight = None
    if weight_flg:
        weight = "../model/efficientnetb0_notop.h5" # ImageNetで学習されたモデルをロード
    # include_top=False; 全結合層なし
#     base_model = EfficientNetB0(weights=weight, include_top=False, pooling='avg', input_shape=(224,224,3))
    base_model = ResNet50(weights="imagenet", include_top=False, pooling='avg', input_shape=(224,224,3))
    x = Dense(512, activation='relu')(base_model.output)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(10, activation='softmax')(x)
    model = Model(base_model.input, output)
    
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

## 学習

In [16]:
# Hyper parameters
img_size = 224
batch_size = 32
epochs = 250

In [17]:
# Generator作成
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20, # rotation range
    width_shift_range=30/img_size, # 30 pixel
    height_shift_range=30/img_size, # 30 pixel
    zoom_range=0.2
)

valid_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [18]:
%%time
# 学習と評価データでのラベルの分布数を一定に保ち学習
skf = StratifiedKFold(n_splits = 5)
splitter = skf.split(merge_df["image"],merge_df["label"])
for i, (train_ids, valid_ids) in enumerate(splitter, 1):
    # データ生成
    train, valid = merge_df.iloc[train_ids], merge_df.iloc[valid_ids]
    train_datagenerator = train_datagen.flow_from_dataframe(
        train,
        directory='../data/input/imgs/train/semi-supervised_imgs/',
        x_col='image',
        y_col='label',
        target_size=(img_size, img_size),
        class_mode='categorical',
        batch_size=batch_size,
        seed=seed_value
    )

    valid_datagenerator = valid_datagen.flow_from_dataframe(
        valid,
        directory='../data/input/imgs/train/semi-supervised_imgs/',
        x_col='image',
        y_col='label',
        target_size=(img_size, img_size),
        class_mode='categorical',
        batch_size=batch_size,
        seed=seed_value
    )
    
    model = create_model()
    
    # 早期終了
    early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
    
    # 早期終了するのでval_lossが小さいモデルを保存
    model_path = '../model/' +  'Train006_' + "fold" + str(i) + "_best_model.h5"
    checkpoint = ModelCheckpoint(
                    filepath=model_path,
                    monitor='val_loss',
                    save_best_only=True,
                    period=1)
    
    history = model.fit(
        train_datagenerator,
        steps_per_epoch=int(len(train)//batch_size),
        epochs=epochs,
        validation_data=valid_datagenerator,
        validation_steps=int(len(valid)//batch_size),
        verbose=1,
        shuffle=True,
        callbacks=[early_stop, checkpoint])

Found 51435 validated image filenames belonging to 10 classes.
Found 12859 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 00006: early stopping
Found 51435 validated image filenames belonging to 10 classes.
Found 12859 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 00009: early stopping
Found 51435 validated image filenames belonging to 10 classes.
Found 12859 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 00008: early stopping
Found 51435 validated image filenames belonging to 10 classes.
Found 12859 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 00009: early stopping


## 推論

In [19]:
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')
columns = submit.columns.values
labels = submit.columns[1:].values

test_generator = test_datagen.flow_from_dataframe(
    submit,
    directory='../data/input/imgs/test/',
    x_col='img',
    y_col='c0', # ダミー変数
    target_size=(img_size, img_size),
    class_mode=None,
    batch_size=1,
    verbose=1,
    shuffle=False
)

def inference(fold_num):
    model = create_model()
    weight_path = '../model/Train006_fold' + str(fold_num) + '_best_model.h5'
    output_path = '../data/output/Train006_fold' + str(fold_num) + '_sub.csv'
    model.load_weights(weight_path)
    
    pred = model.predict(test_generator, verbose=1)
    pred_df = pd.DataFrame(columns=columns)
    pred_df['img'] = submit['img']
    pred_df[labels] = pred
    pred_df.to_csv(output_path, index=False)

Found 79726 validated image filenames.


In [20]:
for i in range(1,6):
    inference(i)



### アンサンブル：単純平均

In [22]:
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')
columns = submit.columns.values
labels = submit.columns[1:].values

ensemble = 0
for i in range(1,6):
    path = "../data/output/Train006_fold" + str(i) +"_sub.csv"
    ensemble += pd.read_csv(path).values[:,1:] / 5 # fold数で割る

ensemble_df = pd.DataFrame(columns=columns)
ensemble_df['img'] = submit['img']
ensemble_df[labels] = ensemble

ensemble_df.to_csv("../data/output/Train006_ensemble_sub.csv", index=False)

### Train002のアンサンブルとTrain006のアンサンブルのアンサンブル

In [27]:
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')
columns = submit.columns.values
labels = submit.columns[1:].values

ensemble = 0
# for i in range(1,6):
path1 = "../data/output/Train006_ensemble_sub.csv"
path2 = "../data/output/Train002_ensemble_sub.csv"
ensemble += pd.read_csv(path1).values[:,1:] * 0.2
ensemble += pd.read_csv(path2).values[:,1:] * 0.8

ensemble_df = pd.DataFrame(columns=columns)
ensemble_df['img'] = submit['img']
ensemble_df[labels] = ensemble

ensemble_df.to_csv("../data/output/Train002_and_Train006_ensemble_sub.csv", index=False)