## ライブラリインストール

In [1]:
import os
import random

import numpy as np
import pandas as pd
# import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.layers import Input, Dense, Conv2D, Activation
from tensorflow.keras.layers import MaxPooling2D, UpSampling2D, BatchNormalization, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0

In [2]:
def set_randvalue(value):
    # Set a seed value
    seed_value= value 
    # 1. Set `PYTHONHASHSEED` environment variable at a fixed value
    os.environ['PYTHONHASHSEED']=str(seed_value)
    # 2. Set `python` built-in pseudo-random generator at a fixed value
    random.seed(seed_value)
    # 3. Set `numpy` pseudo-random generator at a fixed value
    np.random.seed(seed_value)
    # 4. Set `tensorflow` pseudo-random generator at a fixed value
    tf.random.set_seed(seed_value)

seed_value = 42
set_randvalue(seed_value)

## CSVロード & 半教師データ作成

In [3]:
# Train001でアンサンブルし作成したcsv
ensemble_df = pd.read_csv("../data/output/Train001_ensemble_sub.csv")
ensemble_df.head(3)

Unnamed: 0,img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,img_1.jpg,5e-06,2.1805e-10,1.019599e-07,2.198193e-07,1.692227e-08,0.9998,1.977041e-08,3.659118e-07,1.5e-05,0.000179
1,img_10.jpg,6e-06,1.0395e-10,1.336446e-08,2.174772e-07,2.943974e-09,0.999944,3.690732e-09,1.368859e-07,5e-06,4.4e-05
2,img_100.jpg,0.627001,0.004329754,0.001251251,0.01407811,0.008127563,0.016387,0.002568727,0.01114982,0.096248,0.218859


In [4]:
ensemble_values = ensemble_df.values[:,1:]

In [5]:
ensemble_max_values = np.max(ensemble_values, axis=1)
ensemble_max_indexes = np.argmax(ensemble_values, axis=1)
ensemble_df["max_pred"] = ensemble_max_values
ensemble_df["label"] = ensemble_max_indexes

In [6]:
ensemble_df.head(3)

Unnamed: 0,img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,max_pred,label
0,img_1.jpg,5e-06,2.1805e-10,1.019599e-07,2.198193e-07,1.692227e-08,0.9998,1.977041e-08,3.659118e-07,1.5e-05,0.000179,0.9998,5
1,img_10.jpg,6e-06,1.0395e-10,1.336446e-08,2.174772e-07,2.943974e-09,0.999944,3.690732e-09,1.368859e-07,5e-06,4.4e-05,0.999944,5
2,img_100.jpg,0.627001,0.004329754,0.001251251,0.01407811,0.008127563,0.016387,0.002568727,0.01114982,0.096248,0.218859,0.627001,0


In [7]:
# 信頼値90%より値が大きい行を取得
ensemble_df_over_thresh = ensemble_df[ensemble_df.max_pred > 0.95]

In [8]:
print("確率が95％より大きな行数",len(ensemble_df_over_thresh))

確率が95％より大きな行数 41563


In [9]:
ensemble_train_df = ensemble_df_over_thresh[["img","label"]]

In [10]:
ensemble_train_df["image"] = ensemble_train_df["img"]
ensemble_train_df = ensemble_train_df.drop(["img"],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensemble_train_df["image"] = ensemble_train_df["img"]


In [11]:
# 半教師あり学習用にDataFrame作成
train_df = pd.read_csv("../data/input/csvs/train.csv")
# train_df["img"] = train_df["image"]
merge_df = pd.concat([train_df, ensemble_train_df])
merge_df = merge_df.reset_index()
merge_df["label"] = merge_df["label"].astype(str)

In [12]:
# 学習データ数
len(merge_df)

63987

In [13]:
merge_df

Unnamed: 0,index,image,label
0,0,img_100026.jpg,0
1,1,img_10003.jpg,0
2,2,img_100050.jpg,0
3,3,img_100074.jpg,0
4,4,img_10012.jpg,0
...,...,...,...
63982,79720,img_99993.jpg,7
63983,79722,img_99995.jpg,3
63984,79723,img_99996.jpg,4
63985,79724,img_99998.jpg,6


In [14]:
# ラベルごとの数
merge_df.label.value_counts()

5    9327
3    9183
4    8227
2    7789
1    6393
7    6138
6    5953
8    4343
0    3428
9    3206
Name: label, dtype: int64

## モデル作成

In [15]:
# EfficientNetB0のモデルを作成
def create_model(weight_flg=False):
    weight = None
    if weight_flg:
        weight = "../model/efficientnetb0_notop.h5" # ImageNetで学習されたモデルをロード
    # include_top=False; 全結合層なし
    base_model = EfficientNetB0(weights=weight, include_top=False, pooling='avg', input_shape=(224,224,3))
    x = Dense(512, activation='relu')(base_model.output)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(10, activation='softmax')(x)
    model = Model(base_model.input, output)
    
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

## 学習

In [16]:
# Hyper parameters
img_size = 224
batch_size = 16
epochs = 250

In [17]:
# Generator作成
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20, # rotation range
    width_shift_range=30/img_size, # 30 pixel
    height_shift_range=30/img_size, # 30 pixel
    zoom_range=0.2
)

valid_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

In [20]:
%%time
# 学習と評価データでのラベルの分布数を一定に保ち学習
skf = StratifiedKFold(n_splits = 5)
splitter = skf.split(merge_df["image"],merge_df["label"])
for i, (train_ids, valid_ids) in enumerate(splitter, 1):
    # データ生成
    train, valid = merge_df.iloc[train_ids], merge_df.iloc[valid_ids]
    train_datagenerator = train_datagen.flow_from_dataframe(
        train,
        directory='../data/input/imgs/train/semi-supervised_imgs/',
        x_col='image',
        y_col='label',
        target_size=(img_size, img_size),
        class_mode='categorical',
        batch_size=batch_size,
        seed=seed_value
    )

    valid_datagenerator = valid_datagen.flow_from_dataframe(
        valid,
        directory='../data/input/imgs/train/semi-supervised_imgs/',
        x_col='image',
        y_col='label',
        target_size=(img_size, img_size),
        class_mode='categorical',
        batch_size=batch_size,
        seed=seed_value
    )
    
    model = create_model()
    
    # 早期終了
    early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
    
    # 早期終了するのでval_lossが小さいモデルを保存
    model_path = '../model/' +  'Train002_' + "fold" + str(i) + "_best_model.h5"
    checkpoint = ModelCheckpoint(
                    filepath=model_path,
                    monitor='val_loss',
                    save_best_only=True,
                    period=1)
    
    history = model.fit(
        train_datagenerator,
        steps_per_epoch=int(len(train)//batch_size),
        epochs=epochs,
        validation_data=valid_datagenerator,
        validation_steps=int(len(valid)//batch_size),
        verbose=1,
        shuffle=True,
        callbacks=[early_stop, checkpoint])

Found 51189 validated image filenames belonging to 10 classes.
Found 12798 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 00013: early stopping
Found 51189 validated image filenames belonging to 10 classes.
Found 12798 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 00013: early stopping
Found 51190 validated image filenames belonging to 10 classes.
Found 12797 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 00015: early stopping
Found 51190 validated image file

## 推論

In [22]:
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')
columns = submit.columns.values
labels = submit.columns[1:].values

test_generator = test_datagen.flow_from_dataframe(
    submit,
    directory='../data/input/imgs/test/',
    x_col='img',
    y_col='c0', # ダミー変数
    target_size=(img_size, img_size),
    class_mode=None,
    batch_size=1,
    verbose=1,
    shuffle=False
)

def inference(fold_num):
    model = create_model()
    weight_path = '../model/Train002_fold' + str(fold_num) + '_best_model.h5'
    output_path = '../data/output/Train002_fold' + str(fold_num) + '_sub.csv'
    model.load_weights(weight_path)
    
    pred = model.predict(test_generator, verbose=1)
    pred_df = pd.DataFrame(columns=columns)
    pred_df['img'] = submit['img']
    pred_df[labels] = pred
    pred_df.to_csv(output_path, index=False)

Found 79726 validated image filenames.


In [23]:
for i in range(1,6):
    inference(i)



### アンサンブル：単純平均

In [24]:
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')
columns = submit.columns.values
labels = submit.columns[1:].values

ensemble = 0
for i in range(1,6):
    path = "../data/output/Train002_fold" + str(i) +"_sub.csv"
    ensemble += pd.read_csv(path).values[:,1:] / 5 # fold数で割る

ensemble_df = pd.DataFrame(columns=columns)
ensemble_df['img'] = submit['img']
ensemble_df[labels] = ensemble

ensemble_df.to_csv("../data/output/Train002_ensemble_sub.csv", index=False)

#### アンサンブル：加重平均

fold数ごとのPublic scoreを元に加重平均  
  
 fold: Public score  
 fold1: 0.35424  
 fold2: 0.44200  
 fold3: 0.32399  
 fold4: 0.40812  
 fold5: 0.55640  

In [35]:
fold1_weight = 0.25
fold2_weight = 0.25
fold3_weight = 0.25
fold4_weight = 0.25
fold5_weight = 0.125

In [36]:
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')
columns = submit.columns.values
labels = submit.columns[1:].values

weighted_ensemble = 0
for i in range(1,6):
    path = "../data/output/Train002_fold" + str(i) +"_sub.csv"
    weight = 1
    if i == 1: weight = fold1_weight
    elif i == 2: weight = fold2_weight
    elif i == 3: weight = fold3_weight
    elif i == 4: weight = fold4_weight
    elif i == 5: weight = fold5_weight
        
    weighted_ensemble += pd.read_csv(path).values[:,1:] * weight # foldの重みを掛ける

ensemble_df = pd.DataFrame(columns=columns)
ensemble_df['img'] = submit['img']
ensemble_df[labels] = weighted_ensemble

ensemble_df.to_csv("../data/output/Train002_weighted_ensemble_sub.csv", index=False)