In [1]:
import pandas as pd

## EDA: ドライバーごとのデータ数

In [2]:
# Load the image/label table and the per-image driver (subject) list
df = pd.read_csv("../data/input/csvs/train.csv")
driver = pd.read_csv("../data/input/csvs/driver_imgs_list.csv")

In [4]:
# Number of distinct drivers (subjects)
driver.subject.nunique()

26

In [12]:
# Number of rows (images) per driver
driver.groupby("subject").size()

subject
p002     725
p012     823
p014     876
p015     875
p016    1078
p021    1237
p022    1233
p024    1226
p026    1196
p035     848
p039     651
p041     605
p042     591
p045     724
p047     835
p049    1011
p050     790
p051     920
p052     740
p056     794
p061     809
p064     820
p066    1034
p072     346
p075     814
p081     823
dtype: int64

ドライバーごとのデータ数と、各ラベルが占める割合（%）を算出

In [9]:
def print_classname_size(driver_name):
    """Print the row count and the per-class share (in percent) for one driver.

    Reads the module-level ``driver`` DataFrame (columns ``subject`` and
    ``classname``). Classes are listed in order of first appearance among the
    driver's rows; each share is rounded to one decimal place.

    Args:
        driver_name: value of the ``subject`` column to report on.

    Returns:
        None. Everything is written to stdout.
    """
    # Filter once and reuse the slice for the total and for every class
    rows_for_driver = driver[driver.subject == driver_name]
    total = len(rows_for_driver)

    print(f"Driver name: {driver_name}\n")
    print(f"->Sum rows per a driver: {total}")
    print()

    for classname in rows_for_driver["classname"].unique():
        class_rows = len(rows_for_driver[rows_for_driver.classname == classname])
        share = round((class_rows / total) * 100, 1)
        print(f"{classname} ratio: {share}%")
    

In [11]:
# Print the class breakdown for every driver found in the driver list
drivers = driver.subject.unique()
for d in drivers:
    print_classname_size(d)

Driver name: p002

->Sum rows per a driver: 725

c0 ratio: 10.5%
c1 ratio: 10.2%
c2 ratio: 11.9%
c3 ratio: 10.9%
c4 ratio: 11.6%
c5 ratio: 10.5%
c6 ratio: 11.4%
c7 ratio: 9.9%
c8 ratio: 6.1%
c9 ratio: 7.0%
Driver name: p012

->Sum rows per a driver: 823

c0 ratio: 10.2%
c1 ratio: 11.5%
c2 ratio: 11.1%
c3 ratio: 10.8%
c4 ratio: 11.8%
c5 ratio: 11.7%
c6 ratio: 9.1%
c7 ratio: 8.7%
c8 ratio: 7.5%
c9 ratio: 7.5%
Driver name: p014

->Sum rows per a driver: 876

c0 ratio: 11.4%
c1 ratio: 11.8%
c2 ratio: 11.4%
c3 ratio: 11.4%
c4 ratio: 11.8%
c5 ratio: 11.6%
c6 ratio: 11.5%
c7 ratio: 8.8%
c8 ratio: 4.3%
c9 ratio: 5.9%
Driver name: p015

->Sum rows per a driver: 875

c0 ratio: 9.0%
c1 ratio: 9.7%
c2 ratio: 10.1%
c3 ratio: 10.7%
c4 ratio: 11.5%
c5 ratio: 11.5%
c6 ratio: 11.3%
c7 ratio: 9.3%
c8 ratio: 9.8%
c9 ratio: 7.0%
Driver name: p016

->Sum rows per a driver: 1078

c0 ratio: 10.3%
c1 ratio: 9.5%
c2 ratio: 9.4%
c3 ratio: 11.9%
c4 ratio: 9.6%
c5 ratio: 9.6%
c6 ratio: 10.0%
c7 ratio: 9.4%
c8 rat

## ライブラリのインポート

In [36]:
import os
import random

import numpy as np
import pandas as pd
# import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.utils import plot_model, to_categorical, multi_gpu_model
from tensorflow.keras.layers import Input, Dense, Conv2D, Activation
from tensorflow.keras.layers import MaxPooling2D, UpSampling2D, BatchNormalization, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0

In [15]:
def set_randvalue(value):
    """Seed the Python, NumPy and TensorFlow random generators with ``value``.

    Also exports ``value`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        value: the seed to apply everywhere.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment can only influence child processes, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(value)

    # Same seed, in order: builtin `random`, NumPy, TensorFlow
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(value)

# Seed every random generator with one fixed value; seed_value is reused later
# as the shuffle seed of the data generators.
seed_value = 42
set_randvalue(seed_value)

## CSVロード

In [17]:
# train.csv was built from the original data, which was stored in one directory per label
df = pd.read_csv("../data/input/csvs/train.csv")
# Labels are kept as strings; presumably because flow_from_dataframe with
# class_mode='categorical' (used in the training cell) works on string class names.
df["label"] = df["label"].astype(str)
# df["img"] = df["image"]
df.head(3)

Unnamed: 0,image,label
0,img_100026.jpg,0
1,img_10003.jpg,0
2,img_100050.jpg,0


In [26]:
# Copy `img` into an `image` column so the driver list shares a key name with df
driver["image"] = driver["img"]

In [28]:
# Merge the driver csv into the label csv on the image file name,
# attaching `subject` and `classname` to every labelled image
merge_df = pd.merge(df, driver, on='image')
merge_df

Unnamed: 0,image,label,subject,classname,img
0,img_100026.jpg,0,p015,c0,img_100026.jpg
1,img_10003.jpg,0,p022,c0,img_10003.jpg
2,img_100050.jpg,0,p022,c0,img_100050.jpg
3,img_100074.jpg,0,p051,c0,img_100074.jpg
4,img_10012.jpg,0,p002,c0,img_10012.jpg
...,...,...,...,...,...
22419,img_99761.jpg,9,p014,c9,img_99761.jpg
22420,img_99801.jpg,9,p050,c9,img_99801.jpg
22421,img_99927.jpg,9,p045,c9,img_99927.jpg
22422,img_9993.jpg,9,p042,c9,img_9993.jpg


## モデル作成

In [47]:
# Build the EfficientNetB0-based classifier
def create_model(weight_flg=False):
    """Create and compile an EfficientNetB0 backbone with a 10-way softmax head.

    The backbone is built without its top layers, with global average pooling
    and a (224, 224, 3) input. The head is two Dense(512, relu) + Dropout(0.5)
    pairs followed by Dense(10, softmax).

    Args:
        weight_flg: when truthy, initialise the backbone from the local file
            "../model/efficientnetb0_notop.h5" (described in the original
            notes as ImageNet-pretrained weights); otherwise weights=None.

    Returns:
        The model, compiled with Adam, categorical cross-entropy and accuracy.
    """
    weight = "../model/efficientnetb0_notop.h5" if weight_flg else None

    # include_top=False: no fully connected layers from the original network
    backbone = EfficientNetB0(weights=weight, include_top=False, pooling='avg', input_shape=(224,224,3))

    features = backbone.output
    for _ in range(2):
        features = Dense(512, activation='relu')(features)
        features = Dropout(0.5)(features)
    probabilities = Dense(10, activation='softmax')(features)

    model = Model(backbone.input, probabilities)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Build the same classifier as create_model, but under a MirroredStrategy scope
def create_multigpu_model(weight_flg=False):
    """Create the EfficientNetB0 classifier inside a ``tf.distribute.MirroredStrategy`` scope.

    The architecture and compile settings are exactly those of ``create_model``;
    this function delegates to it so the two builders cannot drift apart.

    Args:
        weight_flg: forwarded to ``create_model``.

    Returns:
        The compiled model, created under ``strategy.scope()``.
    """
    # Logs the device each op is placed on. This is very verbose; kept because
    # the original enabled it.
    tf.debugging.set_log_device_placement(True)

    strategy = tf.distribute.MirroredStrategy()
    # Both model construction and compile() happen inside the scope
    with strategy.scope():
        return create_model(weight_flg)

## 学習

In [51]:
# Hyperparameters
img_size = 224    # images are resized to img_size x img_size by the generators
batch_size = 16
epochs = 250      # upper bound; the training cell also uses EarlyStopping

In [23]:
# Build the data generators.
# NOTE: there is deliberately no `rescale=1./255`. The Keras EfficientNet models
# contain their own rescaling layer and expect raw pixel values in [0, 255];
# rescaling here as well would feed the network inputs in [0, 1/255].
# Checkpoints trained with the old rescaled inputs do not match these generators
# and must be retrained.
train_datagen = ImageDataGenerator(
    rotation_range=20, # rotation range
    width_shift_range=30/img_size, # 30 pixel
    height_shift_range=30/img_size, # 30 pixel
    zoom_range=0.2
)

# Validation and test images are passed through without augmentation
valid_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

In [52]:
%%time
# Group-wise 5-fold cross-validation: all images of one driver (subject) go to
# either the training or the validation split of a fold, never both.
# (An earlier version used StratifiedKFold on df["label"] instead.)
# Labels are forced to str because class_mode='categorical' works on string classes.
cv_df = merge_df.astype({"label": str})
gkf = GroupKFold(n_splits = 5)
splitter = gkf.split(cv_df["image"], cv_df["label"], cv_df["subject"])

# Make sure the checkpoint directory exists before the first save
os.makedirs('../model/', exist_ok=True)

for i, (train_ids, valid_ids) in enumerate(splitter, 1):
    # Release the previous fold's Keras state so memory does not grow per fold
    tf.keras.backend.clear_session()

    # Build the fold data. The indices are positions in the frame given to
    # split(), so they must be applied to that same frame (not to `df`, whose
    # rows only line up with merge_df by coincidence of ordering).
    train, valid = cv_df.iloc[train_ids], cv_df.iloc[valid_ids]
    assert set(train["subject"]).isdisjoint(valid["subject"]), "driver present in both splits"

    train_datagenerator = train_datagen.flow_from_dataframe(
        train,
        directory='../data/input/imgs/train/imgs/',
        x_col='image',
        y_col='label',
        target_size=(img_size, img_size),
        class_mode='categorical',
        batch_size=batch_size,
        seed=seed_value
    )

    # shuffle=False: validation walks the same images in the same order every
    # epoch, so val_loss is comparable between epochs.
    valid_datagenerator = valid_datagen.flow_from_dataframe(
        valid,
        directory='../data/input/imgs/train/imgs/',
        x_col='image',
        y_col='label',
        target_size=(img_size, img_size),
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=False
    )

    model = create_model()
#     model = create_multigpu_model()

    # Early stopping on validation loss
    early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')

    # Training stops early, so keep the checkpoint with the lowest val_loss.
    # Saving happens once per epoch by default (the deprecated `period=1` is not needed).
    model_path = f'../model/Train003_fold{i}_best_model.h5'
    checkpoint = ModelCheckpoint(
                    filepath=model_path,
                    monitor='val_loss',
                    save_best_only=True)

    # validation_steps is omitted on purpose: Keras then runs the whole
    # validation generator, including the final partial batch, instead of
    # floor(len(valid) / batch_size) batches.
    # `shuffle` is not passed: fit() ignores it for generator input.
    history = model.fit(
        train_datagenerator,
        steps_per_epoch=len(train)//batch_size,
        epochs=epochs,
        validation_data=valid_datagenerator,
        verbose=1,
        callbacks=[early_stop, checkpoint])

Found 18017 validated image filenames belonging to 10 classes.
Found 4407 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 00011: early stopping
Found 17949 validated image filenames belonging to 10 classes.
Found 4475 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 00014: early stopping
Found 18060 validated image filenames belonging to 10 classes.
Found 4364 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 00007: early stopping
Found 17720 validated image filenames belonging to 10 classes.
Found 4704 validated image filenames belonging to 10 classes.
Epoch 1/250
Epoch 2/250
E

## 推論

In [53]:
# The sample submission provides the test file names and the output column layout
submit = pd.read_csv('../data/input/csvs/sample_submission.csv')

# Test-time generator: images only (class_mode=None), in file order (shuffle=False).
# `verbose` was dropped: it is not a flow_from_dataframe parameter.
# The batch size follows the training setting instead of 1; with shuffle=False
# the prediction order is unchanged, only throughput differs.
test_generator = test_datagen.flow_from_dataframe(
    submit,
    directory='../data/input/imgs/test/',
    x_col='img',
    y_col='c0', # dummy column; no labels are produced with class_mode=None
    target_size=(img_size, img_size),
    class_mode=None,
    batch_size=batch_size,
    shuffle=False
)

# All submission columns, and the class-probability columns (everything after the first)
columns = submit.columns.values
labels = submit.columns[1:].values

def inference(fold_num):
    """Predict the test set with one fold's best checkpoint and write a submission csv.

    Loads '../model/Train003_fold<fold_num>_best_model.h5' into a freshly built
    model, predicts over the module-level ``test_generator`` and writes
    '../data/output/Train003_fold<fold_num>_sub.csv' with the column layout of
    ``submit``.

    Args:
        fold_num: fold number used in the checkpoint and output file names.

    Returns:
        None. The result is written to disk.
    """
    weight_path = '../model/Train003_fold' + str(fold_num) + '_best_model.h5'
    output_path = '../data/output/Train003_fold' + str(fold_num) + '_sub.csv'

    # Release the previous fold's model before building a new one
    tf.keras.backend.clear_session()
    model = create_model()
    model.load_weights(weight_path)

    # The generator is shared between folds; rewind it so every fold starts at the first image
    test_generator.reset()
    pred = model.predict(test_generator, verbose=1)
    assert len(pred) == len(submit), "number of predictions differs from number of test rows"

    # One probability column per class, plus the file name, in the submission's column order
    pred_df = pd.DataFrame(pred, columns=labels)
    pred_df['img'] = submit['img'].values
    pred_df = pred_df[list(columns)]

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pred_df.to_csv(output_path, index=False)
    
# Write one submission file per fold (folds are numbered 1..5, matching the training loop)
for i in range(1,6):
    inference(i)

Found 79726 validated image filenames.


## アンサンブル：単純平均