# Important notice
- Run this notebook in Docker

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.io import read_file, decode_png
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization, AveragePooling2D, GlobalAveragePooling2D

In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Root folder of the dataset, supplied through the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if not DATA_DIR:
    # os.getenv returns None when the variable is unset, which would otherwise
    # be formatted into the bogus path "None/processed" and only fail later,
    # at the first file read, with a confusing "file not found" error.
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a .env file."
    )

# Folder holding the processed images and CSV files used by the cells below.
PROCESSED_DATA_DIR = f"{DATA_DIR}/processed"

In [None]:
# Read the raw bytes of sample image 0.png from the training set and decode
# them into a 3-channel tensor in one expression. The result is only used to
# derive the image size further down.
img = decode_png(
    read_file(f"{PROCESSED_DATA_DIR}/images/train/not_rotten/0.png"),
    channels=3,
)

### Define image loading constants

In [None]:
# Seed handed to the directory iterators, and the fraction of the training
# folder reserved for validation.
SEED = 42
VALIDATION_SPLIT = 0.2

# Number of images per batch produced by the training/validation iterators.
BATCH_SIZE = 8

# Side length used for both target dimensions, taken from the first axis of the
# decoded sample image.
# NOTE(review): this presumes the processed images are square - confirm.
IMAGE_SIZE = img.shape[0]

### Load data

In [None]:
# Read the train and test CSV files from the processed data folder
# (train first, then test).
train_df, test_df = (
    pd.read_csv(f"{PROCESSED_DATA_DIR}/train.csv"),
    pd.read_csv(f"{PROCESSED_DATA_DIR}/test.csv"),
)

In [None]:
# Generator for the training subset: pixel values are scaled into [0, 1] and
# VALIDATION_SPLIT of the files is set aside for the validation subset.
# Augmentation is currently switched off; the options that were tried are
# rotation_range=15, width_shift_range=0.1, height_shift_range=0.1,
# horizontal_flip=True and vertical_flip=True.
train_generator = ImageDataGenerator(
    validation_split=VALIDATION_SPLIT,
    rescale=1.0 / 255,
)

# Stream the "training" portion of the train folder as batches of square
# images with binary labels.
train_ds = train_generator.flow_from_directory(
    directory=f"{PROCESSED_DATA_DIR}/images/train",
    subset="training",
    class_mode="binary",
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    seed=SEED,
)

In [None]:
# Generator for the validation subset: same rescaling and same split fraction
# as the training generator, reading from the same folder.
validation_generator = ImageDataGenerator(
    validation_split=VALIDATION_SPLIT,
    rescale=1.0 / 255,
)

# Stream the "validation" portion of the train folder as batches of square
# images with binary labels.
validation_ds = validation_generator.flow_from_directory(
    directory=f"{PROCESSED_DATA_DIR}/images/train",
    subset="validation",
    class_mode="binary",
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    seed=SEED,
)

### Define modeling constants

In [None]:
# Loss passed to model.compile for the single-unit sigmoid output.
LOSS_FUNCTION = "binary_crossentropy"

# Label mode of the task.
# NOTE(review): the directory iterators in the visible cells hard-code
# "binary" instead of reading this constant.
CLASS_MODE = "binary"

### Create the model

In [None]:
# Small CNN assembled layer by layer: five Conv -> BatchNorm -> MaxPool stages
# with the filter count doubling each stage (16 up to 256), followed by global
# average pooling, dropout and a single sigmoid unit.
model = Sequential()

# Stage 1 (also declares the input shape: square RGB images).
model.add(Conv2D(filters=16, kernel_size=(3, 3), activation='relu',
                 input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stage 2
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stage 3
model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stage 4
model.add(Conv2D(filters=128, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stage 5
model.add(Conv2D(filters=256, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Classification head: collapse the spatial dimensions, regularise, and emit
# one sigmoid value per image.
model.add(GlobalAveragePooling2D())
model.add(Dropout(rate=0.5))
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the network as a quick sanity check
# before training.
model.summary()

In [None]:
# Configure training: Adam optimizer, the loss named by LOSS_FUNCTION, and
# accuracy tracked as the reported metric.
model.compile(loss=LOSS_FUNCTION, optimizer='adam', metrics=['accuracy'])

### Define early stopping callback

In [None]:
# Stop training once the validation loss has not improved for 10 consecutive
# epochs, and roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=10,
    verbose=1,
)

### Train the model

In [None]:
# Train on the directory iterators and keep the per-epoch metrics in `history`.
#
# `batch_size` is deliberately NOT passed to fit(): train_ds and validation_ds
# already yield batches of BATCH_SIZE images, and the Keras documentation says
# not to specify `batch_size` when the input is a generator / Sequence.
#
# NOTE(review): with epochs=2 the EarlyStopping callback (patience=10) can
# never trigger; raise `epochs` for a real training run.
history = model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=2,
    verbose=1,
    callbacks=[early_stopping],
)

### Evaluate the model

In [None]:
# Plot the training curves side by side: accuracy on the left, loss on the
# right. In each panel the validation series is drawn first, then the training
# series.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: accuracy per epoch.
ax1.plot(history.history['val_accuracy'], label='val_accuracy')
ax1.plot(history.history['accuracy'], label='accuracy')
ax1.set(xlabel='Epoch', ylabel='Accuracy')
ax1.legend()

# Right panel: loss per epoch.
ax2.plot(history.history['val_loss'], label='val_loss')
ax2.plot(history.history['loss'], label='loss')
ax2.set(xlabel='Epoch', ylabel='Loss')
ax2.legend()

plt.show()

### Save the model

In [None]:
# Persist the trained model to models/model.h5; the path is relative to the
# current working directory of the notebook.
model.save("models/model.h5")

### Generate Kaggle predictions

In [None]:
# Test-time generator: only rescales pixel values, no split.
test_generator = ImageDataGenerator(rescale=1.0 / 255.0)

# Iterate the test folder without labels (class_mode=None) and without
# shuffling, so the files are read in a fixed order.
test_ds = test_generator.flow_from_directory(
    directory=f"{PROCESSED_DATA_DIR}/images/test",
    target_size=(IMAGE_SIZE, IMAGE_SIZE),
    shuffle=False,
    class_mode=None,
)

In [None]:
# Run inference over the test iterator. The model ends in a single sigmoid
# unit, so each prediction is one value per image; test_ds was created with
# shuffle=False, which the submission cell relies on to pair predictions with
# test_ds.filenames.
predictions = model.predict(test_ds)

### Save submission

In [None]:
def get_image_ids(files):
    """Return the image id of every path in *files*.

    The id is the final path component (both '/' and '\\' are treated as
    separators) truncated at its first '.', e.g. "test/12.png" -> "12".
    """
    image_ids = []
    for path in files:
        # Normalise Windows separators so one split finds the base name.
        basename = path.replace('\\', '/').rsplit('/', 1)[-1]
        # Keep only the text before the first dot.
        image_ids.append(basename.partition('.')[0])
    return image_ids

# Derive one id per test file, in the same order the iterator read them.
test_image_ids = get_image_ids(test_ds.filenames)

# Pair every id with its predicted score; flatten() turns the per-image
# prediction column into a 1-D array.
submission_df = pd.DataFrame({
    'index': test_image_ids,
    'rot': predictions.flatten(),
})

# NOTE(review): the ids are strings, so this ordering is lexicographic
# ("10" sorts before "2") - confirm that is acceptable for the submission.
submission_df.sort_values(by='index', inplace=True)

# Make sure the output folder exists: to_csv does not create missing
# directories and would fail if this cell runs before anything created it.
os.makedirs("models", exist_ok=True)
submission_df.to_csv("models/submission.csv", index=False, float_format='%.16f')