In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to the project's .py
# files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Point the session at the project checkout: export PYTHONPATH and
# PROJECT_PATH, then make the checkout the working directory.
# NOTE(review): all three paths are absolute and specific to one user's
# machine; the notebook will not run elsewhere without editing them.
# NOTE(review): PYTHONPATH is set after the kernel has already started, so it
# presumably does not change this process's import path; the project imports
# below likely resolve because of the %cd instead -- confirm.
%env PYTHONPATH = /home/adityasidharta/git/kaggle_humpback_new_whale
%env PROJECT_PATH = /home/adityasidharta/git/kaggle_humpback_new_whale
%cd /home/adityasidharta/git/kaggle_humpback_new_whale

In [None]:
%matplotlib inline
import datetime
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import os
from model.train import fit_model
from sklearn.externals import joblib
from utils.envs import *
from utils.config import *
from model.validation import validate_model, new_whale_threshold
from torch.optim.lr_scheduler import ReduceLROnPlateau
from model.test import predict_model
from model.functions import loss_fn, metric_fn, pred_fn
from utils.common import get_label, remove_new_whale, create_label, create_kaggle_submission, get_categories
from model.arch import se_resnext101
from model.dataset import train_transform, test_transform, TestDataset, TrainDataset
from sklearn.model_selection import train_test_split

In [None]:
# Number of threads PyTorch may use for CPU operations in this session.
TORCH_NUM_THREADS = 12
torch.set_num_threads(TORCH_NUM_THREADS)

In [None]:
# Load the training table and derive the category list from the full table
# (before any rows are held out or filtered).
train_df = pd.read_csv(train_repo)
categories = get_categories(train_df)

# Hold out 20% of the rows for validation. The split is seeded so that
# Restart & Run All reproduces the same dev/val partition; without a
# random_state every run drew a different split.
SPLIT_RANDOM_STATE = 42
dev_df, val_df = train_test_split(
    train_df, test_size=0.20, random_state=SPLIT_RANDOM_STATE
)

# Only the development portion goes through remove_new_whale; val_df is kept
# exactly as produced by the split.
dev_df = remove_new_whale(dev_df)

In [None]:
# Build the label artefacts from the full training table, then persist the
# fitted `ohe_model` under artifacts_path so it can be reloaded later.
ohe_model, train_image_label, train_ohe_label = create_label(
    train_df, categories
)
ohe_model_path = os.path.join(artifacts_path, 'ohe_model.pkl')
joblib.dump(ohe_model, ohe_model_path)

In [None]:
# Image names and encoded labels for the development and validation splits,
# both produced with the same `ohe_model`.
dev_image_label, dev_ohe_label = get_label(dev_df, ohe_model)
val_image_label, val_ohe_label = get_label(val_df, ohe_model)

# Test images are every '.jpg' file directly inside test_path. os.listdir
# returns entries in arbitrary order, so the names are sorted to make the
# test ordering (and therefore the submission row order) reproducible.
test_image_label = sorted(
    filename for filename in os.listdir(test_path) if filename.endswith('.jpg')
)

In [None]:
# Development and validation images are both read from train_path; only the
# development set uses train_transform, the other two use test_transform.
dev_dataset = TrainDataset(
    dev_image_label,
    dev_ohe_label,
    train_path,
    train_transform,
    DEVICE,
)
val_dataset = TrainDataset(
    val_image_label,
    val_ohe_label,
    train_path,
    test_transform,
    DEVICE,
)
# The test set has no labels, hence the separate dataset class.
test_dataset = TestDataset(
    test_image_label,
    test_path,
    test_transform,
    DEVICE,
)

In [None]:
# Options common to all three loaders: batches of 16, loaded in the main
# process (num_workers=0).
shared_loader_options = dict(batch_size=16, num_workers=0)

# Only the development loader shuffles; validation and test keep dataset
# order so that predictions line up with their image lists.
dev_dataloader = DataLoader(dev_dataset, shuffle=True, **shared_loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **shared_loader_options)
test_dataloader = DataLoader(test_dataset, shuffle=False, **shared_loader_options)

In [None]:
# Training configuration: size of the output layer and number of epochs.
N_CLASSES = 5005
n_epoch = 60

# NOTE(review): .cuda() hard-codes a CUDA device even though DEVICE is also
# passed to the constructor; consider .to(DEVICE) once it is confirmed what
# DEVICE holds.
model = se_resnext101(n_classes=N_CLASSES, pretrained=True, device=DEVICE).cuda()

# Adam over all model parameters, with a binary cross-entropy loss that is
# applied to raw logits.
optimizer = optim.Adam(model.parameters(), lr=0.005)
criterion = nn.BCEWithLogitsLoss()

# NOTE(review): the scheduler is constructed here but is not handed to
# fit_model in the visible cells, so it presumably never steps -- confirm
# whether the learning-rate schedule is meant to be active.
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

In [None]:
# Train on the development loader while also supplying the validation loader;
# checkpointing is switched on with "pytorch" as the model filename.
fit_options = dict(
    val_dataloader=val_dataloader,
    checkpoint=True,
    model_filename="pytorch",
)
model = fit_model(
    model, n_epoch, dev_dataloader, optimizer, criterion, loss_fn, metric_fn,
    **fit_options
)

In [None]:
# Score the trained model on the validation loader and report both numbers.
total_loss, total_metric = validate_model(
    model, criterion, loss_fn, metric_fn, val_dataloader
)
print(
    "Total Loss : {}".format(total_loss),
    "Total Metric : {}".format(total_metric),
    sep="\n",
)

In [None]:
# NOTE(review): pred_proba_fn is not among the explicit imports at the top of
# the notebook (only loss_fn, metric_fn and pred_fn are imported from
# model.functions); unless one of the star imports provides it, this cell
# raises NameError on a fresh kernel -- confirm where it is defined.
# The three leading numbers are presumably the (start, stop, step) of the
# threshold sweep -- verify against new_whale_threshold.
threshold_sweep_args = (0.01, 1.0, 0.01)
threshold_df = new_whale_threshold(
    *threshold_sweep_args, model, pred_proba_fn, val_dataloader
)

In [None]:
# Write the threshold table to artifacts_path under a name stamped with the
# current date and time (minute resolution), without the index column.
threshold_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M')
threshold_csv_name = 'threshold_{}.csv'.format(threshold_stamp)
threshold_df.to_csv(os.path.join(artifacts_path, threshold_csv_name), index=False)

In [None]:
# Threshold handed to predict_model for the test-set predictions.
# NOTE(review): this value is a fixed literal; threshold_df from the sweep
# above is not consulted here -- confirm 0.50 is the intended choice.
PREDICT_THRESHOLD = 0.50
result = predict_model(
    model, test_dataloader, pred_fn, threshold=PREDICT_THRESHOLD
)

In [None]:
# Combine the test image names, the model predictions and the fitted
# `ohe_model` into the submission table via create_kaggle_submission.
# presumably ohe_model is used to map predictions back to label names -- verify.
result_df = create_kaggle_submission(test_image_label, result, ohe_model)

In [None]:
# Write the submission table to result_path under a name stamped with the
# current date and time (minute resolution), without the index column.
submission_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M')
submission_csv_name = 'submission_{}.csv'.format(submission_stamp)
result_df.to_csv(os.path.join(result_path, submission_csv_name), index=False)