In [7]:
import pandas as pd
import os
import matplotlib.pyplot as plt 
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim.lr_scheduler as lr_scheduler
import multiprocessing as mp 
import numpy as np
from sklearn.model_selection import StratifiedKFold 
import timm 
from tqdm import tqdm
from torch import nn

ModuleNotFoundError: No module named 'pandas'

In [None]:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

# Per-channel statistics passed to A.Normalize in both pipelines.
_CHANNEL_MEAN = (0.5, 0.5, 0.5)
_CHANNEL_STD = (0.5, 0.5, 0.5)


def _normalize_and_tensorize():
    """Return fresh Normalize + ToTensorV2 steps shared by both pipelines."""
    return [
        A.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
        ToTensorV2(transpose_mask=True),
    ]


# Training pipeline: random geometric augmentations followed by the shared tail.
# (A.ShiftScaleRotate(p=0.5) was tried here and is currently disabled.)
_train_augmentations = [
    A.Transpose(),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
]
train_transforms = A.Compose(_train_augmentations + _normalize_and_tensorize())

# Validation / inference pipeline: no augmentation, only the shared tail.
val_transforms = A.Compose(_normalize_and_tensorize())

In [None]:
!git clone https://github.com/Booss3my/CV_blueprints.git
    
from CV_blueprints.cached_dataset import CachedDataset
from CV_blueprints.utils import clear_memory_and_display_gpu_info

data_root = "/kaggle/input/eyes-on-the-ground/content"
train_root = os.path.join(data_root, "train")
test_root = os.path.join(data_root, "test")

In [None]:
# rm -r CV_blueprints/

In [None]:
# Fixed seed so the 20 % hold-out (and therefore the training set) is the same
# on every Restart & Run All; the original sample() call was unseeded.
HOLDOUT_SEED = 42

train_data = pd.read_csv(os.path.join(data_root,"train.csv"))
# Rows whose filename contains a space are discarded
# (presumably those files cannot be loaded -- TODO confirm), then exact duplicate rows.
remove_lines = [fn for fn in train_data.filename if " " in fn]
train_data = train_data.loc[~train_data.filename.isin(remove_lines)].drop_duplicates().reset_index(drop=True)
holdout = train_data.sample(frac=0.2,ignore_index=True,random_state=HOLDOUT_SEED)
# Exclude by filename (not by row index) so that no image present in the
# hold-out remains in the training frame.
train_data = train_data.loc[~train_data.filename.isin(holdout.filename)].reset_index(drop=True)


In [None]:
# damage = list(train_data.damage.unique())
# growth_stage = list(train_data.growth_stage.unique())
# f, ax = plt.subplots(len(damage),len(growth_stage),figsize=(15,15))
# plt.tight_layout()
# for i,d in enumerate(damage):
#     for j,gs in enumerate(growth_stage):
#         pool = train_data.loc[(train_data.damage==d)&(train_data.growth_stage==gs)]
#         if len(pool)>=1:
#             image_name = pool.sample(1).filename.item()
#             ax[i,j].imshow(cv2.imread(os.path.join(train_root,image_name)))
#             ax[i,j].title.set_text("damage: "+d+" , gr stage: "+gs)

In [None]:
# Number of epochs run for each fold; read by train() for its epoch loop and progress message.
NUM_EPOCHS = 9

## Learning-rate scheduler and checkpoint helper

In [None]:
def lr_lambda(epoch, base_lr=1, factor=0.06):
    """Inverse-time decay multiplier: ``base_lr / (1 + factor * epoch)``.

    Passed to ``lr_scheduler.LambdaLR`` in ``train``, which calls it with the
    epoch index only, so the defaults reproduce the previous hard-coded
    behaviour (multiplier 1.0 at epoch 0, decaying with ``factor=0.06``).

    Args:
        epoch: Zero-based scheduler step.
        base_lr: Multiplier returned at ``epoch == 0``.
        factor: Decay rate; larger values shrink the multiplier faster.

    Returns:
        The multiplier for the given epoch.
    """
    return base_lr / (1 + factor * epoch)

def save_state(model,epoch,fold,val_loss,train_loss):
    """Write a checkpoint to ``model_state_fl{fold}_ep{epoch}.pt`` in the working directory.

    Args:
        model: Module whose ``state_dict()`` is stored.
        epoch: Epoch index, stored and used in the file name.
        fold: Fold index, stored and used in the file name.
        val_loss: Validation loss; a Python number or a one-element tensor.
        train_loss: Training loss; a Python number or a one-element tensor.

    The two losses are converted to plain floats before pickling. Storing the
    tensors themselves would tie the checkpoint to the device they live on and
    would serialise tensors that may still be attached to an autograd graph.
    """
    state = {
        'epoch': epoch,
        'fold': fold,
        'state_dict': model.state_dict(),
        'val_loss': float(val_loss),
        'train_loss': float(train_loss),
    }
    torch.save(state, f"model_state_fl{fold}_ep{epoch}.pt")

# epochs = np.arange(30)
# lrs=[lr_lambda(e) for e in epochs]
# plt.plot(epochs,lrs)

In [None]:
def run_test(dataloader,model):
    """Evaluate ``model`` on every batch of ``dataloader`` and return the mean batch RMSE.

    Labels are divided by 100 before comparison, matching the scaling used in
    ``train``. The model is switched to eval mode and left there; the caller is
    responsible for calling ``model.train()`` afterwards.

    Args:
        dataloader: Yields ``(images, labels)`` batches; its dataset must
            expose ``set_use_cache``.
        model: Network to evaluate; inputs are moved to the global ``device``.

    Returns:
        Mean of the per-batch RMSE values as a Python float.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    criterion = nn.MSELoss()
    model.eval()
    batch_losses = []
    with torch.no_grad():
        # Iterate the loader itself. The previous `next(iter(dataloader))`
        # created a fresh iterator on every step, so only the first batch of
        # each new iterator was ever evaluated.
        for val_data, val_lab in tqdm(dataloader):
            val_output = model(val_data.to(device))
            batch_rmse = torch.sqrt(criterion(val_output.squeeze(),(val_lab/100).to(device)))
            batch_losses.append(batch_rmse.item())
    if not batch_losses:
        raise ValueError("run_test received a dataloader that yields no batches")
    val_loss = sum(batch_losses)/len(batch_losses)
    print(f' ---- validation loss = {val_loss}')
    # Every sample has now been loaded once, so later passes may use the cache.
    dataloader.dataset.set_use_cache(True)
    return val_loss

    
def train(model,batch_size=128,lr=0.02,grad_acc = 4):
    """Train ``model`` over 4 stratified folds of ``train_data`` with an RMSE loss.

    Targets are ``train_data.extent / 100``. Gradients are accumulated over
    ``grad_acc`` batches before each optimizer step. Validation runs on odd
    epoch indices; a checkpoint is written through ``save_state`` when the
    validation loss improves on the best value seen so far and ``epoch > 3``.

    Args:
        model: Network to optimise in place; inputs are moved to the global ``device``.
        batch_size: Batch size for both the training and validation loaders.
        lr: Initial Adam learning rate, scaled each epoch by ``lr_lambda``.
        grad_acc: Number of batches whose gradients are accumulated per step.

    NOTE(review): the model, optimizer, scheduler and ``smallest_val`` are
    shared across folds, so later folds validate on images already used for
    training in earlier folds. Kept as in the original; confirm this is intended.
    """
    skf = StratifiedKFold(n_splits=4)
    image_paths = np.array([os.path.join(train_root,fn) for fn in train_data.filename])
    labels = torch.tensor(train_data.extent.to_numpy()).type(torch.float)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(lr=lr,params=model.parameters())
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda)

    smallest_val = 1000

    for fold, (train_index, val_index) in enumerate(skf.split(image_paths,labels)):
        train_dataset = CachedDataset(image_paths[train_index],labels[train_index],tfs=train_transforms)
        val_dataset = CachedDataset(image_paths[val_index],labels[val_index],tfs=val_transforms)

        val_dataloader=DataLoader(val_dataset,batch_size=batch_size,shuffle=True)
        train_dataloader=DataLoader(train_dataset,batch_size=batch_size,num_workers=2,shuffle=True)
        num_batches = len(train_dataloader)

        for epoch in range(NUM_EPOCHS):
            # Set train mode up front, so an earlier eval() call (validation or
            # inference) cannot leak into the first epoch.
            model.train()
            optimizer.zero_grad()
            running_loss = 0.0

            progress = tqdm(train_dataloader,f'Iterating through {num_batches} batches')
            for i,(training_data,training_lab) in enumerate(progress):
                output = model(training_data.to(device))

                loss = torch.sqrt(criterion(output.squeeze(),(training_lab/100).to(device))) #RMSE loss

                # .item() detaches the value; summing the tensors themselves
                # kept every batch's autograd graph alive for the whole epoch.
                running_loss += loss.item()
                (loss/grad_acc).backward()

                # Step on every full accumulation group and also on the last
                # batch, so a trailing partial group is neither dropped nor
                # carried over into the next epoch.
                if i%grad_acc==grad_acc-1 or i==num_batches-1:
                    optimizer.step()
                    optimizer.zero_grad()

            train_dataloader.dataset.set_use_cache(True)   #use cache
            running_loss = running_loss/max(num_batches,1)

            #validation run
            if epoch%2==1:
                val_loss = run_test(val_dataloader,model)
                if val_loss<smallest_val and epoch > 3:
                    save_state(model,epoch,fold,val_loss,running_loss)
                    smallest_val = val_loss

            scheduler.step()
            print(f'epoch[{epoch+1}]/[{NUM_EPOCHS}]  training loss --- {running_loss} --- lr-- {optimizer.param_groups[0]["lr"]}')

        del train_dataloader
        del val_dataloader

In [None]:
base_model = timm.create_model('efficientnet_b1_pruned', pretrained=True)
# Swap the pretrained classifier for a single-output regression head. The input
# width is read from the existing head instead of hard-coding 1280, so the line
# keeps working if the backbone name above is changed.
base_model.classifier = nn.Linear(in_features=base_model.classifier.in_features,out_features=1,bias=True)

In [None]:
%%capture
class ETG_model(nn.Module):
    """Backbone followed by a one-unit linear layer squashed through a sigmoid.

    Args:
        base: Feature extractor whose output feeds the linear layer.
        base_out: Size of the last dimension produced by ``base``.
    """

    def __init__(self, base, base_out):
        super().__init__()
        self.base = base
        self.fc = nn.Linear(in_features=base_out, out_features=1)
        self.sigmoid = nn.Sigmoid()
        # Registered but not used by forward().
        self.Relu = nn.ReLU()

    def forward(self, y):
        """Return ``sigmoid(fc(base(y)))``."""
        features = self.base(y)
        logits = self.fc(features)
        return self.sigmoid(logits)
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = base_model
# Leave device_ids at its default so DataParallel uses whichever GPUs are
# visible; the previous hard-coded [0, 1] requires exactly two GPUs to exist.
# The wrapper is always applied, so state_dict keys keep their "module." prefix.
model = nn.DataParallel(model)
model.to(device)

In [None]:
# Run the 4-fold training loop. train() steps the optimizer every `grad_acc` batches,
# so batch_size=64 with grad_acc=10 accumulates gradients over up to 640 samples per update.
train(model, batch_size=64, lr=0.001,grad_acc=10)

In [None]:
# Helper imported from CV_blueprints.utils; presumably frees cached GPU memory
# and prints current usage -- TODO confirm against the helper's source.
clear_memory_and_display_gpu_info()

# **Generate submission**

In [None]:
# state =torch.load("/kaggle/working/model_state_fl1_ep1.pt")

In [None]:
# model.load_state_dict(state["state_dict"])

In [None]:
test_csv = pd.read_csv("/kaggle/input/subm-file/Test.csv")
submission_file = pd.read_csv("/kaggle/input/subm-file/SampleSubmission.csv")
test_fnames = submission_file.merge(test_csv, how="inner", on="ID").filename
# One prediction is written per submission row below, so the merge must be 1:1.
# Fail here, before running inference, if IDs were dropped or duplicated.
assert len(test_fnames) == len(submission_file), "Test.csv and SampleSubmission.csv IDs do not match 1:1"

test_paths = np.array([os.path.join(test_root,fn) for fn in test_fnames])
# Placeholder labels: the dataset requires them, but they are ignored below.
test_labels = torch.tensor(np.zeros(test_paths.shape))

test_dataset = CachedDataset(test_paths,test_labels,tfs=val_transforms,cache=False)
test_dataloader = DataLoader(test_dataset,batch_size = 256,num_workers=2,shuffle=False)

# Switch to eval mode once and collect per-batch outputs, concatenating a
# single time instead of re-allocating the growing tensor on every batch.
model.eval()
batch_outputs = []
with torch.no_grad():
    for data,_ in tqdm(test_dataloader):
        batch_outputs.append(model(data.to(device)))
test_output = torch.cat(batch_outputs)

submission = submission_file.copy()
# Predictions are on the extent/100 scale used in training; rescale, truncate
# to int and flatten to 1-D so the column assignment is unambiguous.
submission["extent"] = (100*abs(test_output)).type(torch.int).cpu().reshape(-1).numpy()
submission.to_csv("submission_ob.csv",index=False)