In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
from torch.nn import functional as F

from modules.loss import Multiclass_focal_loss
from modules.loss import Binary_focal_loss

from models.scn import SCNet
from modules.dataset import CropDataset

import lightgbm as lgb

In [2]:
# One seed for every source of randomness used in this notebook: torch
# (weight init, DataLoader shuffling), numpy's global RNG and the LightGBM
# booster (through "random_state" below).
seed = 9032020
torch.manual_seed(seed)
np.random.seed(seed)

# Class of the second-level (stacking) model; instantiated with **boost_settings.
model_class = lgb.LGBMClassifier

# Keyword arguments unpacked into train_model(); the keys must match its
# parameter names exactly.
settings = {
    'CHANNELS': 64,
    'NUM_CHANNELS': 6,
    'INPUT_SIZE': 5,
    'BANDS': 'B02 B03 B04 B08 B12 CLD'.split(' '),
    'NUM_EPOCHS': 16,
    'BATCH_SIZE': 128,
    'ADAM_LR': 1e-4,
    'DATA_FOLDER': 'dataframes/',
    # Use the first GPU when one is present, otherwise fall back to the CPU so
    # the notebook still runs on machines without CUDA.
    'DEVICE': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
}

# Keyword arguments for the second-level booster.
boost_settings = {
    "n_estimators": 128,
    "random_state": seed,
    "objective": "multiclass",
    "num_class": 7,
    "class_weight": None,
    "num_leaves": 60,
    "max_depth": -1,
    "learning_rate": 0.1,
    "subsample": 0.9,
    "colsample_bytree": 0.4,
    "verbose": 1,
}

In [3]:
def train_model(class_num, CHANNELS, NUM_CHANNELS, INPUT_SIZE, BANDS, NUM_EPOCHS, BATCH_SIZE, ADAM_LR, DATA_FOLDER, DEVICE):
    """Train a binary (one-vs-rest) SCNet for ``class_num`` and score the test set.

    Labels equal to ``class_num`` are mapped to 1 and every other label to 0.

    Parameters
    ----------
    class_num : label treated as the positive class; also passed to the
        training ``CropDataset`` as ``autobalance``.
    CHANNELS : forwarded to ``SCNet`` as ``out_channels``.
    NUM_CHANNELS : forwarded to ``SCNet`` as ``num_inputs``.
    INPUT_SIZE : forwarded to ``SCNet`` as ``input_size`` and to
        ``CropDataset`` as ``crop_size``.
    BANDS : forwarded to ``CropDataset`` as ``bands``.
    NUM_EPOCHS : number of passes over the training loader.
    BATCH_SIZE : batch size of the training and test loaders.
    ADAM_LR : learning rate of the Adam optimizer.
    DATA_FOLDER : prefix string-concatenated with ``'train_data.csv'`` and
        ``'test_data.csv'`` (so it must end with a path separator).
    DEVICE : torch device the model and the batches are moved to.

    Returns
    -------
    tuple ``(Model, F_IDS, PREDS)``: the trained model (left in eval mode),
    the ids yielded by the test loader and the model outputs for the test
    samples, both as flat lists in loader order (the test loader is not
    shuffled).

    Side effects
    ------------
    Writes the model's state dict to ``Model--{class_num}.pt`` in the current
    working directory.
    """
    # Load data.
    # NOTE: despite the .csv suffix these files are read with pandas' pickle
    # loader. Unpickling can execute arbitrary code, so only load files from a
    # trusted source.
    train_df = pd.read_pickle(DATA_FOLDER + 'train_data.csv')
    test_df = pd.read_pickle(DATA_FOLDER + 'test_data.csv')

    # Label transform handed to the datasets: positive class -> 1, rest -> 0.
    DATAFRAME_LAMBDA = lambda a: 1 if a == class_num else 0

    # Datasets and loaders.
    train_dataset = CropDataset(train_df, bands=BANDS, crop_size=INPUT_SIZE, classf=DATAFRAME_LAMBDA, autobalance=class_num)
    test_dataset = CropDataset(test_df, bands=BANDS, crop_size=INPUT_SIZE, classf=DATAFRAME_LAMBDA, need_id=True)

    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    # Network with a single output unit (binary problem).
    Model = SCNet(input_size=INPUT_SIZE, num_inputs=NUM_CHANNELS, in_channels=1, out_channels=CHANNELS, num_classes=1).to(DEVICE)

    # Criterion and optimizer.
    fl = Binary_focal_loss(gamma=2.0, alpha=1.0)
    optimizer = torch.optim.Adam(Model.parameters(), lr=ADAM_LR)

    # Training loop.
    for epoch_num in range(NUM_EPOCHS):
        for X, Y in train_dataloader:
            optimizer.zero_grad()
            X, Y = X.to(DEVICE), Y.to(DEVICE)
            # Min-max scaling using the minimum and maximum of the whole batch.
            X = (X - X.min()) / (X.max() - X.min())
            Y_pred = Model(X)
            # NOTE(review): the loss is called as (target, prediction); confirm
            # this matches the signature of Binary_focal_loss.
            loss = fl(Y.float(), Y_pred.squeeze(1))
            loss.backward()
            optimizer.step()

    torch.save(Model.state_dict(), f'Model--{class_num}.pt')

    Model.eval()

    # Score the test set.
    # NOTE(review): training batches are min-max scaled above, but the test
    # batches are fed to the model unscaled. train_second_level_model feeds its
    # batches unscaled as well, so the two inference paths agree with each
    # other; change both together if scaling at inference time is wanted.
    F_IDS = []
    PREDS = []
    # Gradients are not needed for inference; no_grad avoids building the
    # autograd graph for every test batch.
    with torch.no_grad():
        for X, F_ID in test_dataloader:
            Y_pred = Model(X.to(DEVICE))
            F_IDS.extend(F_ID.numpy())
            PREDS.extend(Y_pred.squeeze(1).cpu().numpy())

    return Model, F_IDS, PREDS

In [6]:
def train_second_level_model(model_list, class_list, test_df, M_class, boost_settings, settings):
    """Fit a stacking classifier on first-level model outputs and score ``test_df``.

    For every entry ``class_list[i]`` the model ``model_list[i]`` is run over
    the training data; its outputs become column ``feature_i`` of the
    second-level training frame. ``M_class(**boost_settings)`` is then fitted
    on those features against the original ``'class'`` labels and used to
    predict class probabilities for ``test_df``.

    Parameters
    ----------
    model_list : first-level torch models, aligned with ``class_list``.
        Each model is switched to eval mode by this function.
    class_list : positive-class label of each first-level model.
    test_df : frame with a ``'field_id'`` column followed by the columns
        ``'feature_0'`` onwards (everything from ``'feature_0'`` to the last
        column is used as model input).
    M_class : classifier class exposing ``fit`` and ``predict_proba``.
    boost_settings : keyword arguments for ``M_class``.
    settings : dict providing ``'DATA_FOLDER'``, ``'BANDS'``, ``'INPUT_SIZE'``,
        ``'BATCH_SIZE'`` and ``'DEVICE'``.

    Returns
    -------
    pandas.DataFrame with a ``'Field_ID'`` column followed by one
    ``'Crop_ID_<k>'`` column per column of the predicted probability matrix
    (``k`` counts from 1 by position).

    Notes
    -----
    NOTE(review): the stacking features are predictions on ``train_data.csv``.
    If the first-level models were fitted on that same file these are
    in-sample predictions; consider out-of-fold predictions instead.
    """
    # NOTE: the file is a pandas pickle despite its .csv suffix; only load
    # pickles from a trusted source.
    train_df = pd.read_pickle(settings['DATA_FOLDER'] + 'train_data.csv')
    Y = list(train_df['class'])
    PREDS = []
    for i, class_num in enumerate(class_list):
        # First-level model belonging to this class.
        Model = model_list[i]
        # Make sure inference-time behaviour is used even if the caller passed
        # a model that is still in training mode.
        Model.eval()

        # Label transform for the dataset: positive class -> 1, rest -> 0.
        DATAFRAME_LAMBDA = lambda a: 1 if a == class_num else 0

        # Unshuffled loader so that predictions stay aligned with Y.
        train_dataset = CropDataset(train_df, bands=settings['BANDS'], crop_size=settings['INPUT_SIZE'], classf=DATAFRAME_LAMBDA)
        train_dataloader = DataLoader(train_dataset, batch_size=settings['BATCH_SIZE'], shuffle=False, num_workers=0)

        # Predict without tracking gradients: this pass covers the whole
        # training set once per model and never calls backward().
        N_PREDS = []
        with torch.no_grad():
            for X, _ in train_dataloader:
                outputs = Model(X.to(settings['DEVICE']))
                N_PREDS.extend(outputs.squeeze(1).cpu().numpy())
        PREDS.append(N_PREDS)

    # Assemble the second-level training frame: label + one column per model.
    colnames = ['class', *[f'feature_{i}' for i in range(len(model_list))]]
    train_dataframe = pd.DataFrame(list(zip(Y, *PREDS)), columns=colnames)

    # Release the intermediate objects before fitting the booster.
    del PREDS
    del train_dataset
    del train_dataloader

    # Train the second-level model.
    gbm = M_class(**boost_settings)
    gbm.fit(train_dataframe.loc[:, 'feature_0':].values, train_dataframe.loc[:, 'class'].values)

    # Predict the test frame. A plain array is passed, mirroring fit(), so the
    # features are matched purely by column position in both calls.
    PREDS = gbm.predict_proba(test_df.loc[:, 'feature_0':].values)

    # Assemble the final frame. The number of probability columns is taken
    # from the prediction itself instead of being hard-coded.
    # NOTE(review): names are positional (Crop_ID_1 is the first probability
    # column); if a label is absent from the training data the columns shift.
    colnames = ['Field_ID', *[f'Crop_ID_{i + 1}' for i in range(PREDS.shape[1])]]
    final_df = pd.DataFrame(list(zip(list(test_df.loc[:, 'field_id']), *PREDS.T.tolist())), columns=colnames)
    return final_df

In [7]:
#First level: one binary (one-vs-rest) network per class label 1..7
class_list = list(range(1, 8))
m_list, predictions = [], []

for i in class_list:
    Model, F_IDS, preds = train_model(i, **settings)
    m_list += [Model]
    predictions += [preds]

#Test frame: the ids returned by the last call plus one prediction column per model
colnames = ['field_id'] + [f'feature_{k}' for k in range(len(m_list))]
test_dataframe = pd.DataFrame(list(zip(F_IDS, *predictions)), columns=colnames)

#Drop the raw lists to free RAM; their content now lives in test_dataframe
del F_IDS, predictions

#Second level: fit the stacking model and get the frame with its predictions
final_df = train_second_level_model(
    m_list,
    class_list,
    test_dataframe,
    model_class,
    boost_settings,
    settings,
)



In [15]:
#Collapse to one row per field: mean of every other column over the rows sharing a Field_ID
sub_df = final_df.groupby(by='Field_ID').mean()

In [29]:
#Find the Field_IDs listed in the sample submission that have no prediction row
sub_fids = pd.read_csv('SampleSubmission.csv')['Field_ID']
out_fids = sub_fids[~sub_fids.isin(sub_df.index)].values

#Give every missing id a random row that sums to 1 (ideally there are none).
#A dedicated generator seeded with the notebook seed keeps the submission
#reproducible across Restart & Run All; the row width follows sub_df instead
#of being hard-coded.
fill_rng = np.random.default_rng(seed)
n_classes = sub_df.shape[1]
for idx in out_fids:
    randrow = fill_rng.random((1, n_classes))
    randrow = randrow / np.sum(randrow)

    sub_df.loc[idx] = randrow[0]

Unnamed: 0_level_0,Crop_ID_1,Crop_ID_2,Crop_ID_3,Crop_ID_4,Crop_ID_5,Crop_ID_6,Crop_ID_7
Field_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
807,0.496463,0.201916,0.032132,0.160540,0.052212,0.045939,0.010798
1070,0.509048,0.209402,0.031984,0.131571,0.052153,0.047022,0.018820
1105,0.545616,0.167482,0.026375,0.159058,0.041659,0.035196,0.024615
1121,0.186545,0.172855,0.015059,0.207878,0.184777,0.180249,0.052637
1260,0.508940,0.211539,0.024682,0.131717,0.067764,0.033548,0.021810
...,...,...,...,...,...,...,...
4785,0.124357,0.162224,0.137466,0.170029,0.109889,0.179915,0.116121
4788,0.071099,0.037915,0.149457,0.267076,0.007908,0.128107,0.338438
4790,0.142248,0.154683,0.042405,0.189019,0.060790,0.165834,0.245021
4793,0.135378,0.188480,0.178935,0.126478,0.039746,0.169319,0.161665


In [None]:
#Write the submission file; the frame's index is written as the first column
sub_df.to_csv(path_or_buf='final-submission.csv')