In [None]:
import torch 
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn import  ensemble, preprocessing, metrics
import seaborn
import random
import pandas as pd
from torch.utils.data import Dataset,DataLoader
from torch import nn
import os
BASEPATH="../input/agriculture-master-competition"

In [None]:
class CFG():
    """Hyper-parameters and runtime settings shared by the notebook cells."""
    def __init__(self):
        self.BATCHSIZE=2000     # batch size handed to every DataLoader
        self.EPOCH=200          # number of passes of the training loop
        self.savescore=97       # a checkpoint is written only when the validation score exceeds this
        self.hiddenlayers=2     # number of hidden blocks in testModel
        self.hiddennum=1000     # width of every hidden layer in testModel
        # Fall back to the CPU so the notebook still runs on machines without a GPU.
        self.DEVICE=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg=CFG()

In [None]:
def seed_torch(seed=2021):
    """Seed every random number generator this notebook draws from.

    Covers Python's ``random`` (used by ``random.shuffle``), NumPy (used by
    pandas ``sample``), and PyTorch on CPU and on all CUDA devices, and
    switches cuDNN to deterministic kernels.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
#seed_torch()

## Load Data

In [None]:
def read_csvfile(filepath):
    """Load the CSV found at ``filepath`` and return it as a pandas DataFrame."""
    return pd.read_csv(filepath)
# Load the training table. Column 0 is used neither as a feature nor as a label,
# columns 1-19 are the model inputs and every column from position 20 onward
# is treated as a label.
input_df = read_csvfile(os.path.join(BASEPATH,"train_data.csv"))
labelcolumns = list(input_df.columns[20:])
featurecolumns = list(input_df.columns[1:20])

## Pre-Processing

In [None]:
def removeOutlier(input_df):
    """Return the rows of ``input_df`` that pass the sensor sanity filters.

    Rows whose ``d.wind_speed`` equals -9999 are removed first. Then, one
    column after another, only rows whose value is strictly below three times
    that column's standard deviation (computed on the rows still remaining)
    are kept. Rows holding NaN in a filtered column are dropped as well,
    because the comparison is False for them.

    NOTE(review): the cut-off is ``3 * std`` of the raw values, not
    ``mean + 3 * std`` — confirm this is the intended rule.
    """
    kept = input_df.loc[input_df["d.wind_speed"] != -9999]
    for column in ("d.photometric", "d.outside_photometric", "d.radiometric"):
        upper_bound = kept[column].std() * 3
        kept = kept.loc[kept[column] < upper_bound]
    return kept
# Rebind input_df to the filtered table; every later cell works on the filtered rows.
input_df = removeOutlier(input_df)

In [None]:
def changeTimeFormat(input_df):
    """Replace the text in ``d.log_time`` with a numeric encoding, in place.

    Each value is split on a space. The second and third '/'-separated fields
    of the first token are concatenated with the second token stripped of ':'
    characters, parsed as an integer and divided by 1e3.
    (Presumably the text looks like 'YYYY/MM/DD HH:MM:SS', so the year is
    discarded — confirm against the data.)

    The first converted value is printed, and the same (mutated) frame is
    returned.
    """
    def _encode(stamp):
        tokens = stamp.split(' ')
        date_fields = tokens[0].split('/')
        digits = date_fields[1] + date_fields[2] + tokens[1].replace(':', '')
        return int(digits) / 1e3

    input_df["d.log_time"] = [_encode(stamp) for stamp in input_df["d.log_time"].tolist()]
    print(input_df["d.log_time"].iloc[0])
    return input_df
# Convert the timestamp text to a number so the column can be used as a model input.
input_df=changeTimeFormat(input_df)

In [None]:
def logData(df,columnsname):
    """Log-transform one column of ``df`` in place and return ``df``.

    Exact zeros are first replaced by 0.1 so the logarithm stays finite. Any
    NaN produced by the logarithm (for example from negative or missing
    inputs) is then replaced by 0.
    """
    column = df[columnsname]
    without_zeros = column.mask(column == 0, 0.1)
    df[columnsname] = np.log(without_zeros).fillna(0)
    return df
# Log-scale the wind direction, then show summary statistics of all feature columns
# (the describe() table is the cell's displayed output).
input_df = logData(input_df,"d.wind_direction")
input_df[featurecolumns].describe()

In [None]:
def replacezerovalues(df,columnsname):
    """Impute the zero entries of one feature column with a random forest.

    A ``RandomForestRegressor`` is fitted on the rows where ``columnsname`` is
    non-zero, using the remaining ``featurecolumns`` as predictors. Its
    predictions then replace the zero entries only; measured (non-zero)
    values are left untouched.

    Args:
        df: table containing every column listed in ``featurecolumns``.
        columnsname: name of the feature column to impute.

    Returns:
        ``df`` with the zero entries of ``columnsname`` replaced. The frame is
        returned unchanged when there is nothing to impute or nothing to
        learn from (no zero rows, or only zero rows).
    """
    zero_rows = df[columnsname] == 0
    if not zero_rows.any() or zero_rows.all():
        return df

    predictors = [name for name in featurecolumns if name != columnsname]
    rf = RandomForestRegressor(n_estimators = 50, random_state = 2021)
    rf.fit(df.loc[~zero_rows, predictors], df.loc[~zero_rows, columnsname])

    # Predictions are floats; widen the column first so the assignment does not
    # depend on implicit dtype up-casting.
    df[columnsname] = df[columnsname].astype(float)
    df.loc[zero_rows, columnsname] = rf.predict(df.loc[zero_rows, predictors])
    return df
# Columns whose zero readings are imputed. Only the wind speed is processed;
# the wider list and the follow-up log transform are kept below, disabled.
#columnsnames=["d.wind_speed","d.radiometric","d.outside_photometric","d.photometric"]
columnsnames=["d.wind_speed"]
for idx,columnsname in enumerate(columnsnames):
    input_df = replacezerovalues(input_df,columnsname)
#    input_df = logData(input_df,columnsname)

In [None]:
def showactuatorplot(df):
    """Draw one bar chart per label column showing how often each value occurs.

    For every column in ``labelcolumns`` the column name and the first ten
    counts (ordered by label value) are printed, and the counts are plotted
    on a grid that is four axes wide.

    Args:
        df: table containing every column listed in ``labelcolumns``.
    """
    ncols = 4
    # Keep the 3x4 layout for up to twelve labels and grow the grid beyond that.
    nrows = max(3, -(-len(labelcolumns) // ncols))
    fig,ax =  plt.subplots(nrows,ncols,figsize=(17,10),squeeze=False)
    for plotindex,colname in enumerate(labelcolumns):
        # value_counts() tallies in a single pass; sort_index() orders by label value.
        counts = df[colname].value_counts().sort_index()
        x = counts.index.tolist()
        y = counts.tolist()
        print(colname)
        print(y[:10])

        axis = ax[plotindex//ncols,plotindex%ncols]
        axis.bar(x,y)
        axis.set_title(colname)
    fig.show()
#showactuatorplot(input_df)

In [None]:
def dataNormalized(feature_df,zeromean=True):
    """Return a normalised copy of ``feature_df``.

    Args:
        feature_df: numeric feature table.
        zeromean: when True each column is standardised
            (``(x - mean) / std``); when False each column is min-max scaled
            to the range [0, 1].

    Returns:
        A new DataFrame with the scaled columns. Columns that are constant
        (zero spread) become 0 instead of NaN, and ``d.rainfall_detect`` is
        always set to 0.
    """
    if zeromean:
        center = feature_df.mean()
        spread = feature_df.std()
    else:
        center = feature_df.min()
        spread = feature_df.max() - center
    # A constant column has zero spread; dividing by 1 instead maps it to 0
    # rather than producing 0/0 = NaN.
    spread = spread.replace(0, 1)
    feature_df = (feature_df - center) / spread
    feature_df["d.rainfall_detect"]=0
    return feature_df
# Min-max scale the feature columns (zeromean=False) and write them back.
# NOTE(review): the statistics are taken over the whole table, i.e. before the
# train/validation split performed in a later cell — confirm this is acceptable.
normalized_feature = dataNormalized(input_df[featurecolumns],False)
input_df[featurecolumns] = normalized_feature

In [None]:
def _print_label_counts(train_df,valid_df):
    """Print, per label column: train zeros, valid zeros, train ones, valid ones."""
    for lc in labelcolumns:
        print(len(train_df[train_df[lc] == 0]),len(valid_df[valid_df[lc] == 0]),len(train_df[train_df[lc] == 1]),len(valid_df[valid_df[lc] == 1]))


def balanceDataset(train_df,valid_df,minnum,random_state=None):
    """Move rows between the two splits so rare label values reach validation.

    For every label column the value (0 or 1) that is rarer in ``valid_df`` is
    determined. If validation holds less than a ``minnum`` share of all rows
    carrying that value, randomly chosen rows with that value are moved from
    training to validation. When validation has enough rows of the opposite
    value, the same number of those is moved back to training so the split
    sizes stay roughly constant.

    Args:
        train_df: training split; its index labels must not overlap with
            those of ``valid_df``.
        valid_df: validation split.
        minnum: minimum share (0-1) of each rare label value that should end
            up in the validation split.
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
            the selection reproducible.

    Returns:
        Tuple ``(train_df, valid_df)`` with the re-assigned rows. Label
        counts are printed before and after the re-assignment.
    """
    _print_label_counts(train_df,valid_df)
    print("===")
    for lc in labelcolumns:
        # l is the label value that is in the minority inside the validation split.
        l=0 if len(valid_df[valid_df[lc] == 1]) > len(valid_df[valid_df[lc] == 0]) else 1
        rare_train = train_df[train_df[lc] == l]
        trainn = len(rare_train)
        validn=len(valid_df[valid_df[lc] == l])
        if trainn > 0 and validn+1 < minnum*(trainn+validn):
            moven = (trainn+validn)*minnum-validn
            move_rows = rare_train.sample(frac=moven/trainn,random_state=random_state)
            valid_df=pd.concat([valid_df,move_rows])
            # Drop by the rows' own index labels rather than by the values of
            # an "index" data column.
            train_df=train_df.drop(move_rows.index)

            common_valid = valid_df[valid_df[lc] == (1-l)]
            if len(common_valid) > len(move_rows):
                back_rows = common_valid.sample(frac=len(move_rows)/len(common_valid),random_state=random_state)
                train_df=pd.concat([train_df,back_rows])
                valid_df=valid_df.drop(back_rows.index)
    _print_label_counts(train_df,valid_df)
    return train_df,valid_df


def splitDataframe(df,train_sample):
    """Shuffle ``df`` and split it into a training and a validation part.

    The rows are shuffled with a fixed seed (2021), the first
    ``train_sample`` share becomes the training split and the rest the
    validation split. Both parts are then passed through ``balanceDataset``
    with a minimum validation share of 0.2.

    Args:
        df: full table to split.
        train_sample: fraction (0-1) of rows assigned to the training split.

    Returns:
        Tuple ``(train_df, valid_df)``.
    """
    shuffle_df = df.sample(frac=1,random_state=2021)
    cut = int(train_sample*len(shuffle_df))
    # Positional slicing replaces np.split, which goes through the deprecated
    # DataFrame.swapaxes when handed a DataFrame.
    train_df = shuffle_df.iloc[:cut]
    valid_df = shuffle_df.iloc[cut:]
    train_df,valid_df=balanceDataset(train_df,valid_df,0.2)
    return train_df,valid_df
            
# 80 % of the rows go to training; the remainder is held out for validation.
train_df,valid_df=splitDataframe(input_df,0.8)

In [None]:
def trainDataframeBalance(df):
    """Build an over-sampled training table from ``df``.

    The result stacks, in order:
      1. for each of the first six label columns, every row where it is 0;
      2. for each of the last two label columns, every row where it is 1;
      3. for each of the first six label columns (visited in random order),
         extra randomly sampled rows where it is 1, added whenever the table
         built so far holds more than twice as many zeros as ones for it.

    Rows can appear several times, so index labels are not unique.

    Args:
        df: training table containing every column in ``labelcolumns``.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    head_labels = labelcolumns[:6]
    tail_labels = labelcolumns[-2:]

    # Gather every slice first and concatenate once instead of growing the
    # frame with the removed DataFrame.append.
    pieces = [df[df[cn] == 0] for cn in head_labels]
    pieces += [df[df[cn] == 1] for cn in tail_labels]
    balance_df = pd.concat(pieces) if pieces else df.iloc[0:0].copy()

    # head_labels is a slice copy, so shuffling it leaves labelcolumns untouched.
    random.shuffle(head_labels)
    for cn in head_labels:
        n_zero = len(balance_df[balance_df[cn] == 0])
        n_one = len(balance_df[balance_df[cn] == 1])
        if 2*n_one<n_zero:
            extra = df[df[cn] == 1].sample(frac=(n_zero-n_one)/(n_zero*2))
            balance_df = pd.concat([balance_df,extra])
    return balance_df
#trainDataframeBalance(train_df)
        

In [None]:
def getBalanceTrainingDataLoader(df):
    """Create a dataset and a shuffling DataLoader over a re-balanced copy of ``df``.

    Args:
        df: training table.

    Returns:
        Tuple ``(trainDataset, trainDataLoader)`` built from the output of
        ``trainDataframeBalance(df)``, batched with ``cfg.BATCHSIZE``.
    """
    balancetrain_df = trainDataframeBalance(df)
    # Feed the balanced frame to the dataset; building it from ``df`` would
    # discard the balancing computed on the line above.
    trainDataset=AgricultureDataset(balancetrain_df)
    trainDataLoader = DataLoader(trainDataset,num_workers=5,shuffle=True,batch_size = cfg.BATCHSIZE)
    return trainDataset,trainDataLoader


In [None]:
class AgricultureDataset(Dataset):
    """Torch dataset serving (features, labels) float32 tensor pairs from a DataFrame.

    The columns named in ``featurecolumns`` form the input vector and the
    columns named in ``labelcolumns`` form the target vector.
    """
    def __init__(self, df):
        """Keep ``df`` and convert its feature/label columns to tensors once.

        Args:
            df: table containing every column in ``featurecolumns`` and
                ``labelcolumns``; those columns must be convertible to float32.
        """
        super().__init__()
        self.dataframe = df
        self.feature_df = df[featurecolumns]
        self.label_df = df[labelcolumns]
        # Convert once here so __getitem__ is a cheap tensor index instead of a
        # pandas row lookup plus a Series-to-tensor conversion per sample.
        self._features = torch.tensor(self.feature_df.to_numpy(dtype="float32"))
        self._labels = torch.tensor(self.label_df.to_numpy(dtype="float32"))

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.dataframe.shape[0]

    def __getitem__(self, index: int):
        """Return ``(features, labels)`` for the row at position ``index``."""
        return self._features[index],self._labels[index]
# Initial training loader (the training loop further down rebuilds it every epoch)
# and a fixed, unshuffled loader over the validation split.
trainDataset,trainDataLoader = getBalanceTrainingDataLoader(train_df)
validDataset = AgricultureDataset(valid_df)
validDataLoader = DataLoader(validDataset,num_workers=5,shuffle=False,batch_size = cfg.BATCHSIZE)

In [None]:
class swish(nn.Module):
    """Activation module computing ``x * sigmoid(x)`` element-wise."""
    def __init__(self):
        super().__init__()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the input scaled by its own sigmoid."""
        gate = self.sigmoid(x)
        return x * gate

class testModel(nn.Module):
    """Fully connected multi-label classifier with sigmoid outputs.

    Layout: Linear -> BatchNorm1d -> swish, followed by ``hiddenlayers``
    blocks of Linear -> BatchNorm1d -> ReLU, followed by Linear -> Sigmoid.
    """
    def __init__(self,input_size,output_size,hiddenlayers,hiddennum):
        """Create the layers.

        Args:
            input_size: number of input features.
            output_size: number of outputs (one per label).
            hiddenlayers: how many hidden blocks follow the input stem.
            hiddennum: width of the stem and of every hidden block.
        """
        super().__init__()
        self.act = swish()
        stem_layers = [
            nn.Linear(input_size, hiddennum),
            nn.BatchNorm1d(hiddennum),
            self.act,
        ]
        self.input_stem = nn.Sequential(*stem_layers)
        self.hiddens = nn.Sequential(
            *[self._hidden_block(hiddennum) for _ in range(hiddenlayers)]
        )
        head_layers = [nn.Linear(hiddennum, output_size, bias=True), nn.Sigmoid()]
        self.output = nn.Sequential(*head_layers)

    @staticmethod
    def _hidden_block(width):
        """One Linear -> BatchNorm1d -> ReLU block of the given width."""
        return nn.Sequential(
            nn.Linear(width, width),
            nn.BatchNorm1d(width),
            nn.ReLU(inplace=True),
        )

    def forward(self,x):
        """Map a (batch, input_size) tensor to per-label outputs in (0, 1)."""
        stem_out = self.input_stem(x)
        hidden_out = self.hiddens(stem_out)
        return self.output(hidden_out)
# One input per feature column, one sigmoid output per label column; the
# network is moved to the configured device.
model = testModel(len(featurecolumns),len(labelcolumns),cfg.hiddenlayers,cfg.hiddennum).to(cfg.DEVICE)

In [None]:
# NOTE(review): `weight` is built here but never used — the loss below is
# created with weight=None. Presumably these were meant as per-label loss
# weights; confirm before wiring them in (the tensor is integer-typed and has
# 11 entries).
weight=torch.tensor([1,1,1,1,1,2,2,5,5,2,2]).to(cfg.DEVICE)
# Binary cross-entropy on the model's sigmoid outputs; predictions and targets
# must both be float tensors.
criterion = nn.BCELoss(weight=None, reduction='mean')#input:FloatTensor target:FloatTensor
#criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
# mode='max': the scheduler is stepped with a score where higher is better, and
# multiplies the learning rate by 0.1 after 10 steps without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10, verbose=True, threshold=0.0001, cooldown=1)

In [None]:
def calcaulateMacroF1(allpred,allans,allpredacc,nclasses,rou=3):
    """Compute a macro F1 score plus per-class recall and precision, in percent.

    Args:
        allpred: per-class count of positive predictions.
        allans: per-class count of positive ground-truth entries.
        allpredacc: per-class count of correct positive predictions.
        nclasses: number of classes to read from the three sequences.
        rou: number of decimals every returned value is rounded to.

    Returns:
        Tuple ``(macro_f1, recalls, precisions)``. ``macro_f1`` combines the
        class-averaged recall and the class-averaged precision; a class with
        a zero denominator contributes 0.
    """
    def _percent(hits, total):
        # A class without any positives scores 0 instead of dividing by zero.
        return 0 if total == 0 else 100*hits/total

    recalls = []
    precisions = []
    for cls in range(0, nclasses):
        recalls.append(_percent(allpredacc[cls], allans[cls]))
        precisions.append(_percent(allpredacc[cls], allpred[cls]))

    avg_recalls = float(sum(recalls) / nclasses)
    avg_precisions = float(sum(precisions) / nclasses)
    # Small constant that keeps the denominator non-zero when both averages are 0.
    beta=0.000001
    numerator = (2+beta)*(avg_recalls*avg_precisions)
    denominator = (avg_recalls+avg_precisions)+beta
    macro_f1 = round(numerator/denominator,rou)
    rounded_precisions = [round(p,rou) for p in precisions]
    rounded_recalls = [round(r,rou) for r in recalls]
    return macro_f1,rounded_recalls,rounded_precisions

In [None]:
def _zero_positive_mask():
    """Return a bool tensor with one entry per label column.

    Labels whose name is among the last two columns of ``input_df`` are scored
    with 0 as the positive class (entry True); all other labels are scored
    with 1 as the positive class (entry False).
    """
    flipped = set(input_df.columns[-2:])
    return torch.tensor([name in flipped for name in labelcolumns], dtype=torch.bool)


def _progress_line(itter_count,nbatches,macro_f1,acc,seen,totalloss):
    """Format the single-line progress report shared by training and evaluation."""
    return '\r{}/{} f1-score: {} acc:{} loss:{}'.format(
        "%03d"%itter_count,
        nbatches,
        "%.3f"%macro_f1,
        "%.2f"%(acc/max(seen,1)),
        "%.3f"%round(float(totalloss/max(itter_count,1)),3))


def _run_epoch(dataloader,model,criterion,optimizer=None):
    """Run one pass over ``dataloader`` and collect loss and score statistics.

    With an optimizer the model parameters are updated after every batch;
    without one the pass runs with gradient tracking disabled. The caller is
    responsible for putting the model into train or eval mode.

    Returns:
        Tuple ``(macro_f1, recalls, precisions, summary_line)`` where
        ``summary_line`` is the final progress report for the whole pass.
    """
    training = optimizer is not None
    nlabels = len(labelcolumns)
    flip = _zero_positive_mask()
    totalacc = torch.zeros(nlabels, dtype=torch.long)   # correct positive predictions per label
    totalans = torch.zeros(nlabels, dtype=torch.long)   # positive targets per label
    totalpred = torch.zeros(nlabels, dtype=torch.long)  # positive predictions per label
    acc = 0          # samples whose whole label vector was predicted correctly
    seen = 0         # samples processed so far
    totalloss = 0.0  # sum of the per-batch mean losses
    itter_count = 0
    # Defaults so the summary is well-defined even for an empty dataloader.
    macro_f1,recalls,precisions = 0.0,[0]*nlabels,[0]*nlabels

    with torch.set_grad_enabled(training):
        for x,y in dataloader:
            itter_count += 1
            x = x.to(cfg.DEVICE)
            y = y.to(cfg.DEVICE)
            preds = model(x)
            loss = criterion(preds,y)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            totalloss += loss.item()

            # Score on the CPU with whole-batch tensor operations.
            # Targets are assumed to be 0/1 valued.
            pred_bin = preds.detach().cpu() > 0.5
            true_bin = y.detach().cpu().to(torch.int64) > 0
            acc += int((pred_bin == true_bin).all(dim=1).sum())
            seen += len(pred_bin)
            # XOR with the mask turns "is 0" into "is positive" for flipped labels.
            pos_pred = pred_bin ^ flip
            pos_true = true_bin ^ flip
            totalpred += pos_pred.sum(dim=0)
            totalans += pos_true.sum(dim=0)
            totalacc += (pos_pred & pos_true).sum(dim=0)

            macro_f1,recalls,precisions = calcaulateMacroF1(totalpred.tolist(),totalans.tolist(),totalacc.tolist(),nlabels)
            print(_progress_line(itter_count,len(dataloader),macro_f1,acc,seen,totalloss),end='',flush=True)

    summary = _progress_line(itter_count,len(dataloader),macro_f1,acc,seen,totalloss)
    return macro_f1,recalls,precisions,summary


def evaluation(dataloader,model):
    """Score ``model`` on ``dataloader`` without updating it.

    The model is switched to eval mode and run without gradient tracking.
    When the macro F1 score exceeds ``cfg.savescore`` the weights are saved to
    ``"<score>.pkl"`` and ``cfg.savescore`` is raised to that score. The
    global ``scheduler`` is stepped with the score, and the score, exact-match
    accuracy, mean loss, and per-label recall/precision are printed.

    Returns:
        The macro F1 score of this pass.
    """
    # eval() makes BatchNorm use its running statistics instead of updating
    # them from validation batches.
    model.eval()
    macro_f1,recalls,precisions,summary = _run_epoch(dataloader,model,criterion)
    if cfg.savescore<macro_f1:
        torch.save(model.state_dict(), f"{macro_f1}.pkl")
        cfg.savescore = macro_f1
    scheduler.step(macro_f1)
    print(summary,end='',flush=True)
    print('\nrecall: {} \nprecis: {}'.format(recalls,precisions))
    return macro_f1

def train_one_eopch(dataloader,model,criterion,optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: iterable of ``(features, labels)`` batches supporting ``len``.
        model: network to optimise; switched to train mode here.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimizer stepped once per batch.

    Returns:
        The macro F1 score accumulated over the pass. Running and final
        statistics are printed in the same format as ``evaluation``.
    """
    model.train()
    macro_f1,recalls,precisions,summary = _run_epoch(dataloader,model,criterion,optimizer)
    print(summary,end='',flush=True)
    print('\nrecall: {} \nprecis: {}'.format(recalls,precisions))
    return macro_f1
        
# Main loop: every epoch rebuilds the training loader, trains for one pass and
# then scores the model on the validation loader (evaluation() also steps the
# learning-rate scheduler and may save a checkpoint).
for e in range(cfg.EPOCH):
    print(f"\nEPOCH:{e+1}")
    trainDataset,trainDataLoader = getBalanceTrainingDataLoader(train_df)
    train_one_eopch(trainDataLoader,model,criterion,optimizer)
    print('')
    evaluation(validDataLoader,model)