In [None]:
import torch 
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn import  ensemble, preprocessing, metrics
import matplotlib
import seaborn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from torch.utils.data import Dataset,DataLoader
from xgboost import XGBClassifier
from torch import nn
import os
BASEPATH="../input/agriculture-master-competition"

In [None]:
def seed_torch(seed=2021):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to all generators. Defaults to 2021.
    """
    # Imported locally because the notebook's import cell does not import `random`.
    import random

    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); string hashing in the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Covers every GPU, not just the current one; ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels between runs, so switch it off.
    torch.backends.cudnn.benchmark = False
seed_torch()

In [None]:
def read_csvfile(filepath):
    """Read the CSV at ``filepath`` with pandas and return the resulting DataFrame."""
    return pd.read_csv(filepath)
# Load the training table and derive the column groups purely by position.
input_df = read_csvfile(os.path.join(BASEPATH, "train_data.csv"))
# Columns 1..19 are the candidate model inputs (column 0 is skipped);
# "d.rainfall_detect" is excluded from them.
featurecolumns = input_df.columns[1:20].tolist()
featurecolumns.remove("d.rainfall_detect")
# Everything from column 20 onward is treated as a prediction target.
labelcolumns = input_df.columns[20:].tolist()

In [None]:
def removeOutlier(input_df):
    """Return the rows of ``input_df`` that pass the sentinel and spread filters.

    Rows whose "d.wind_speed" equals -9999 are dropped first. Then, one column
    at a time ("d.photometric", "d.outside_photometric", "d.radiometric"), only
    rows whose value is strictly below three standard deviations of that column
    are kept. Each standard deviation is computed on the rows that survived the
    previous step. The input frame itself is not modified.

    NOTE(review): the cut-off is ``3 * std`` of the raw values, not
    ``mean + 3 * std`` - confirm this is the intended rule.
    """
    kept = input_df.loc[input_df["d.wind_speed"] != -9999]
    for column in ("d.photometric", "d.outside_photometric", "d.radiometric"):
        values = kept[column]
        kept = kept.loc[values < values.std() * 3]
    return kept
# Filter the training table. `input_df` is rebound to the filtered result, so
# re-running this cell filters the already-filtered rows again.
input_df = removeOutlier(input_df)

In [None]:
def changeTimeFormat(input_df):
    """Replace the "d.log_time" strings with a numeric month/day/time code.

    Each value is expected to look like ``<year>/<month>/<day> <clock>``. The
    zero-padded month and day are concatenated with the clock digits (colons
    removed), read as an integer and divided by 1e5. The year is discarded.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    def _encode(stamp):
        tokens = stamp.split(' ')
        date_fields = tokens[0].split('/')
        digits = date_fields[1].zfill(2) + date_fields[2].zfill(2) + tokens[1].replace(':', '')
        return int(digits) / 1e5

    input_df["d.log_time"] = list(map(_encode, input_df["d.log_time"].tolist()))
    return input_df
# Convert the timestamp column to its numeric code. The conversion splits
# strings, so running this cell a second time on the converted column fails.
input_df=changeTimeFormat(input_df)

In [None]:
def logData(df,columnsname):
    """Log-transform one column of ``df`` in place and return ``df``.

    Exact zeros are first replaced by 0.1 so the logarithm stays finite; any
    NaN left after taking the natural log is replaced by 0.

    Args:
        df: Frame holding the column; it is modified directly.
        columnsname: Name of the column to transform.
    """
    column = df[columnsname]
    shifted = column.mask(column == 0, 0.1)
    df[columnsname] = np.log(shifted).fillna(0)
    return df
# Log-transform the wind direction column of the training table.
input_df = logData(input_df,"d.wind_direction")
# Summary statistics of the feature columns after the transform (cell output).
input_df[featurecolumns].describe()

In [None]:
# Distribution of the log-transformed wind direction.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; `histplot`/`displot` are the replacements - confirm the installed version.
sns.distplot(input_df["d.wind_direction"])

In [None]:
# Single shared model: fitted by `replacezerovalues(..., train=True)` and reused
# by later calls with `train=False`. It only remembers the most recent fit.
xgb = XGBClassifier(n_estimators=100)
def replacezerovalues(df,columnsname,train=True):
    """Fill the zero entries of one feature column with model predictions.

    The remaining feature columns are used as predictors. Rows whose value in
    ``columnsname`` is non-zero serve as training examples; rows whose value is
    exactly zero receive the model's prediction. Non-zero values are left
    untouched.

    Args:
        df: Frame containing every name in ``featurecolumns``; it is modified
            in place.
        columnsname: Feature column whose zeros are replaced.
        train: When True the shared ``xgb`` model is (re)fitted on ``df``
            first. When False the model from the last fit is used as is.

    Returns:
        The same ``df`` object with the zeros of ``columnsname`` replaced.
    """
    predictors = [name for name in featurecolumns if name != columnsname]
    is_zero = df[columnsname] == 0

    if train:
        known = df.loc[~is_zero]
        xgb.fit(np.array(known[predictors]), np.array(known[columnsname]))

    # Nothing to impute - skip the predict call on an empty array.
    if is_zero.any():
        df.loc[is_zero, columnsname] = xgb.predict(np.array(df.loc[is_zero, predictors]))
    return df
# Feature columns whose values are re-estimated by `replacezerovalues`.
columnsnames=["d.wind_speed"]#,"d.radiometric"]#,"d.outside_photometric","d.photometric"]
#columnsnames=["d.wind_speed"]
# Two axes per column: left = distribution before, right = distribution after.
# NOTE(review): the grid has one row more than there are columns; presumably
# this keeps `ax` two-dimensional when only one column is listed - confirm.
fig ,ax = plt.subplots(len(columnsnames)+1,2,figsize=(20,20))
for idx,columnsname in enumerate(columnsnames):
    # Plot first: the call below rewrites the column inside `input_df`.
    sns.distplot(input_df[columnsname],ax=ax[idx,0])
    input_df = replacezerovalues(input_df,columnsname)
    #input_df = logData(input_df,columnsname)
    sns.distplot(input_df[columnsname],ax=ax[idx,1])

In [None]:
def showDataHist():
    """Plot the distribution of every feature column of ``input_df``.

    Uses a fixed grid of 6 rows by 3 columns, filled row by row in the order
    of ``featurecolumns``.
    """
    fig, axis = plt.subplots(6, 3, figsize=(20, 40))
    for position, column in enumerate(featurecolumns):
        row, col = divmod(position, 3)
        sns.distplot(input_df[column], ax=axis[row, col])
    fig.show()
showDataHist()

In [None]:
def dataNormalized(feature_df,zeromean=True):
    """Rescale each column of ``feature_df`` and return the result.

    Args:
        feature_df: Numeric DataFrame (or Series) to rescale. Not modified.
        zeromean: When True, standardize to zero mean and unit standard
            deviation. When False, min-max scale into the range [0, 1].

    Returns:
        A new object of the same shape. Columns with no spread (zero standard
        deviation, or max equal to min) come back as all zeros instead of
        NaN/inf from a division by zero.
    """
    if zeromean:
        offset, scale = feature_df.mean(), feature_df.std()
    else:
        low = feature_df.min()
        offset, scale = low, feature_df.max() - low

    # A constant column has zero spread; dividing by 1 keeps it at 0.
    if isinstance(scale, pd.Series):
        scale = scale.replace(0, 1)
    elif scale == 0:
        scale = 1
    return (feature_df - offset) / scale
# Standardize the feature columns using statistics of the training table itself.
normalized_feature = dataNormalized(input_df[featurecolumns],True)
# Overwrites the raw feature values, so the un-normalized data (and the mean/std
# used here) are no longer available to later cells.
input_df[featurecolumns] = normalized_feature

In [None]:
def balanceDataset(train_df,valid_df,minnum):
    """Move rows between the splits so rare label values reach validation.

    For every label column, the value (0 or 1) that is less frequent in
    ``valid_df`` is looked at (1 on a tie). If validation holds less than the
    fraction ``minnum`` of all rows carrying that value, a random sample of
    such rows is moved from ``train_df`` to ``valid_df``. When validation has
    more rows with the opposite value than were just moved, roughly the same
    number of opposite-value rows is moved back to training.

    Args:
        train_df: Training split.
        valid_df: Validation split; its index labels must not overlap with
            those of ``train_df``.
        minnum: Minimum share (0..1) of each rare label value that validation
            should hold.

    Returns:
        Tuple ``(train_df, valid_df)`` of the adjusted frames. The inputs are
        not modified; new frames are built instead.
    """
    for lc in labelcolumns:
        ones_in_valid = int((valid_df[lc] == 1).sum())
        zeros_in_valid = int((valid_df[lc] == 0).sum())
        l = 0 if ones_in_valid > zeros_in_valid else 1

        train_rare = train_df[train_df[lc] == l]
        trainn = len(train_rare)
        validn = int((valid_df[lc] == l).sum())
        if validn + 1 < minnum * (trainn + validn):
            # Rows still missing for validation to hold its `minnum` share.
            moven = (trainn + validn) * minnum - validn
            move_rows = train_rare.sample(frac=moven / trainn)
            # `DataFrame.append` no longer exists in pandas 2.x; concat is equivalent.
            valid_df = pd.concat([valid_df, move_rows])
            # Drop by the rows' own index labels rather than by a data column.
            train_df = train_df.drop(move_rows.index)

            valid_common = valid_df[valid_df[lc] == (not l)]
            validnn = len(valid_common)
            if validnn > len(move_rows):
                back_rows = valid_common.sample(frac=len(move_rows) / validnn)
                train_df = pd.concat([train_df, back_rows])
                valid_df = valid_df.drop(back_rows.index)
    return train_df,valid_df


def splitDataframe(df,train_sample,minnum=0.01):
    """Shuffle ``df`` and split it into training and validation frames.

    Args:
        df: Frame to split.
        train_sample: Fraction (0..1) of the shuffled rows that goes to training.
        minnum: Minimum share of each rare label value that the validation
            split should hold; passed on to ``balanceDataset``.

    Returns:
        Tuple ``(train_df, valid_df)`` after label balancing.
    """
    # Fixed seed so the shuffle is the same on every run.
    shuffle_df = df.sample(frac=1,random_state=2021)
    cut = int(train_sample*len(shuffle_df))
    # Positional slicing replaces `np.split`, which goes through a deprecated
    # pandas code path when handed a DataFrame.
    train_df = shuffle_df.iloc[:cut]
    valid_df = shuffle_df.iloc[cut:]
    train_df,valid_df=balanceDataset(train_df,valid_df,minnum)
    return train_df,valid_df
            
# 98% of the shuffled rows go to training, the rest to validation (before balancing).
train_df,valid_df=splitDataframe(input_df,0.98)
# Final sizes of the two splits.
print(len(train_df),len(valid_df))

In [None]:
def trainDataframeBalance(df):
    """Build a re-sampled copy of ``df`` for training.

    Steps:
      1. For each of the first six label columns, take every row where the
         label is 0; for each of the last two label columns, take every row
         where the label is 1. A row matching several conditions appears once
         per match.
      2. For each of the first six label columns, if the collected rows hold
         more than twice as many 0s as 1s, add a random sample of the rows of
         ``df`` where that label is 1, with sampling fraction
         ``(zeros - ones) / (3 * zeros)``.

    Args:
        df: Source frame containing all ``labelcolumns``. Not modified.

    Returns:
        New DataFrame with the collected rows. Column dtypes of ``df`` are kept.
    """
    head_labels = labelcolumns[:6]
    tail_labels = labelcolumns[-2:]

    pieces = [df[df[cn] == 0] for cn in head_labels]
    pieces += [df[df[cn] == 1] for cn in tail_labels]
    if not pieces:
        # No label columns at all: return an empty frame with df's columns.
        return df.iloc[0:0].copy()
    # One concat replaces repeated `DataFrame.append` (removed in pandas 2.0).
    balance_df = pd.concat(pieces)

    for cn in head_labels:
        n_zero = int((balance_df[cn] == 0).sum())
        n_one = int((balance_df[cn] == 1).sum())
        if 2 * n_one < n_zero:
            extra = df[df[cn] == 1].sample(frac=(n_zero - n_one) / (n_zero * 3))
            balance_df = pd.concat([balance_df, extra])
    return balance_df
#balance_df=trainDataframeBalance(train_df)
        

In [None]:
class CFG():
    """Container for training settings used by the notebook."""
    def __init__(self):
        # Number of samples per batch.
        self.BATCHSIZE=100
        # Number of training epochs.
        self.EPOCH=200
        # NOTE(review): presumably the score above which a model is saved - confirm against the training loop.
        self.savescore=80
        # Use the GPU when one is available, otherwise fall back to the CPU so
        # the notebook also runs on CPU-only machines.
        self.DEVICE=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cfg=CFG()

In [None]:
def splitRandomforestDataset(df):
    """Split ``df`` into plain Python lists of feature rows and label rows.

    Args:
        df: Frame containing every name in ``featurecolumns`` and ``labelcolumns``.

    Returns:
        Tuple ``(featureX, labelX)``: two lists with one inner list per row of
        ``df``, holding that row's feature values and label values respectively.
    """
    # One vectorized conversion per group instead of building a Series per row
    # with `iterrows`.
    featureX = df[featurecolumns].values.tolist()
    labelX = df[labelcolumns].values.tolist()
    return featureX,labelX
featureX,labelX = splitRandomforestDataset(train_df)
featureY,labelY = splitRandomforestDataset(valid_df)

In [None]:
# Random forest with 100 trees and a fixed seed for repeatable fits.
rf = RandomForestClassifier(n_estimators = 100, random_state = 31)
# Each row of `labelX` holds one value per label column, so this is a multi-output fit.
rf.fit(featureX,labelX)

In [None]:
# Predict all label columns for the validation rows.
predictions = rf.predict(featureY)

In [None]:
# Accuracy of the validation predictions against the validation labels.
accuracy = metrics.accuracy_score(labelY, predictions)
print(accuracy)

In [None]:
def calcaulateMacroF1(allpred,allans,allpredacc,nclasses,rou=3):
    """Compute per-class recall/precision (in percent) and a combined F score.

    The F score is calculated from the class-averaged recall and the
    class-averaged precision, not as an average of per-class F scores.

    Args:
        allpred: Per class, how many times the class was predicted.
        allans: Per class, how many times the class is the true answer.
        allpredacc: Per class, how many predictions of the class were correct.
        nclasses: Number of classes; only the first ``nclasses`` entries are read.
        rou: Number of decimals every returned number is rounded to.

    Returns:
        Tuple ``(macro_f1, recalls, precisions)``. A class with a zero
        denominator gets 0 for the affected ratio.
    """
    def _percentage(hits, total):
        return 0 if total == 0 else 100*hits/total

    recalls = []
    precisions = []
    for cls in range(nclasses):
        recalls.append(_percentage(allpredacc[cls], allans[cls]))
        precisions.append(_percentage(allpredacc[cls], allpred[cls]))

    avg_recalls = float(sum(recalls) / nclasses)
    avg_precisions = float(sum(precisions) / nclasses)

    # Small constant that keeps the denominator non-zero; it is also added to
    # the factor 2 in the numerator.
    beta = 0.000001
    numerator = (2+beta)*(avg_recalls*avg_precisions)
    denominator = (avg_recalls+avg_precisions)+beta
    macro_f1 = round(numerator/denominator, rou)

    rounded_recalls = [round(value, rou) for value in recalls]
    rounded_precisions = [round(value, rou) for value in precisions]
    return macro_f1, rounded_recalls, rounded_precisions

In [None]:
# Per-label counters over the validation set:
#   totalans  - rows whose true label is 1
#   totalpred - rows predicted as 1
#   totalacc  - rows predicted as 1 whose true label is also 1
totalacc=[0]*len(labelcolumns)
totalans=[0]*len(labelcolumns)
totalpred=[0]*len(labelcolumns)
# Rows for which every label was predicted correctly.
acc=0
for idx in range(len(predictions)):
    # Threshold each output at 0.5 to get a 0/1 decision.
    pred = [int(p>0.5) for p in predictions[idx]]
    if list(labelY[idx]) == list(pred):
        acc+=1
    for lidx in range(len(labelcolumns)):
        predvalue = pred[lidx]
        totalans[lidx]+=int(labelY[idx][lidx])
        totalpred[lidx]+= predvalue
        if labelY[idx][lidx] == predvalue:
            # Adds 1 only when the matching value is 1.
            totalacc[lidx]+= predvalue
print(acc/len(labelY))
macro_f1,recalls ,precisions = calcaulateMacroF1(totalpred,totalans,totalacc,len(labelcolumns))
print('f1-score: {} acc:{}'.format("%.3f"%macro_f1,"%.2f"%(acc/len(labelY))))
print('\nrecall: {} \nprecis: {}'.format(recalls,precisions))
# Unweighted means of the per-label precision and recall.
print(sum(precisions)/len(precisions),sum(recalls)/len(recalls))
# Cross-check against scikit-learn's own metrics.
print(metrics.accuracy_score(labelY, predictions))
print(metrics.precision_recall_fscore_support(labelY, predictions,average='micro'))

In [None]:
# Load the test table and drop the column that was also excluded from the training features.
test_df = read_csvfile(os.path.join(BASEPATH,"test_data.csv"))
test_df=test_df.drop("d.rainfall_detect",axis=1)
# Printed side by side so the training feature names can be compared with the test columns.
print(featurecolumns)
test_df.info()

In [None]:

# Sample submission file; loaded here but not used in the cells shown.
submit = read_csvfile(os.path.join(BASEPATH,"submission.csv"))
# Encode the timestamp numerically: month, day, hour and minute digits read as
# an integer and divided by 1e5. The date part is split on '-' here.
# NOTE(review): the training cell splits on '/' and keeps all clock digits -
# confirm both encodings produce values on the same scale.
test_df["d.log_time"] = [int(i.split(' ')[0].split('-')[1]+i.split(' ')[0].split('-')[2]+ i.split(' ')[1].split(':')[0]+ i.split(':')[1])/1e5  for i in test_df["d.log_time"]]
print(test_df["d.log_time"])

# Same feature transforms as for training; the imputation model is reused, not refitted.
test_df = logData(test_df,"d.wind_direction")
for idx,columnsname in enumerate(columnsnames):
    test_df = replacezerovalues(test_df,columnsname,False)
    #test_df = logData(test_df,columnsname)

# NOTE(review): this standardizes with the test set's own mean/std, whereas the
# model was trained on data standardized with training statistics.
normalized_feature = dataNormalized(test_df,True)

test_df = normalized_feature
display(test_df.describe())
display(train_df[featurecolumns].describe())
# Select the inputs by name, in the exact order used for training, instead of
# relying on the column positions of the test file. A plain array is passed
# because the forest was fitted on plain lists without feature names.
ans = rf.predict(test_df[featurecolumns].values)



In [None]:
# One column per label, one row per test sample.
ans_df = pd.DataFrame(ans,columns=labelcolumns)
# Written with pandas defaults, so the row index becomes the first column of the file.
# NOTE(review): confirm this matches the layout of the sample submission.
ans_df.to_csv("submission.csv")