In [1]:
import pandas as pd
import numpy as np
import sklearn
import torch
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
%matplotlib inline

In [2]:
#Seed 고정
random_seed=1
random.seed(random_seed)
torch.manual_seed(random_seed)
torch.random.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)
np.random.seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
train_origin_data=pd.read_csv('./data/custom_contest/train.csv')
test_origin_data=pd.read_csv('./data/custom_contest/test.csv')
'''
0은 바꿀 필요가 있음 o는 숫자이므로 유지
신고번호 = x
신고일자 = x
통관지세관부호 = o
신고인부호 = 0
수입자부호 = 0
해외 거래처 부호 = 0
특송업체부호 = 0 

'''
train_origin_data=train_origin_data.fillna('missing')
# 데이터 확인
print('Data 종류')
print(train_origin_data.shape)
print(train_origin_data.columns)
# 쓸모없는 데이터 날리기
train_origin_data.drop('신고번호',axis=1,inplace=True)
train_origin_data.drop('신고일자',axis=1,inplace=True)
train_origin_data.drop('수입자부호',axis=1,inplace=True)
train_origin_data.drop('검사결과코드',axis=1,inplace=True)
train_origin_data.drop('해외거래처부호',axis=1,inplace=True)
train_origin_data.drop('HS10단위부호',axis=1,inplace=True)
# target 두개 분리
crime_target=torch.tensor(train_origin_data.pop('우범여부').to_numpy())#,dtype=torch.float)
priority_target=torch.tensor(train_origin_data.pop('핵심적발').to_numpy())
#numerical data 분리
train_weight=np.log(train_origin_data.pop('신고중량(KG)').to_numpy()+1).reshape(-1,1)
train_price=np.log(train_origin_data.pop('과세가격원화금액').to_numpy()+1).reshape(-1,1)
train_custom_rate=train_origin_data.pop('관세율').to_numpy().reshape(-1,1)
# 분리 확인
print('제거 후 데이터 종류',train_origin_data.columns)
print(train_origin_data.tail())


Data 종류
(89355, 24)
Index(['신고번호', '신고일자', '통관지세관부호', '신고인부호', '수입자부호', '해외거래처부호', '특송업체부호',
       '수입통관계획코드', '수입신고구분코드', '수입거래구분코드', '수입종류코드', '징수형태코드', '신고중량(KG)',
       '과세가격원화금액', '운송수단유형코드', '반입보세구역부호', 'HS10단위부호', '적출국가코드', '원산지국가코드',
       '관세율구분코드', '관세율', '검사결과코드', '우범여부', '핵심적발'],
      dtype='object')
제거 후 데이터 종류 Index(['통관지세관부호', '신고인부호', '특송업체부호', '수입통관계획코드', '수입신고구분코드', '수입거래구분코드',
       '수입종류코드', '징수형태코드', '운송수단유형코드', '반입보세구역부호', '적출국가코드', '원산지국가코드',
       '관세율구분코드'],
      dtype='object')
       통관지세관부호  신고인부호   특송업체부호 수입통관계획코드 수입신고구분코드  수입거래구분코드  수입종류코드  징수형태코드  \
89350       10  M9SYU   PR5UFJ        C        B        11      21      11   
89351       41  T7VQN  missing        E        E        15      11      11   
89352       40  7Q31W  missing        C        B        29      21      11   
89353       40  UJ0JR   O04TIW        F        B        15      21      14   
89354       30  4TUUB  missing        Z        B        15      21      11   

       운송수단유형코드

In [4]:
for key in train_origin_data.keys():
    enc=OneHotEncoder().fit(train_origin_data[key].to_numpy().reshape(-1,1))
    encoded_data=enc.transform(train_origin_data[key].to_numpy().reshape(-1,1))
    print(key,':',encoded_data.shape)

통관지세관부호 : (89355, 40)
신고인부호 : (89355, 965)
특송업체부호 : (89355, 81)
수입통관계획코드 : (89355, 7)
수입신고구분코드 : (89355, 4)
수입거래구분코드 : (89355, 25)
수입종류코드 : (89355, 10)
징수형태코드 : (89355, 9)
운송수단유형코드 : (89355, 6)
반입보세구역부호 : (89355, 568)
적출국가코드 : (89355, 89)
원산지국가코드 : (89355, 94)
관세율구분코드 : (89355, 35)


In [5]:

# One hot encoding
enc=OneHotEncoder(dtype=np.float32).fit(train_origin_data.to_numpy().reshape(-1,len(train_origin_data.columns)))
train_encoded_data=enc.transform(train_origin_data.to_numpy().reshape(-1,len(train_origin_data.columns))).toarray()
print("encoded dataset",train_encoded_data.shape)

# concat dataset
train_price_tensor=torch.tensor(train_price,dtype=torch.float)
train_weight_tensor=torch.tensor(train_weight,dtype=torch.float)
train_custom_rate_tensor=torch.tensor(train_custom_rate,dtype=torch.float)
train_encoded_data_tensor=torch.tensor(train_encoded_data,dtype=torch.float)
train_tensor_data=torch.cat((train_encoded_data_tensor,train_price_tensor,train_weight_tensor,train_custom_rate_tensor),dim=1)
# train_tensor_data=torch.cat((train_price_tensor,train_weight_tensor,train_custom_rate_tensor),dim=1)
del train_price,train_weight,train_custom_rate,train_encoded_data
print(train_tensor_data.size())


encoded dataset (89355, 1933)
torch.Size([89355, 1936])


In [26]:
#data 자르기
batch_size=128
test_split_rate=0.2
indices=np.arange(len(train_tensor_data))
crime_dataset=TensorDataset(train_tensor_data,crime_target)
priority_dataset=TensorDataset(train_tensor_data,priority_target.float())
# x,y train_index,test_index 보내기
# model 구조
train_indices,test_indices=train_test_split(indices,stratify=crime_target)
np.save('./data/custom_contest/mod_data.npy',train_tensor_data.numpy())
np.save('./data/custom_contest/mod_crime_target.npy',crime_target)
np.save('./data/custom_contest/mod_priority_target.npy',priority_target)
np.save('./data/custom_contest/mod_train_index.npy',train_indices)
np.save('./data/custom_contest/mod_test_index.npy',test_indices)
#crime
crime_train_dataset=Subset(crime_dataset,train_indices)
crime_test_dataset=Subset(crime_dataset,test_indices)
crime_train_data_loader=DataLoader(crime_train_dataset,batch_size=batch_size,shuffle=True,)
crime_test_data_loader=DataLoader(crime_train_dataset,batch_size=batch_size,shuffle=False)
#priority
priority_train_dataset=Subset(priority_dataset,train_indices)
priority_test_dataset=Subset(priority_dataset,test_indices)
priority_train_data_loader=DataLoader(priority_train_dataset,batch_size=batch_size,shuffle=True,)
priority_test_data_loader=DataLoader(priority_train_dataset,batch_size=batch_size,shuffle=False)


In [15]:

crime_model=nn.Sequential(nn.Linear(train_tensor_data.shape[1],5000),
    nn.BatchNorm1d(5000),
    nn.ReLU(),
    nn.Linear(5000,1000),
    nn.BatchNorm1d(1000),
    nn.ReLU(),
    nn.Linear(1000,100),
    nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100,2),
    # nn.Sigmoid()
    )
# criterion=torch.nn.BCELoss()
crime_criterion=torch.nn.CrossEntropyLoss()
crime_optimizer=torch.optim.Adam(crime_model.parameters(),lr=1e-3,weight_decay=1e-4)
epochs=100


In [16]:
def train(epoch,train_data_loader,model,optimizer,criterion):
  model.train()
  total=0
  correct=0.0
  train_loss=0.0
  for batch_idx, (data,targets) in enumerate(train_data_loader):
      data,targets=data.cuda(),targets.cuda()
      outputs=model(data)
      loss=criterion(outputs,targets)#.view(-1,1))

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      # correct+=torch.eq(torch.ge(outputs,0.5).float(),targets.view(-1,1)).sum()
      correct+=torch.eq(torch.max(outputs,dim=1)[1],targets).sum()
      total += targets.size(0)
      train_loss+=loss.item()
      if batch_idx%50==1:
          # print(outputs)
          # print(torch.eq(torch.ge(outputs,0.5).float(),targets.view(-1,1)).view(-1))
          print('\r{}epoch {}/{}, Accurcay: {:.2f} Loss:{:.5f}'.format(epoch,total,len(train_data_loader.dataset),correct/total*100.0,train_loss/(batch_idx+1)),end='')

def eval(epoch,test_data_loader,model,optimizer,criterion):
    model.eval()
    eval_loss=0.0
    correct=0.0
    total=0.0
    with torch.no_grad():
      for batch_idx,(data,targets) in enumerate(test_data_loader):
          data,targets=data.cuda(),targets.cuda()
          outputs=model(data)
          loss=criterion(outputs,targets)#.view(-1,1))
          # correct+=torch.eq(torch.ge(outputs,0.5).float(),targets.view(-1,1)).sum()
          correct+=torch.eq(torch.max(outputs,dim=1)[1],targets).sum()
          total += targets.size(0)
          eval_loss +=loss.item()

    print('\n Eval: {} epoch, Accuracy: {:.2f}, Loss: {}'.format(epoch,correct/total*100.0,eval_loss/(batch_idx+1)))


crime_model.cuda()
for epoch in range(1,epochs+1):
  train(epoch,crime_train_data_loader,crime_model,crime_optimizer,crime_criterion)
  eval(epoch,crime_test_data_loader,crime_model,crime_optimizer,crime_criterion)


1epoch 64256/67016, Accurcay: 79.30 Loss:0.44348
 Eval: 1 epoch, Accuracy: 79.01, Loss: 0.4317009210700297
2epoch 6656/67016, Accurcay: 80.06 Loss:0.42172

KeyboardInterrupt: 

In [24]:
priority_model=nn.Sequential(nn.Linear(train_tensor_data.shape[1],5000),
    nn.BatchNorm1d(5000),
    nn.ReLU(),
    nn.Linear(5000,1000),
    nn.BatchNorm1d(1000),
    nn.ReLU(),
    nn.Linear(1000,100),
    nn.BatchNorm1d(100),
    nn.ReLU(),
    nn.Linear(100,1)
)
priority_criterion=torch.nn.MSELoss()
priority_optimizer=torch.optim.Adam(priority_model.parameters(),lr=1e-3,weight_decay=1e-4)
epochs=100

In [30]:
def train(epoch,train_data_loader,model,optimizer,criterion):
  model.train()
  total=0
  correct=0.0
  train_loss=0.0
  for batch_idx, (data,targets) in enumerate(train_data_loader):
      data,targets=data.cuda(),targets.cuda()
      outputs=model(data)
      loss=criterion(torch.clip(outputs,0,2),targets.view(-1,1))

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      # correct+=torch.eq(torch.ge(outputs,0.5).float(),targets.view(-1,1)).sum()
      correct+=torch.eq(torch.round(outputs),targets.view(-1,1)).sum()
      total += targets.size(0)
      train_loss+=loss.item()
      if batch_idx%50==1:
          # print(outputs)
          # print(torch.eq(torch.ge(outputs,0.5).float(),targets.view(-1,1)).view(-1))
          print('\r{}epoch {}/{}, Accurcay: {:.2f} Loss:{:.5f}'.format(epoch,total,len(train_data_loader.dataset),correct/total*100.0,train_loss/(batch_idx+1)),end='')

def eval(epoch,test_data_loader,model,optimizer,criterion):
    model.eval()
    eval_loss=0.0
    correct=0.0
    total=0.0
    with torch.no_grad():
      for batch_idx,(data,targets) in enumerate(test_data_loader):
          data,targets=data.cuda(),targets.cuda()
          outputs=model(data)
          loss=criterion(outputs,targets.view(-1,1))
          # correct+=torch.eq(torch.ge(outputs,0.5).float(),targets.view(-1,1)).sum()
          correct+=torch.eq(torch.round(outputs),targets.view(-1,1)).sum()
          total += targets.size(0)
          eval_loss +=loss.item()

    print('\n Eval: {} epoch, Accuracy: {:.2f}, Loss: {}'.format(epoch,correct/total*100.0,eval_loss/(batch_idx+1)))


priority_model.cuda()
for epoch in range(1,epochs+1):
  train(epoch,priority_train_data_loader,priority_model,priority_optimizer,priority_criterion)
  eval(epoch,priority_test_data_loader,priority_model,priority_optimizer,priority_criterion)


1epoch 64256/67016, Accurcay: 72.77 Loss:0.31335
 Eval: 1 epoch, Accuracy: 68.90, Loss: 0.3111581713074946
2epoch 64256/67016, Accurcay: 74.15 Loss:0.29433
 Eval: 2 epoch, Accuracy: 77.92, Loss: 0.3166913243138608
3epoch 64256/67016, Accurcay: 76.24 Loss:0.27248
 Eval: 3 epoch, Accuracy: 77.70, Loss: 0.2554301707969822
4epoch 64256/67016, Accurcay: 77.58 Loss:0.25226
 Eval: 4 epoch, Accuracy: 76.60, Loss: 0.27607957105941444
5epoch 64256/67016, Accurcay: 78.83 Loss:0.23214
 Eval: 5 epoch, Accuracy: 81.93, Loss: 0.26851120308212195
6epoch 64256/67016, Accurcay: 80.16 Loss:0.21633
 Eval: 6 epoch, Accuracy: 83.75, Loss: 0.19827706671056858
7epoch 64256/67016, Accurcay: 81.23 Loss:0.20167
 Eval: 7 epoch, Accuracy: 84.52, Loss: 0.2201266085856971
8epoch 64256/67016, Accurcay: 81.80 Loss:0.19106
 Eval: 8 epoch, Accuracy: 85.43, Loss: 0.17858036082835144
9epoch 64256/67016, Accurcay: 82.26 Loss:0.18235
 Eval: 9 epoch, Accuracy: 83.76, Loss: 0.22383690190337996
10epoch 64256/67016, Accurcay: 8

KeyboardInterrupt: 