In [1]:
import pandas as pd
import numpy as np
import sklearn
import torch
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
%matplotlib inline

In [2]:
train_origin_data=pd.read_csv('./data/custom_contest/train.csv')
test_origin_data=pd.read_csv('./data/custom_contest/test.csv')
'''
0은 바꿀 필요가 있음 o는 숫자이므로 유지
신고번호 = x
신고일자 = x
통관지세관부호 = o
신고인부호 = 0
수입자부호 = 0
해외 거래처 부호 = 0
특송업체부호 = 0 

'''

# 데이터 확인
print('Data 종류')
print(train_origin_data.shape)
print(train_origin_data.columns)
# 쓸모없는 데이터 날리기
train_origin_data.drop('신고번호',axis=1,inplace=True)
train_origin_data.drop('수입자부호',axis=1,inplace=True)
# target 두개 분리
crime_target=torch.tensor(train_origin_data.pop('우범여부').to_numpy())
priority_target=torch.tensor(train_origin_data.pop('핵심적발').to_numpy())
#numerical data 분리
train_weight=np.log(train_origin_data.pop('신고중량(KG)').to_numpy()).reshape(-1,1)
train_price=np.log(train_origin_data.pop('과세가격원화금액').to_numpy()).reshape(-1,1)
train_custom_rate=train_origin_data.pop('관세율').to_numpy().reshape(-1,1)
# 분리 확인
print(train_origin_data.columns)


Data 종류
(89355, 24)
Index(['신고번호', '신고일자', '통관지세관부호', '신고인부호', '수입자부호', '해외거래처부호', '특송업체부호',
       '수입통관계획코드', '수입신고구분코드', '수입거래구분코드', '수입종류코드', '징수형태코드', '신고중량(KG)',
       '과세가격원화금액', '운송수단유형코드', '반입보세구역부호', 'HS10단위부호', '적출국가코드', '원산지국가코드',
       '관세율구분코드', '관세율', '검사결과코드', '우범여부', '핵심적발'],
      dtype='object')
Index(['신고일자', '통관지세관부호', '신고인부호', '해외거래처부호', '특송업체부호', '수입통관계획코드', '수입신고구분코드',
       '수입거래구분코드', '수입종류코드', '징수형태코드', '운송수단유형코드', '반입보세구역부호', 'HS10단위부호',
       '적출국가코드', '원산지국가코드', '관세율구분코드', '검사결과코드'],
      dtype='object')
  train_price=np.log(train_origin_data.pop('과세가격원화금액').to_numpy()).reshape(-1,1)


In [3]:
for key in train_origin_data.keys():
    enc=OneHotEncoder().fit(train_origin_data[key].to_numpy().reshape(-1,1))
    encoded_data=enc.transform(train_origin_data[key].to_numpy().reshape(-1,1))
    print(key,':',encoded_data.shape)

신고일자 : (89355, 325)
통관지세관부호 : (89355, 40)
신고인부호 : (89355, 965)
해외거래처부호 : (89355, 4779)
특송업체부호 : (89355, 81)
수입통관계획코드 : (89355, 7)
수입신고구분코드 : (89355, 4)
수입거래구분코드 : (89355, 25)
수입종류코드 : (89355, 10)
징수형태코드 : (89355, 9)
운송수단유형코드 : (89355, 6)
반입보세구역부호 : (89355, 568)
HS10단위부호 : (89355, 2419)
적출국가코드 : (89355, 89)
원산지국가코드 : (89355, 94)
관세율구분코드 : (89355, 35)
검사결과코드 : (89355, 429)


In [4]:
from sklearn.preprocessing import OneHotEncoder
# One hot encoding
enc=OneHotEncoder(dtype=np.float32).fit(train_origin_data.to_numpy().reshape(-1,len(train_origin_data.columns)))
train_encoded_data=enc.transform(train_origin_data.to_numpy().reshape(-1,len(train_origin_data.columns))).toarray()
print("encoded dataset",train_encoded_data.shape)

# concat dataset
train_price_tensor=torch.tensor(train_price,dtype=torch.float)
train_weight_tensor=torch.tensor(train_weight,dtype=torch.float)
train_custom_rate_tensor=torch.tensor(train_custom_rate,dtype=torch.float)
train_encoded_data_tensor=torch.tensor(train_encoded_data,dtype=torch.float)
train_tensor_data=torch.cat((train_encoded_data_tensor,train_price_tensor,train_weight_tensor,train_custom_rate_tensor),dim=1)
del train_price,train_weight,train_custom_rate,train_encoded_data
print(train_tensor_data.size())
# shape 잘라야쥬
# train,valid,test set 만들기

encoded dataset (89355, 9885)
torch.Size([89355, 9888])


In [9]:

dataset=TensorDataset(train_tensor_data,crime_target)
model=nn.Sequential(nn.Linear(train_tensor_data.shape[1],500),\
    nn.ReLU(),
    nn.Linear(500,100),
    nn.ReLU(),
    nn.Linear(100,10),
    nn.ReLU(),
    nn.Linear(10,2)
    )
batch_size=64
train_data_loader=DataLoader(dataset,batch_size=batch_size,shuffle=True,)
criterion=torch.nn.CrossEntropyLoss()
optimizer=torch.optim.SGD(model.parameters(),lr=0.000001,weight_decay=1e-4)
epochs=100

In [10]:
model.cuda()
for epoch in range(1,epochs+1):
    total=0
    correct=0.0
    train_loss=0.0
    for batch_idx, (data,targets) in enumerate(train_data_loader):
        data,targets=data.cuda(),targets.cuda()
        outputs=model(data)
        loss=criterion(outputs,targets)

        optimizer.zero_grad()
        if not torch.isfinite(loss):
            if batch_idx%100==0:
                print('WARNING: non-finite loss')
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(outputs, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).sum().float().cpu()
        train_loss+=loss.item()
        if batch_idx%100==0:
            print('{}epoch {}/{}, Accurcay: {:.2f} Loss:{:.5f}'.format(epoch,total,len(train_data_loader.dataset),correct/total*100.0,train_loss/(batch_idx+1)))

1epoch 64/89355, Accurcay: 76.56 Loss:0.61746
1epoch 6464/89355, Accurcay: 77.07 Loss:nan
1epoch 12864/89355, Accurcay: 76.97 Loss:nan
1epoch 19264/89355, Accurcay: 77.07 Loss:nan
1epoch 25664/89355, Accurcay: 76.91 Loss:nan
1epoch 32064/89355, Accurcay: 76.85 Loss:nan
1epoch 38464/89355, Accurcay: 76.84 Loss:nan
1epoch 44864/89355, Accurcay: 76.94 Loss:nan
1epoch 51264/89355, Accurcay: 76.91 Loss:nan
1epoch 57664/89355, Accurcay: 76.91 Loss:nan
1epoch 64064/89355, Accurcay: 76.89 Loss:nan


KeyboardInterrupt: 