In [109]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2022-ai-tp-17010420/train.csv
/kaggle/input/2022-ai-tp-17010420/test.csv
/kaggle/input/2022-ai-tp-17010420/sample_submit.csv


In [110]:
import numpy as np 
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import random
from sklearn.preprocessing import StandardScaler

# **시드 고정 및 device 설정**

In [111]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator used by this notebook with the same value.
seed=777
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# `device` is a torch.device object, so its `.type` string is what must be
# compared; `device == 'cuda'` evaluates to False and skipped this branch.
if device.type == 'cuda':
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Turn off cuDNN autotuning and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark=False
torch.backends.cudnn.deterministic=True

# **데이터 불러오기**

In [112]:
# Read the training table from the competition input directory.
train_data = pd.read_csv(
    '/kaggle/input/2022-ai-tp-17010420/train.csv'
)
train_data  # last expression of the cell: rendered as the cell output

Unnamed: 0,도로 종류,가해자 차종,발생년도,횟수
0,시도,자전거,2011,626
1,시도,농기계,2008,71
2,시도,원동기장치자전거,2014,1871
3,일반국도,건설기계,2017,393
4,고속국도,건설기계,2012,21
...,...,...,...,...
625,군도,화물차,2014,1625
626,군도,승합차,2013,372
627,지방도,농기계,2006,95
628,시도,특수차,2020,324


In [113]:
# Read the test table from the competition input directory.
x_test = pd.read_csv(
    '/kaggle/input/2022-ai-tp-17010420/test.csv'
)
x_test  # last expression of the cell: rendered as the cell output

Unnamed: 0,도로 종류,가해자 차종,발생년도
0,지방도,건설기계,2007
1,고속국도,승합차,2016
2,고속국도,자전거,2010
3,일반국도,농기계,2009
4,일반국도,이륜차,2019
...,...,...,...
266,일반국도,건설기계,2013
267,특별광역시도,건설기계,2011
268,지방도,자전거,2019
269,특별광역시도,이륜차,2011


# **데이터 가공**

In [114]:
# Group the training rows by road type; the group keys are the distinct road-type labels.
roadLabel = train_data.groupby('도로 종류')
roadLabel.groups.keys()  # show the distinct labels

dict_keys(['고속국도', '군도', '시도', '일반국도', '지방도', '특별광역시도'])

In [115]:
# Group the training rows by offending vehicle type; the group keys are the distinct vehicle labels.
carLabel = train_data.groupby('가해자 차종')
carLabel.groups.keys()  # show the distinct labels

dict_keys(['건설기계', '농기계', '승용차', '승합차', '원동기장치자전거', '이륜차', '자전거', '특수차', '화물차'])

In [116]:
# Replace every category label with its integer position among the group keys.
# The codes come from the training groups and are written to both frames so
# that train and test share one encoding. Road type is handled first, then
# vehicle type; within each label the training frame is updated before the
# test frame.
for column, grouped in (('도로 종류', roadLabel), ('가해자 차종', carLabel)):
    for code, label in enumerate(grouped.groups.keys()):
        for frame in (train_data, x_test):
            frame.loc[frame[column] == label, column] = code

train_data

Unnamed: 0,도로 종류,가해자 차종,발생년도,횟수
0,2,6,2011,626
1,2,1,2008,71
2,2,4,2014,1871
3,3,0,2017,393
4,0,0,2012,21
...,...,...,...,...
625,1,8,2014,1625
626,1,3,2013,372
627,4,1,2006,95
628,2,7,2020,324


In [117]:
x_test  # display the test frame after the label encoding applied in the previous cell

Unnamed: 0,도로 종류,가해자 차종,발생년도
0,4,0,2007
1,0,3,2016
2,0,6,2010
3,3,1,2009
4,3,5,2019
...,...,...,...
266,3,0,2013
267,5,0,2011
268,4,6,2019
269,5,5,2011


In [118]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

# Features are every column except the last one; the last column is the target.
x_train=train_data.iloc[:,:-1]
# reshape(-1,1) builds the single-column target array for whatever number of
# rows train_data holds, instead of hard-coding the row count.
y_train=np.array(train_data.iloc[:,-1]).reshape(-1,1)

# Fit the scaler on the training features only, then reuse its statistics for the test set.
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

# Convert everything to float32 tensors on the selected device.
x_train = torch.FloatTensor(x_train).to(device)
y_train = torch.FloatTensor(y_train).to(device)
x_test = torch.FloatTensor(x_test).to(device)

# **학습 파라미터 설정**

In [119]:
# Training settings read by the model, data-loader, optimizer and training cells.
learning_rate = 1e-4      # step size passed to the optimizer
training_epochs = 10_000  # number of passes over the training loader
node_num = 512            # width of each hidden layer of the network
batch_size = 100          # samples per mini-batch

# **데이터 로더 설정**

In [120]:
from torch.utils.data import DataLoader, TensorDataset

# Pair each feature row with its target and serve the pairs in mini-batches.
# NOTE(review): `shuffle` is not passed, so the loader keeps its default batch
# order every epoch; consider whether shuffling is wanted.
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

# **모델 생성**

In [121]:
class NN(torch.nn.Module):
    """Fully connected regressor: four ReLU-activated hidden layers and a linear output layer.

    Args:
        input_dim: Number of input features per sample (default 3).
        hidden_dim: Width of every hidden layer. When ``None`` the notebook-level
            ``node_num`` setting is read at construction time, which is what a
            plain ``NN()`` call has always done.
        output_dim: Number of values produced per sample (default 1).
    """

    def __init__(self, input_dim=3, hidden_dim=None, output_dim=1):
        super().__init__()

        if hidden_dim is None:
            hidden_dim = node_num

        # The attribute names linear1..linear5 are kept so parameter names
        # (and therefore state_dict keys) do not change.
        self.linear1 = nn.Linear(input_dim, hidden_dim, bias=True)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.linear3 = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.linear4 = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.linear5 = nn.Linear(hidden_dim, output_dim, bias=True)
        self.relu = nn.ReLU()

        # Re-initialise the weights only after all layers exist, in layer
        # order, so the random draws happen in the same sequence as before.
        for layer in self._layers():
            nn.init.xavier_uniform_(layer.weight)

    def _layers(self):
        """Return the five linear layers in forward order."""
        return (self.linear1, self.linear2, self.linear3, self.linear4, self.linear5)

    def forward(self, x):
        """Apply linear+ReLU for the four hidden layers, then the final linear layer.

        Args:
            x: Tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor whose last dimension equals ``output_dim``.
        """
        *hidden, head = self._layers()
        out = x
        for layer in hidden:
            out = self.relu(layer(out))
        return head(out)


# Build the network with its default sizes and move its parameters to the selected device.
model = NN().to(device)

# **손실 함수, optimizer 설정**

In [122]:
# Mean-squared-error objective, minimised with Adam over all model parameters.
loss = nn.MSELoss().to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# **모델 학습**

In [123]:
# Number of mini-batches per epoch; divides each batch loss so the sum is the epoch mean.
total_batch = len(train_loader)
model.train()

for epoch in range(training_epochs):
    avg_cost = 0

    for x_train_batch, y_train_batch in train_loader:

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        x_train_batch = x_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)

        hypothesis = model(x_train_batch)
        cost = loss(hypothesis, y_train_batch)

        cost.backward()
        optimizer.step()

        # Accumulate a detached copy of the loss: the running average is only
        # reported, so it must not carry autograd history along with it.
        avg_cost += cost.detach() / total_batch

    # Report the mean batch loss once every 100 epochs.
    if epoch%100==0:
        print('Epoch: {:04d} // cost:{:.9f}'.format(epoch+1,avg_cost))

print('Learning finished')

Epoch: 0001 // cost:107563624.000000000
Epoch: 0101 // cost:89925576.000000000
Epoch: 0201 // cost:86559552.000000000
Epoch: 0301 // cost:74555072.000000000
Epoch: 0401 // cost:41191352.000000000
Epoch: 0501 // cost:15159644.000000000
Epoch: 0601 // cost:4042660.750000000
Epoch: 0701 // cost:925606.500000000
Epoch: 0801 // cost:498207.625000000
Epoch: 0901 // cost:326569.781250000
Epoch: 1001 // cost:237108.656250000
Epoch: 1101 // cost:185250.734375000
Epoch: 1201 // cost:152912.093750000
Epoch: 1301 // cost:129111.070312500
Epoch: 1401 // cost:113019.242187500
Epoch: 1501 // cost:102895.726562500
Epoch: 1601 // cost:96606.546875000
Epoch: 1701 // cost:89161.804687500
Epoch: 1801 // cost:83745.273437500
Epoch: 1901 // cost:79427.414062500
Epoch: 2001 // cost:76697.109375000
Epoch: 2101 // cost:73736.039062500
Epoch: 2201 // cost:70906.320312500
Epoch: 2301 // cost:67604.687500000
Epoch: 2401 // cost:65227.535156250
Epoch: 2501 // cost:63144.933593750
Epoch: 2601 // cost:61222.75390625

# **평가용 데이터 생성**

In [124]:
# Put the model in evaluation mode, then run the test features through it
# with gradient tracking switched off.
model.eval()
with torch.no_grad():
    prediction = model(x_test)

# **csv 파일로 데이터 출력**

In [125]:
# Convert the predictions to a NumPy array on the CPU.
prediction = prediction.cpu().numpy()

# Load the sample submission, fill in the predictions, and save the result
# without the DataFrame index.
submit = pd.read_csv("/kaggle/input/2022-ai-tp-17010420/sample_submit.csv")
submit['predict'] = prediction
submit.to_csv('/kaggle/working/submission.csv', index=False)