In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2021-ai-19011461-waterquality/train.csv
/kaggle/input/2021-ai-19011461-waterquality/test.csv
/kaggle/input/2021-ai-19011461-waterquality/sample_submit.csv


In [2]:
# randomness 최소화

import torch
import random

# Single seed shared by every random number generator used in this notebook.
seed = 777

# Seed Python's `random`, NumPy, and PyTorch (CPU, then all CUDA devices),
# in that order.
for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seeder(seed)

# Turn off cuDNN autotuning and request deterministic algorithms.
# Original author's note: this raises a runtime error when running on GPU,
# so running on CPU is recommended.
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

In [3]:
# Load the competition files: training data, test data, and the sample
# submission that is later filled with predictions.
train = pd.read_csv("/kaggle/input/2021-ai-19011461-waterquality/train.csv")
test = pd.read_csv("/kaggle/input/2021-ai-19011461-waterquality/test.csv")
submit = pd.read_csv("/kaggle/input/2021-ai-19011461-waterquality/sample_submit.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
train.info()
test.info()

# Summary statistics of the numeric training columns (shown as the cell output).
train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   생태구명          367 non-null    object 
 1   정점명           367 non-null    object 
 2   하수처리장         367 non-null    object 
 3   관측년도          367 non-null    int64  
 4   관측월           367 non-null    int64  
 5   일자            367 non-null    object 
 6   하수처리량(㎥/일)    263 non-null    object 
 7   BOD           367 non-null    float64
 8   COD           361 non-null    float64
 9   SS            367 non-null    float64
 10  T-N           367 non-null    float64
 11  T-P           367 non-null    float64
 12  날씨            367 non-null    object 
 13  수심            367 non-null    float64
 14  투명도(m)        367 non-null    float64
 15  수온(℃)표층       367 non-null    float64
 16  클로로필A(㎍/L)표층  367 non-null    float64
dtypes: float64(9), int64(2), object(6)
memory usage: 48.9+ KB
None
<class 'pa

Unnamed: 0,관측년도,관측월,BOD,COD,SS,T-N,T-P,수심,투명도(m),수온(℃)표층,클로로필A(㎍/L)표층
count,367.0,367.0,367.0,361.0,367.0,367.0,367.0,367.0,367.0,367.0,367.0
mean,2017.168937,6.152589,4.579837,10.543019,3.801362,9.991411,0.670717,19.215531,5.194278,17.647711,2.533815
std,2.669155,3.307325,5.581329,4.510968,5.13154,4.836379,0.47539,10.718344,3.513758,4.981578,3.081409
min,2011.0,2.0,0.5,0.1,0.2,1.817,0.016,2.5,0.2,6.57,0.01
25%,2015.0,2.0,2.15,7.9,1.7,6.7775,0.349,9.0,2.5,14.485,0.64
50%,2018.0,5.0,3.5,9.7,2.6,9.177,0.604,19.0,4.0,16.69,1.45
75%,2019.0,8.0,5.0,12.5,4.2,11.917,0.882,26.0,7.2,20.525,3.18
max,2021.0,11.0,51.6,37.2,46.0,39.46,4.155,53.0,20.0,29.35,19.2


# 1. 전처리

In [4]:
from sklearn.preprocessing import LabelEncoder

# Columns left out of the feature set: the date string and the sewage
# treatment volume (the latter has only 263 of 367 non-null training values).
dropped_cols = ['일자', '하수처리량(㎥/일)']
X_train = train.drop(columns=dropped_cols)
X_test = test.drop(columns=dropped_cols)

# Integer-encode each categorical column. The encoder is re-fitted on the
# training values of every column and that same mapping is applied to the test
# column, so a test label never seen in training raises an error here.
le = LabelEncoder()
cols = ['생태구명', '정점명', '하수처리장', '날씨']
for col in cols:
    le.fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [5]:
# Express the observation year as an offset from 2011, the earliest year in
# the training data (see the describe() output above).
# NOTE(review): the original comment suggests 2011 is also the test minimum — confirm.
X_train['관측년도'] = X_train['관측년도'] - 2011
X_test['관측년도'] = X_test['관측년도'] - 2011

# Convert the observation month (1-12) into a zero-based quarter (0-3).
# The previous `(month / 3).astype('int')` put March in quarter 1 and December
# in a fifth bucket (4); `(month - 1) // 3` maps 1-3 -> 0, 4-6 -> 1, 7-9 -> 2,
# 10-12 -> 3. For months 2, 5, 8 and 11 both formulas give the same value.
X_train['관측월'] = (X_train['관측월'] - 1) // 3
X_test['관측월'] = (X_test['관측월'] - 1) // 3

# Fill missing COD values with the median of the training set; the same
# training median is reused for the test set so no test statistics are used.
median = X_train['COD'].median()
X_train['COD'] = X_train['COD'].fillna(median)
X_test['COD'] = X_test['COD'].fillna(median)

In [6]:
# Pairwise correlation matrix of the preprocessed training columns
# (the target column is still part of X_train at this point).
X_train.corr()

Unnamed: 0,생태구명,정점명,하수처리장,관측년도,관측월,BOD,COD,SS,T-N,T-P,날씨,수심,투명도(m),수온(℃)표층,클로로필A(㎍/L)표층
생태구명,1.0,0.751753,0.032212,0.38283,-0.050762,0.429367,0.281388,0.367424,0.170138,0.460426,-0.072444,0.428017,0.808774,0.154119,-0.326672
정점명,0.751753,1.0,0.049525,0.396605,-0.042615,0.387784,0.224199,0.315569,0.240589,0.461086,-0.04838,0.062555,0.554203,0.119125,-0.11522
하수처리장,0.032212,0.049525,1.0,-0.033795,0.01132,0.064223,-0.027324,0.039191,0.037633,-0.086122,-0.018798,0.094997,0.041787,0.053304,-0.047149
관측년도,0.38283,0.396605,-0.033795,1.0,-0.103827,0.170996,-0.009834,0.130004,-0.13762,0.056274,-0.10941,-0.003759,0.283006,0.070492,-0.016047
관측월,-0.050762,-0.042615,0.01132,-0.103827,1.0,0.001795,-0.077628,0.028354,-0.138062,0.039876,-0.080346,0.012211,0.021165,0.616438,0.106321
BOD,0.429367,0.387784,0.064223,0.170996,0.001795,1.0,0.626918,0.920178,0.590959,0.579671,-0.057705,0.371656,0.312903,0.091985,-0.1159
COD,0.281388,0.224199,-0.027324,-0.009834,-0.077628,0.626918,1.0,0.602654,0.575999,0.483635,-0.082421,0.340906,0.200113,-0.045999,-0.140118
SS,0.367424,0.315569,0.039191,0.130004,0.028354,0.920178,0.602654,1.0,0.612668,0.600955,-0.061062,0.363491,0.277527,0.095008,-0.125612
T-N,0.170138,0.240589,0.037633,-0.13762,-0.138062,0.590959,0.575999,0.612668,1.0,0.477168,0.025344,0.237411,0.154516,-0.137726,-0.082557
T-P,0.460426,0.461086,-0.086122,0.056274,0.039876,0.579671,0.483635,0.600955,0.477168,1.0,-0.095595,0.247347,0.408214,0.1173,-0.156584


In [7]:
# Boolean mask marking column pairs whose absolute correlation exceeds 0.8.
corr = X_train.corr().abs() > 0.8

# For every column, print the *other* columns it is strongly correlated with.
# The column itself is filtered out by name instead of list.remove(), which
# raised ValueError whenever a column was not flagged against itself (e.g. a
# constant column, whose correlation is NaN and therefore never > 0.8).
for c in corr.columns:
    partners = [other for other in corr.index[corr[c]] if other != c]
    if partners:
        print(c, ":", partners)

생태구명 : ['투명도(m)']
BOD : ['SS']
SS : ['BOD']
투명도(m) : ['생태구명']


In [8]:
# Regression target: the chlorophyll-A column.
target_col = '클로로필A(㎍/L)표층'
# One column from each strongly correlated pair reported above
# (생태구명 / 투명도(m) and BOD / SS) is removed from the features.
redundant_cols = ['생태구명', 'BOD']

y_train = X_train[target_col]
X_train = X_train.drop(columns=redundant_cols + [target_col])
X_test = X_test.drop(columns=redundant_cols)

# 2. 학습

In [9]:
# Convert the feature frames to float32 tensors.
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
# The target becomes a column vector with one value per row, matching the
# single-output shape produced by the model.
y_train_tensor = torch.tensor(y_train.to_numpy().reshape(-1, 1), dtype=torch.float32)

In [10]:
import torch.nn as nn
# Fully connected regressor: input features -> 64 -> 128 -> 32 -> 1.
layer_sizes = [X_train_tensor.shape[1], 64, 128, 32, 1]

# All four layers are constructed first and Xavier-initialised afterwards, so
# the random number generator is consumed in the same order as before.
linear1, linear2, linear3, linear4 = (
    nn.Linear(n_in, n_out)
    for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
)
for layer in (linear1, linear2, linear3, linear4):
    nn.init.xavier_normal_(layer.weight)

# A single ReLU and a single Dropout(p=0.2) instance are reused after each
# hidden layer.
relu = nn.ReLU()
dropout = nn.Dropout(p=0.2)

model = nn.Sequential(
    linear1, relu, dropout,
    linear2, relu, dropout,
    linear3, relu, dropout,
    linear4,
)

In [11]:
from sklearn.metrics import mean_squared_error

# Training hyper-parameters.
lr = 1e-3
nb_epoch = 50000

loss = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Full-batch training: every step uses the whole training set.
# Training mode (dropout active) is set once, since nothing in the loop
# switches the model back to eval mode.
model.train()
for epoch in range(nb_epoch + 1):
    optimizer.zero_grad()

    h = model(X_train_tensor)
    # Optimise RMSE, i.e. the square root of the MSE loss.
    cost = loss(h, y_train_tensor).sqrt()

    cost.backward()
    optimizer.step()

    # Report the training RMSE every 1000 epochs.
    if epoch % 1000 == 0:
        print(epoch, cost.item())

0 9.853919982910156
1000 1.3295071125030518
2000 1.1388658285140991
3000 0.8767989873886108
4000 1.046666145324707
5000 0.9340775609016418
6000 1.0346555709838867
7000 0.752047598361969
8000 0.6520624756813049
9000 0.6721199154853821
10000 0.7197918891906738
11000 0.6661667823791504
12000 0.7602233290672302
13000 0.7332357168197632
14000 0.7119746804237366
15000 0.7590606212615967
16000 0.7058695554733276
17000 0.6817116737365723
18000 0.6807361245155334
19000 0.6559056043624878
20000 0.6330633163452148
21000 0.6504561305046082
22000 0.7284120917320251
23000 0.7129431962966919
24000 0.5718825459480286
25000 0.7634554505348206
26000 0.7035453915596008
27000 0.6679088473320007
28000 0.6778227686882019
29000 0.630768358707428
30000 0.6176404356956482
31000 0.6759008765220642
32000 0.6729182600975037
33000 0.5922050476074219
34000 0.6257112622261047
35000 0.6186348795890808
36000 0.7595081925392151
37000 0.6555192470550537
38000 0.6760302782058716
39000 0.5706056952476501
40000 0.551273405

In [12]:
# Switch to evaluation mode (disables dropout) and predict without tracking
# gradients.
model.eval()
with torch.no_grad():
    # ReLU maps negative predictions to zero so every prediction is non-negative.
    y_pred = torch.relu(model(X_test_tensor))

In [13]:
# Copy the predictions into the sample-submission frame and write it to disk.
chlorophyll_pred = y_pred.detach().numpy()
submit['클로로필A(㎍/L)표층'] = chlorophyll_pred
submit.to_csv("submission.csv", index = False, mode = 'w')

# Display the filled-in submission as the cell output.
submit

Unnamed: 0,id,클로로필A(㎍/L)표층
0,0,1.071661
1,1,1.742480
2,2,3.240564
3,3,2.560932
4,4,1.222649
...,...,...
118,118,0.487870
119,119,0.342879
120,120,0.516010
121,121,0.974960
