# Flood Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
pd.set_option('mode.chained_assignment',  None) # warning 제거

### Data load

In [5]:
datas = []

for rep in range(2012, 2023):
    data = pd.read_csv('./data/water_data/data_' + str(rep) + '.csv')
    datas.append(data)

In [298]:
for data in datas:
    print(data.shape) # 2012 ~ 2021: (26496, 15), 2022: (11376, 15)

(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(11376, 15)


In [299]:
for data in datas:
    print(data.duplicated().sum()) # 중복 데이터 x

0
0
0
0
0
0
0
0
0
0
0


In [300]:
# 모든 년도 data의 column corr 추출

data_all = pd.concat([data for data in datas], axis=0)

data_corr = data_all.corr()

data_corr['wl_1018662'].sort_values(ascending=False)

wl_1018662    1.000000
wl_1018680    0.993364
wl_1018683    0.992271
wl_1019630    0.958617
fw_1019630    0.752340
fw_1018662    0.741266
tototf        0.709398
inf           0.699971
fw_1018683    0.655167
ecpc          0.063409
tide_level   -0.002609
sfw          -0.063405
swl          -0.066725
fw_1018680         NaN
Name: wl_1018662, dtype: float64

In [301]:
data_all.fw_1018680.value_counts() # fw_1018680: 0 or nan

0.0    79487
Name: fw_1018680, dtype: int64

In [302]:
data_all.isna().sum()

ymdhm              0
swl              743
inf              743
sfw              743
ecpc             743
tototf           743
tide_level      4927
wl_1018662        59
fw_1018662     16380
wl_1018680        59
fw_1018680    196849
wl_1018683        59
fw_1018683      1279
wl_1019630        59
fw_1019630        59
dtype: int64

In [303]:
data_all.isin([0]).sum()

ymdhm             0
swl              83
inf           10122
sfw              82
ecpc              0
tototf          447
tide_level        0
wl_1018662     6912
fw_1018662        0
wl_1018680     6912
fw_1018680    79487
wl_1018683     6912
fw_1018683      183
wl_1019630     6912
fw_1019630        0
dtype: int64

In [304]:
# sns.pairplot(data_corr)
# plt.show()

### Data Preprocessing

#### 1. 2012 data

In [305]:
data_2012 = datas[0].copy()

data_2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26495 non-null  float64
 2   inf         26495 non-null  float64
 3   sfw         26495 non-null  float64
 4   ecpc        26495 non-null  float64
 5   tototf      26495 non-null  float64
 6   tide_level  25720 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26496 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  26496 non-null  float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [306]:
data_2012.isna().sum()

ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level    776
wl_1018662      0
fw_1018662      0
wl_1018680      0
fw_1018680      0
wl_1018683      0
fw_1018683      0
wl_1019630      0
fw_1019630      0
dtype: int64

In [307]:
data_2012 = data_2012[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2012.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2012-05-01 00:00,24.8,555.0,219.07,24.93,555.0,445.0,469.05,729.8,540.18,310.7,300.2,290.0,275.3
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.0,731.48,540.18,314.7,300.2,290.0,275.3
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3


In [308]:
data_2012.isna().sum()

ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level    776
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
dtype: int64

In [309]:
# mean 할당 (단, 월별 mean값 부여)

In [8]:
# row 별 month 추출 method

def get_month(data):
    idx = data.iloc[0, 0].split('-')[1]
    months = []

    for idx in range(len(data)):
        idx = data.iloc[idx, 0].split('-')[1]
        
        months.append(int(idx))

    months = np.array(months)
    
    return months

In [311]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2012)

months[:5]

array([5, 5, 5, 5, 5])

In [312]:
# month column 추가

months = months.reshape(-1)

data_2012['month'] = months

data_2012.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.8,555.0,219.07,24.93,555.0,445.0,469.05,729.8,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.0,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5


In [313]:
# data_2012 월별 split

data_2012_m5 = data_2012.groupby('month').get_group(5)
data_2012_m6 = data_2012.groupby('month').get_group(6)
data_2012_m7 = data_2012.groupby('month').get_group(7)
data_2012_m8 = data_2012.groupby('month').get_group(8)
data_2012_m9 = data_2012.groupby('month').get_group(9)
data_2012_m10 = data_2012.groupby('month').get_group(10)

In [9]:
# 월별 nan값 찾는 method

def find_nan(datas):
    month = 5
    
    for data in datas:
        print(str(month) + '월')
        print(data.isna().sum())
        print()
        
        month += 1

In [315]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10])

5월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    702
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level    67
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

7월
ymdhm         0
swl           1
inf           1
sfw           1
ecpc          1
tototf        1
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_10

In [316]:
# nan 값 mean으로 변경 (5월)

tide_level_mean = data_2012_m5.tide_level.mean()

data_2012_m5.tide_level[data_2012_m5.tide_level.isna()] = tide_level_mean

In [317]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level    67
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

7월
ymdhm         0
swl           1
inf           1
sfw           1
ecpc          1
tototf        1
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_

In [318]:
# nan 값 mean으로 변경 (6월)

tide_level_mean = data_2012_m6.tide_level.mean()

data_2012_m6.tide_level[data_2012_m6.tide_level.isna()] = tide_level_mean

In [319]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           1
inf           1
sfw           1
ecpc          1
tototf        1
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [320]:
# nan 값 mean으로 변경(7월)

swl_mean = data_2012_m7.swl.mean()
inf_mean = data_2012_m7.inf.mean()
sfw_mean = data_2012_m7.sfw.mean()
ecpc_mean = data_2012_m7.ecpc.mean()
tototf_mean = data_2012_m7.tototf.mean()
tide_level_mean = data_2012_m7.tide_level.mean()

data_2012_m7.swl[data_2012_m7.swl.isna()] = swl_mean
data_2012_m7.inf[data_2012_m7.inf.isna()] = inf_mean
data_2012_m7.sfw[data_2012_m7.sfw.isna()] = sfw_mean
data_2012_m7.ecpc[data_2012_m7.ecpc.isna()] = ecpc_mean
data_2012_m7.tototf[data_2012_m7.tototf.isna()] = tototf_mean
data_2012_m7.tide_level[data_2012_m7.tide_level.isna()] = tide_level_mean

In [321]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [322]:
# nan 값 mean으로 변경(8월)

tide_level_mean = data_2012_m8.tide_level.mean()

data_2012_m8.tide_level[data_2012_m8.tide_level.isna()] = tide_level_mean

In [323]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [324]:
# nan 값 mean으로 변경(9월)

tide_level_mean = data_2012_m9.tide_level.mean()

data_2012_m9.tide_level[data_2012_m9.tide_level.isna()] = tide_level_mean

In [325]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [326]:
# dataset 생성

dataset = pd.concat([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2012-10-31 23:10,25.250,270.2,235.01,8.99,270.2,300.0,1018.67,1395.30,1837.11,372.7,364.2,365.0,351.3,10
26492,2012-10-31 23:20,25.250,264.1,235.01,8.99,264.1,286.0,945.75,1154.21,1814.92,365.7,359.2,362.0,350.3,10
26493,2012-10-31 23:30,25.250,257.9,235.01,8.99,257.9,273.0,915.35,968.58,1792.88,362.7,356.2,359.0,349.3,10
26494,2012-10-31 23:40,25.250,264.1,235.01,8.99,264.1,261.0,846.39,776.99,1727.56,355.7,352.2,356.0,346.3,10


#### 2. 2013 data

In [327]:
data_2013 = datas[1].copy()

data_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26496 non-null  float64
 2   inf         26496 non-null  float64
 3   sfw         26496 non-null  float64
 4   ecpc        26496 non-null  float64
 5   tototf      26496 non-null  float64
 6   tide_level  26481 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26496 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  26496 non-null  float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [328]:
data_2013 = data_2013[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2013.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2013-05-01 00:00,25.29,151.0,236.48,7.53,151.0,384.0,639.32,-993.41,958.36,332.7,317.2,314.0,305.3
1,2013-05-01 00:10,25.289,173.8,236.45,7.56,173.8,369.0,690.47,-775.22,911.07,338.7,327.2,316.0,302.3
2,2013-05-01 00:20,25.289,173.8,236.45,7.56,173.8,353.0,734.64,-380.85,834.96,343.7,334.2,324.0,297.3
3,2013-05-01 00:30,25.288,173.8,236.41,7.59,173.8,338.0,770.98,-52.67,762.23,347.7,337.2,326.0,292.3
4,2013-05-01 00:40,25.278,73.9,236.04,7.96,173.9,322.0,789.5,237.34,706.48,349.7,338.2,327.0,288.3


In [329]:
data_2013.isna().sum()

ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level    15
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
dtype: int64

In [330]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2013)

months[:5]

array([5, 5, 5, 5, 5])

In [331]:
# month column 추가

months = months.reshape(-1)

data_2013['month'] = months

data_2013.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2013-05-01 00:00,25.29,151.0,236.48,7.53,151.0,384.0,639.32,-993.41,958.36,332.7,317.2,314.0,305.3,5
1,2013-05-01 00:10,25.289,173.8,236.45,7.56,173.8,369.0,690.47,-775.22,911.07,338.7,327.2,316.0,302.3,5
2,2013-05-01 00:20,25.289,173.8,236.45,7.56,173.8,353.0,734.64,-380.85,834.96,343.7,334.2,324.0,297.3,5
3,2013-05-01 00:30,25.288,173.8,236.41,7.59,173.8,338.0,770.98,-52.67,762.23,347.7,337.2,326.0,292.3,5
4,2013-05-01 00:40,25.278,73.9,236.04,7.96,173.9,322.0,789.5,237.34,706.48,349.7,338.2,327.0,288.3,5


In [332]:
# data_2013 월별 split

data_2013_m5 = data_2013.groupby('month').get_group(5)
data_2013_m6 = data_2013.groupby('month').get_group(6)
data_2013_m7 = data_2013.groupby('month').get_group(7)
data_2013_m8 = data_2013.groupby('month').get_group(8)
data_2013_m9 = data_2013.groupby('month').get_group(9)
data_2013_m10 = data_2013.groupby('month').get_group(10)

In [333]:
find_nan([data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    3
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    3
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    2
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [334]:
# nan 값 mean으로 변경(5월)

tide_level_mean = data_2013_m5.tide_level.mean()

data_2013_m5.tide_level[data_2013_m5.tide_level.isna()] = tide_level_mean

In [335]:
find_nan([data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    3
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    2
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [336]:
# nan 값 mean으로 변경(6월)

tide_level_mean = data_2013_m6.tide_level.mean()

data_2013_m6.tide_level[data_2013_m6.tide_level.isna()] = tide_level_mean

In [337]:
find_nan([data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    2
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [338]:
# nan 값 mean으로 변경(7월)

tide_level_mean = data_2013_m7.tide_level.mean()

data_2013_m7.tide_level[data_2013_m7.tide_level.isna()] = tide_level_mean

In [339]:
find_nan([data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [340]:
# nan 값 mean으로 변경(9월)

tide_level_mean = data_2013_m9.tide_level.mean()

data_2013_m9.tide_level[data_2013_m9.tide_level.isna()] = tide_level_mean

In [341]:
find_nan([data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [342]:
# nan 값 mean으로 변경(10월)

tide_level_mean = data_2013_m10.tide_level.mean()

data_2013_m10.tide_level[data_2013_m10.tide_level.isna()] = tide_level_mean

In [343]:
find_nan([data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10]) # 변경 확인

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [344]:
# dataset 생성

dataset = pd.concat([dataset, data_2013_m5, data_2013_m6, data_2013_m7, data_2013_m8, data_2013_m9, data_2013_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2013-10-31 23:10,25.231,171.9,234.40,9.60,171.9,109.0,280.22,372.96,366.60,280.7,274.2,268.0,259.3,10
26492,2013-10-31 23:20,25.224,71.0,234.10,9.90,171.0,124.0,280.22,348.63,366.60,280.7,274.2,268.0,259.3,10
26493,2013-10-31 23:30,25.219,70.6,233.91,10.09,170.6,141.0,280.22,315.76,376.45,280.7,273.2,267.0,260.3,10
26494,2013-10-31 23:40,25.219,70.2,233.91,10.09,170.2,157.0,280.22,263.13,376.45,280.7,273.2,267.0,260.3,10


#### 3. 2014 data

In [345]:
data_2014 = datas[2].copy()

data_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26476 non-null  float64
 2   inf         26476 non-null  float64
 3   sfw         26476 non-null  float64
 4   ecpc        26476 non-null  float64
 5   tototf      26476 non-null  float64
 6   tide_level  26492 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26496 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  26495 non-null  float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [346]:
data_2014 = data_2014[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2014.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2014-05-01 00:00,25.28,132.0,236.11,7.89,132.0,196.0,582.21,350.33,1056.61,325.7,316.2,315.0,311.3
1,2014-05-01 00:10,25.289,126.3,236.45,7.56,126.3,185.0,590.2,249.84,1006.88,326.7,315.2,312.0,308.3
2,2014-05-01 00:20,25.289,126.3,236.45,7.56,126.3,175.0,590.2,206.48,958.36,326.7,315.2,311.0,305.3
3,2014-05-01 00:30,25.289,132.2,236.45,7.56,132.2,165.0,598.25,249.5,911.07,327.7,315.2,311.0,302.3
4,2014-05-01 00:40,25.289,126.2,236.45,7.56,126.2,155.0,598.25,305.01,849.91,327.7,314.2,310.0,298.3


In [347]:
data_2014.isna().sum()

ymdhm          0
swl           20
inf           20
sfw           20
ecpc          20
tototf        20
tide_level     4
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
dtype: int64

In [348]:
# mean 할당 (단, 월별 mean값 부여)

In [349]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2014)

months[:5]

array([5, 5, 5, 5, 5])

In [350]:
# month column 추가

months = months.reshape(-1)

data_2014['month'] = months

data_2014.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2014-05-01 00:00,25.28,132.0,236.11,7.89,132.0,196.0,582.21,350.33,1056.61,325.7,316.2,315.0,311.3,5
1,2014-05-01 00:10,25.289,126.3,236.45,7.56,126.3,185.0,590.2,249.84,1006.88,326.7,315.2,312.0,308.3,5
2,2014-05-01 00:20,25.289,126.3,236.45,7.56,126.3,175.0,590.2,206.48,958.36,326.7,315.2,311.0,305.3,5
3,2014-05-01 00:30,25.289,132.2,236.45,7.56,132.2,165.0,598.25,249.5,911.07,327.7,315.2,311.0,302.3,5
4,2014-05-01 00:40,25.289,126.2,236.45,7.56,126.2,155.0,598.25,305.01,849.91,327.7,314.2,310.0,298.3,5


In [351]:
# data_2014 월별 split

data_2014_m5 = data_2014.groupby('month').get_group(5)
data_2014_m6 = data_2014.groupby('month').get_group(6)
data_2014_m7 = data_2014.groupby('month').get_group(7)
data_2014_m8 = data_2014.groupby('month').get_group(8)
data_2014_m9 = data_2014.groupby('month').get_group(9)
data_2014_m10 = data_2014.groupby('month').get_group(10)

In [352]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    2
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm          0
swl           20
inf           20
sfw           20
ecpc          20
tototf        20
tide_level     0
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_10

In [353]:
# nan 값 mean으로 변경 (5월)

tide_level_mean = data_2014_m5.tide_level.mean()

data_2014_m5.tide_level[data_2014_m5.tide_level.isna()] = tide_level_mean

In [354]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10]) # 변경 확인

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm          0
swl           20
inf           20
sfw           20
ecpc          20
tototf        20
tide_level     0
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_10

In [355]:
# nan 값 mean으로 변경 (7월)

tide_level_mean = data_2014_m7.tide_level.mean()

data_2014_m7.tide_level[data_2014_m7.tide_level.isna()] = tide_level_mean

In [356]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10]) # 변경 확인

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm          0
swl           20
inf           20
sfw           20
ecpc          20
tototf        20
tide_level     0
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_10

In [357]:
# nan 값 mean으로 변경 (8월)

swl_mean = data_2014_m8.swl.mean()
inf_mean = data_2014_m8.inf.mean()
sfw_mean = data_2014_m8.sfw.mean()
ecpc_mean = data_2014_m8.ecpc.mean()
tototf_mean = data_2014_m8.tototf.mean()

data_2014_m8.swl[data_2014_m8.swl.isna()] = swl_mean
data_2014_m8.inf[data_2014_m8.inf.isna()] = inf_mean
data_2014_m8.sfw[data_2014_m8.sfw.isna()] = sfw_mean
data_2014_m8.ecpc[data_2014_m8.ecpc.isna()] = ecpc_mean
data_2014_m8.tototf[data_2014_m8.tototf.isna()] = tototf_mean

In [358]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10]) # 변경 확인

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [359]:
# nan 값 mean으로 변경 (9월)

tide_level_mean = data_2014_m9.tide_level.mean()

data_2014_m9.tide_level[data_2014_m9.tide_level.isna()] = tide_level_mean

In [360]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10]) # 변경 확인

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [361]:
# dataset 생성

dataset = pd.concat([dataset, data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2014-10-31 23:10,25.126,145.4,230.56,13.44,145.4,556.0,319.84,257.19,471.08,287.7,275.2,273.0,269.3,10
26492,2014-10-31 23:20,25.127,145.3,230.58,13.42,145.3,557.0,319.84,256.83,471.08,287.7,275.2,272.0,269.3,10
26493,2014-10-31 23:30,25.131,145.2,230.75,13.25,145.2,554.0,319.84,252.76,460.03,287.7,275.2,272.0,268.3,10
26494,2014-10-31 23:40,25.140,245.1,231.05,12.95,145.1,550.0,319.84,252.76,460.03,287.7,275.2,272.0,268.3,10


#### 4. 2015 data

In [362]:
data_2015 = datas[3].copy()

data_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26483 non-null  float64
 2   inf         26483 non-null  float64
 3   sfw         26483 non-null  float64
 4   ecpc        26483 non-null  float64
 5   tototf      26483 non-null  float64
 6   tide_level  26483 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  22709 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26093 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [363]:
data_2015 = data_2015[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2015.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2015-05-01 00:00,25.19,47.0,232.86,11.14,147.0,124.0,302.53,227.02,427.69,284.7,268.2,268.0,265.3
1,2015-05-01 00:10,25.199,151.2,233.19,10.81,151.2,132.0,302.53,267.56,427.69,284.7,268.2,268.0,265.3
2,2015-05-01 00:20,25.199,151.1,233.19,10.81,151.1,140.0,308.24,320.26,427.69,285.7,268.2,268.0,265.3
3,2015-05-01 00:30,25.19,51.2,232.86,11.14,151.2,150.0,314.01,271.61,427.69,286.7,268.2,268.0,265.3
4,2015-05-01 00:40,25.19,51.2,232.86,11.14,151.2,164.0,314.01,312.15,417.17,286.7,268.2,268.0,264.3


In [364]:
data_2015.isna().sum()

ymdhm            0
swl             13
inf             13
sfw             13
ecpc            13
tototf          13
tide_level      13
fw_1018662    3787
fw_1018683     403
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [365]:
# mean 할당 (단, 월별 mean값 부여)

In [366]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2015)

months[:5]

array([5, 5, 5, 5, 5])

In [367]:
# month column 추가

months = months.reshape(-1)

data_2015['month'] = months

data_2015.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2015-05-01 00:00,25.19,47.0,232.86,11.14,147.0,124.0,302.53,227.02,427.69,284.7,268.2,268.0,265.3,5
1,2015-05-01 00:10,25.199,151.2,233.19,10.81,151.2,132.0,302.53,267.56,427.69,284.7,268.2,268.0,265.3,5
2,2015-05-01 00:20,25.199,151.1,233.19,10.81,151.1,140.0,308.24,320.26,427.69,285.7,268.2,268.0,265.3,5
3,2015-05-01 00:30,25.19,51.2,232.86,11.14,151.2,150.0,314.01,271.61,427.69,286.7,268.2,268.0,265.3,5
4,2015-05-01 00:40,25.19,51.2,232.86,11.14,151.2,164.0,314.01,312.15,417.17,286.7,268.2,268.0,264.3,5


In [368]:
# data_2015 월별 split

data_2015_m5 = data_2015.groupby('month').get_group(5)
data_2015_m6 = data_2015.groupby('month').get_group(6)
data_2015_m7 = data_2015.groupby('month').get_group(7)
data_2015_m8 = data_2015.groupby('month').get_group(8)
data_2015_m9 = data_2015.groupby('month').get_group(9)
data_2015_m10 = data_2015.groupby('month').get_group(10)

In [369]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm           0
swl             9
inf             9
sfw             9
ecpc            9
tototf          9
tide_level      2
fw_1018662    537
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
swl             3
inf             3
sfw             3
ecpc            3
tototf          3
tide_level      0
fw_1018662    743
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      0
fw_1018662     54
fw_1018683    188
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      8
fw_1018662

In [370]:
# nan 값 mean으로 변경 (5월)

swl_mean = data_2015_m5.swl.mean()
inf_mean = data_2015_m5.inf.mean()
sfw_mean = data_2015_m5.sfw.mean()
ecpc_mean = data_2015_m5.ecpc.mean()
tototf_mean = data_2015_m5.tototf.mean()
tide_level_mean = data_2015_m5.tide_level.mean()

data_2015_m5.swl[data_2015_m5.swl.isna()] = swl_mean
data_2015_m5.inf[data_2015_m5.inf.isna()] = inf_mean
data_2015_m5.sfw[data_2015_m5.sfw.isna()] = sfw_mean
data_2015_m5.ecpc[data_2015_m5.ecpc.isna()] = ecpc_mean
data_2015_m5.tototf[data_2015_m5.tototf.isna()] = tototf_mean
data_2015_m5.tide_level[data_2015_m5.tide_level.isna()] = tide_level_mean

In [371]:
# nan 값 mean으로 변경 (5월)

fw62_mean = data_2015_m5.fw_1018662.mean()

data_2015_m5.fw_1018662[data_2015_m5.fw_1018662.isna()] = fw62_mean

In [372]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
swl             3
inf             3
sfw             3
ecpc            3
tototf          3
tide_level      0
fw_1018662    743
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      0
fw_1018662     54
fw_1018683    188
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      8
fw_1018662    487
fw_1018683    178
fw_1

In [373]:
# nan 값 mean으로 변경 (6월)

swl_mean = data_2015_m6.swl.mean()
inf_mean = data_2015_m6.inf.mean()
sfw_mean = data_2015_m6.sfw.mean()
ecpc_mean = data_2015_m6.ecpc.mean()
tototf_mean = data_2015_m6.tototf.mean()

data_2015_m6.swl[data_2015_m6.swl.isna()] = swl_mean
data_2015_m6.inf[data_2015_m6.inf.isna()] = inf_mean
data_2015_m6.sfw[data_2015_m6.sfw.isna()] = sfw_mean
data_2015_m6.ecpc[data_2015_m6.ecpc.isna()] = ecpc_mean
data_2015_m6.tototf[data_2015_m6.tototf.isna()] = tototf_mean

In [374]:
# nan 값 mean으로 변경 (6월)

fw62_mean = data_2015_m6.fw_1018662.mean()

data_2015_m6.fw_1018662[data_2015_m6.fw_1018662.isna()] = fw62_mean

In [375]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      0
fw_1018662     54
fw_1018683    188
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      8
fw_1018662    487
fw_1018683    178
fw_1019630      0
wl_1018662      

In [376]:
# nan 값 mean으로 변경 (7월)

fw62_mean = data_2015_m7.fw_1018662.mean()
fw83_mean = data_2015_m7.fw_1018683.mean()

data_2015_m7.fw_1018662[data_2015_m7.fw_1018662.isna()] = fw62_mean
data_2015_m7.fw_1018683[data_2015_m7.fw_1018683.isna()] = fw83_mean

In [377]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      8
fw_1018662    487
fw_1018683    178
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683

In [378]:
# nan 값 mean으로 변경 (8월)

tide_level_mean = data_2015_m8.tide_level.mean()

fw62_mean = data_2015_m8.fw_1018662.mean()
fw83_mean = data_2015_m8.fw_1018683.mean()

data_2015_m8.tide_level[data_2015_m8.tide_level.isna()] = tide_level_mean

data_2015_m8.fw_1018662[data_2015_m8.fw_1018662.isna()] = fw62_mean
data_2015_m8.fw_1018683[data_2015_m8.fw_1018683.isna()] = fw83_mean

In [379]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [380]:
# nan 값 mean으로 변경 (9월)

swl_mean = data_2015_m9.swl.mean()
inf_mean = data_2015_m9.inf.mean()
sfw_mean = data_2015_m9.sfw.mean()
ecpc_mean = data_2015_m9.ecpc.mean()
tototf_mean = data_2015_m9.tototf.mean()
tide_level_mean = data_2015_m9.tide_level.mean()

data_2015_m9.swl[data_2015_m9.swl.isna()] = inf_mean
data_2015_m9.inf[data_2015_m9.inf.isna()] = inf_mean
data_2015_m9.sfw[data_2015_m9.sfw.isna()] = inf_mean
data_2015_m9.ecpc[data_2015_m9.ecpc.isna()] = inf_mean
data_2015_m9.tototf[data_2015_m9.tototf.isna()] = tototf_mean
data_2015_m9.tide_level[data_2015_m9.tide_level.isna()] = tide_level_mean

In [381]:
# nan 값 mean으로 변경 (9월)

fw62_mean = data_2015_m9.fw_1018662.mean()
fw83_mean = data_2015_m9.fw_1018683.mean()

data_2015_m9.fw_1018662[data_2015_m9.fw_1018662.isna()] = fw62_mean
data_2015_m9.fw_1018683[data_2015_m9.fw_1018683.isna()] = fw83_mean

In [382]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [383]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2015_m10.fw_1018662.mean()
fw83_mean = data_2015_m10.fw_1018683.mean()

data_2015_m10.fw_1018662[data_2015_m10.fw_1018662.isna()] = fw62_mean
data_2015_m10.fw_1018683[data_2015_m10.fw_1018683.isna()] = fw83_mean

In [384]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [385]:
# dataset 생성

dataset = pd.concat([dataset, data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.00,219.07,24.93,555.00,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.60,218.86,25.15,562.90,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.10,218.69,25.31,576.40,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.80,218.69,25.31,563.10,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.10,218.69,25.31,576.40,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2015-10-31 23:10,25.349,-20.88,224.27,19.73,83.32,475.0,875.60,-1845.96,1579.97,358.7,340.2,336.0,339.3,10
26492,2015-10-31 23:20,25.347,-20.87,224.23,19.78,83.33,457.0,955.99,-1338.61,1579.97,366.7,349.2,346.0,339.3,10
26493,2015-10-31 23:30,25.340,-125.03,223.93,20.07,83.37,438.0,1040.02,-967.58,1539.03,374.7,355.2,352.0,337.3,10
26494,2015-10-31 23:40,25.320,-332.12,223.18,20.82,83.48,420.0,1127.69,-791.01,1458.80,382.7,360.2,357.0,333.3,10


#### 5. 2016 data

In [386]:
data_2016 = datas[4].copy()

data_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26488 non-null  float64
 2   inf         26488 non-null  float64
 3   sfw         26488 non-null  float64
 4   ecpc        26488 non-null  float64
 5   tototf      26488 non-null  float64
 6   tide_level  25840 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  25247 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26122 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [387]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2016 = data_2016[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2016.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2016-05-01 00:00,25.09,451.0,215.42,28.58,654.0,474.0,512.81,677.62,748.09,316.7,304.2,300.0,291.3
1,2016-05-01 00:10,25.095,230.64,214.88,29.12,128.94,467.0,512.81,681.91,762.23,316.7,304.2,300.0,292.3
2,2016-05-01 00:20,25.089,128.98,214.64,29.36,128.98,459.0,505.38,669.04,762.23,315.7,303.2,300.0,292.3
3,2016-05-01 00:30,25.08,27.64,214.33,29.68,129.04,451.0,483.42,514.65,762.23,312.7,303.2,300.0,292.3
4,2016-05-01 00:40,25.083,27.32,214.42,29.58,129.02,441.0,483.42,493.2,762.23,312.7,302.2,300.0,292.3


In [388]:
data_2016.isna().sum()

ymdhm            0
swl              8
inf              8
sfw              8
ecpc             8
tototf           8
tide_level     656
fw_1018662    1249
fw_1018683     374
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [389]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2016)

months[:5]

array([5, 5, 5, 5, 5])

In [390]:
# month column 추가

months = months.reshape(-1)

data_2016['month'] = months

data_2016.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2016-05-01 00:00,25.09,451.0,215.42,28.58,654.0,474.0,512.81,677.62,748.09,316.7,304.2,300.0,291.3,5
1,2016-05-01 00:10,25.095,230.64,214.88,29.12,128.94,467.0,512.81,681.91,762.23,316.7,304.2,300.0,292.3,5
2,2016-05-01 00:20,25.089,128.98,214.64,29.36,128.98,459.0,505.38,669.04,762.23,315.7,303.2,300.0,292.3,5
3,2016-05-01 00:30,25.08,27.64,214.33,29.68,129.04,451.0,483.42,514.65,762.23,312.7,303.2,300.0,292.3,5
4,2016-05-01 00:40,25.083,27.32,214.42,29.58,129.02,441.0,483.42,493.2,762.23,312.7,302.2,300.0,292.3,5


In [391]:
# data_2016 월별 split

data_2016_m5 = data_2016.groupby('month').get_group(5)
data_2016_m6 = data_2016.groupby('month').get_group(6)
data_2016_m7 = data_2016.groupby('month').get_group(7)
data_2016_m8 = data_2016.groupby('month').get_group(8)
data_2016_m9 = data_2016.groupby('month').get_group(9)
data_2016_m10 = data_2016.groupby('month').get_group(10)

In [392]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10])

5월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    180
fw_1018662      0
fw_1018683    101
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    107
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level    43
fw_1018662    97
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             8
inf             8
sfw             8
ecpc            8
tototf          8
tide_level    162
fw_1018662    126
fw_1018

In [393]:
# nan 값 mean으로 변경 (5월)

tide_level_mean = data_2016_m5.tide_level.mean()

fw_mean = data_2016_m5.fw_1018683.mean()

data_2016_m5.tide_level[data_2016_m5.tide_level.isna()] = tide_level_mean

data_2016_m5.fw_1018683[data_2016_m5.fw_1018683.isna()] = fw_mean

In [394]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    107
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level    43
fw_1018662    97
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             8
inf             8
sfw             8
ecpc            8
tototf          8
tide_level    162
fw_1018662    126
fw_1018683    273
fw_1019630      0
w

In [395]:
# nan 값 mean으로 변경 (6월)

tide_level_mean = data_2016_m6.tide_level.mean()

data_2016_m6.tide_level[data_2016_m6.tide_level.isna()] = tide_level_mean

In [396]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level    43
fw_1018662    97
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             8
inf             8
sfw             8
ecpc            8
tototf          8
tide_level    162
fw_1018662    126
fw_1018683    273
fw_1019630      0
wl_1018662      0
wl_1018680   

In [397]:
# nan 값 mean으로 변경 (7월)

tide_level_mean = data_2016_m7.tide_level.mean()

fw_mean = data_2016_m7.fw_1018662.mean()

data_2016_m7.tide_level[data_2016_m7.tide_level.isna()] = tide_level_mean

data_2016_m7.fw_1018662[data_2016_m7.fw_1018662.isna()] = fw_mean

In [398]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 변경 확인

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
swl             8
inf             8
sfw             8
ecpc            8
tototf          8
tide_level    162
fw_1018662    126
fw_1018683    273
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683

In [399]:
# nan 값 mean으로 변경 (8월)

swl_mean = data_2016_m8.swl.mean()
inf_mean = data_2016_m8.inf.mean()
sfw_mean = data_2016_m8.sfw.mean()
ecpc_mean = data_2016_m8.ecpc.mean()
tototf_mean = data_2016_m8.tototf.mean()
tide_level_mean = data_2016_m8.tide_level.mean()

data_2016_m8.swl[data_2016_m8.swl.isna()] = swl_mean
data_2016_m8.inf[data_2016_m8.inf.isna()] = inf_mean
data_2016_m8.sfw[data_2016_m8.sfw.isna()] = sfw_mean
data_2016_m8.ecpc[data_2016_m8.ecpc.isna()] = ecpc_mean
data_2016_m8.tototf[data_2016_m8.tototf.isna()] = tototf_mean
data_2016_m8.tide_level[data_2016_m8.tide_level.isna()] = tide_level_mean

In [400]:
# nan 값 mean으로 변경 (8월)

fw62_mean = data_2016_m8.fw_1018662.mean()
fw83_mean = data_2016_m8.fw_1018683.mean()

data_2016_m8.fw_1018662[data_2016_m8.fw_1018662.isna()] = fw62_mean
data_2016_m8.fw_1018683[data_2016_m8.fw_1018683.isna()] = fw83_mean

In [401]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [402]:
# nan 값 mean으로 변경 (9월)

tide_level_mean = data_2016_m9.tide_level.mean()

fw62_mean = data_2016_m9.fw_1018662.mean()

data_2016_m9.tide_level[data_2016_m9.tide_level.isna()] = tide_level_mean

data_2016_m9.fw_1018662[data_2016_m9.fw_1018662.isna()] = fw62_mean

In [403]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [404]:
# nan 값 mean으로 변경 (10월)

tide_level_mean = data_2016_m10.tide_level.mean()

fw62_mean = data_2016_m10.fw_1018662.mean()

data_2016_m10.tide_level[data_2016_m10.tide_level.isna()] = tide_level_mean

data_2016_m10.fw_1018662[data_2016_m10.fw_1018662.isna()] = fw62_mean

In [405]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [406]:
# dataset 생성

dataset = pd.concat([dataset, data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.00,219.07,24.93,555.00,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.60,218.86,25.15,562.90,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.10,218.69,25.31,576.40,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.80,218.69,25.31,563.10,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.10,218.69,25.31,576.40,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2016-10-31 23:10,25.300,191.43,222.44,21.56,87.53,242.0,690.47,937.43,1159.75,338.7,325.2,326.0,317.3,10
26492,2016-10-31 23:20,25.289,87.58,222.04,21.96,87.58,229.0,631.00,776.45,1142.22,331.7,322.2,325.0,316.3,10
26493,2016-10-31 23:30,25.289,87.58,222.04,21.96,87.58,216.0,598.25,584.09,1142.22,327.7,320.2,323.0,316.3,10
26494,2016-10-31 23:40,25.289,87.58,222.04,21.96,87.58,204.0,582.21,441.31,1124.83,325.7,318.2,321.0,315.3,10


#### 6. 2017 data

In [407]:
data_2017 = datas[5].copy()

data_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26475 non-null  float64
 2   inf         26475 non-null  float64
 3   sfw         26475 non-null  float64
 4   ecpc        26475 non-null  float64
 5   tototf      26475 non-null  float64
 6   tide_level  25861 non-null  float64
 7   wl_1018662  26438 non-null  float64
 8   fw_1018662  24423 non-null  float64
 9   wl_1018680  26438 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26438 non-null  float64
 12  fw_1018683  26438 non-null  float64
 13  wl_1019630  26438 non-null  float64
 14  fw_1019630  26438 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [408]:
data_2017 = data_2017[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2017.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2017-05-01 00:00,25.28,11.0,221.7,22.3,114.0,342.0,622.73,-944.41,805.46,330.7,309.2,304.0,295.3
1,2017-05-01 00:10,25.259,16.6,220.93,23.07,119.9,326.0,647.71,-523.16,748.09,333.7,314.2,312.0,291.3
2,2017-05-01 00:20,25.256,16.61,220.83,23.17,119.91,310.0,673.2,-131.32,692.88,336.7,317.2,315.0,287.3
3,2017-05-01 00:30,25.25,0.0,220.57,23.43,119.95,295.0,681.8,157.8,639.83,337.7,317.2,316.0,283.3
4,2017-05-01 00:40,25.245,0.0,220.4,23.61,119.97,280.0,673.2,403.27,614.11,336.7,317.2,316.0,281.3


In [409]:
data_2017.isna().sum()

ymdhm            0
swl             21
inf             21
sfw             21
ecpc            21
tototf          21
tide_level     635
fw_1018662    2073
fw_1018683      58
fw_1019630      58
wl_1018662      58
wl_1018680      58
wl_1018683      58
wl_1019630      58
dtype: int64

In [410]:
# mean 할당 (단, 월별 mean값 부여)

In [411]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2017)

months[:5]

array([5, 5, 5, 5, 5])

In [412]:
# month column 추가

months = months.reshape(-1)

data_2017['month'] = months

data_2017.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2017-05-01 00:00,25.28,11.0,221.7,22.3,114.0,342.0,622.73,-944.41,805.46,330.7,309.2,304.0,295.3,5
1,2017-05-01 00:10,25.259,16.6,220.93,23.07,119.9,326.0,647.71,-523.16,748.09,333.7,314.2,312.0,291.3,5
2,2017-05-01 00:20,25.256,16.61,220.83,23.17,119.91,310.0,673.2,-131.32,692.88,336.7,317.2,315.0,287.3,5
3,2017-05-01 00:30,25.25,0.0,220.57,23.43,119.95,295.0,681.8,157.8,639.83,337.7,317.2,316.0,283.3,5
4,2017-05-01 00:40,25.245,0.0,220.4,23.61,119.97,280.0,673.2,403.27,614.11,336.7,317.2,316.0,281.3,5


In [413]:
# data_2014 월별 split

data_2017_m5 = data_2017.groupby('month').get_group(5)
data_2017_m6 = data_2017.groupby('month').get_group(6)
data_2017_m7 = data_2017.groupby('month').get_group(7)
data_2017_m8 = data_2017.groupby('month').get_group(8)
data_2017_m9 = data_2017.groupby('month').get_group(9)
data_2017_m10 = data_2017.groupby('month').get_group(10)

In [414]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm           0
swl            14
inf            14
sfw            14
ecpc           14
tototf         14
tide_level      1
fw_1018662    992
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level     72
fw_1018662    838
fw_1018683     58
fw_1019630     58
wl_1018662     58
wl_1018680     58
wl_1018683     58
wl_1019630     58
month           0
dtype: int64

7월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    233
fw_1018662     14
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    120
fw_1018662

In [415]:
# nan 값 mean으로 변경 (5월)

swl_mean = data_2017_m5.swl.mean()
inf_mean = data_2017_m5.inf.mean()
sfw_mean = data_2017_m5.sfw.mean()
ecpc_mean = data_2017_m5.ecpc.mean()
tototf_mean = data_2017_m5.tototf.mean()
tide_level_mean = data_2017_m5.tide_level.mean()

data_2017_m5.swl[data_2017_m5.swl.isna()] = swl_mean
data_2017_m5.inf[data_2017_m5.inf.isna()] = inf_mean
data_2017_m5.sfw[data_2017_m5.sfw.isna()] = sfw_mean
data_2017_m5.ecpc[data_2017_m5.ecpc.isna()] = ecpc_mean
data_2017_m5.tototf[data_2017_m5.tototf.isna()] = tototf_mean
data_2017_m5.tide_level[data_2017_m5.tide_level.isna()] = tide_level_mean

In [416]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2017_m5.fw_1018662.mean()

data_2017_m5.fw_1018662[data_2017_m5.fw_1018662.isna()] = fw_mean

In [417]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level     72
fw_1018662    838
fw_1018683     58
fw_1019630     58
wl_1018662     58
wl_1018680     58
wl_1018683     58
wl_1019630     58
month           0
dtype: int64

7월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    233
fw_1018662     14
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    120
fw_1018662      0
fw_1018683      0
fw_1

In [418]:
# nan 값 mean으로 변경 (6월)

swl_mean = data_2017_m6.swl.mean()
inf_mean = data_2017_m6.inf.mean()
sfw_mean = data_2017_m6.sfw.mean()
ecpc_mean = data_2017_m6.ecpc.mean()
tototf_mean = data_2017_m6.tototf.mean()
tide_level_mean = data_2017_m6.tide_level.mean()

data_2017_m6.swl[data_2017_m6.swl.isna()] = swl_mean
data_2017_m6.inf[data_2017_m6.inf.isna()] = inf_mean
data_2017_m6.sfw[data_2017_m6.sfw.isna()] = sfw_mean
data_2017_m6.ecpc[data_2017_m6.ecpc.isna()] = ecpc_mean
data_2017_m6.tototf[data_2017_m6.tototf.isna()] = tototf_mean
data_2017_m6.tide_level[data_2017_m6.tide_level.isna()] = tide_level_mean

In [419]:
# nan 값 mean으로 변경 (6월)

fw62_mean = data_2017_m6.fw_1018662.mean()
fw83_mean = data_2017_m6.fw_1018683.mean()
fw30_mean = data_2017_m6.fw_1019630.mean()

wl62_mean = data_2017_m6.wl_1018662.mean()
wl80_mean = data_2017_m6.wl_1018680.mean()
wl83_mean = data_2017_m6.wl_1018683.mean()
wl30_mean = data_2017_m6.wl_1019630.mean()

data_2017_m6.fw_1018662[data_2017_m6.fw_1018662.isna()] = fw62_mean
data_2017_m6.fw_1018683[data_2017_m6.fw_1018683.isna()] = fw83_mean
data_2017_m6.fw_1019630[data_2017_m6.fw_1019630.isna()] = fw30_mean

data_2017_m6.wl_1018662[data_2017_m6.wl_1018662.isna()] = wl62_mean
data_2017_m6.wl_1018680[data_2017_m6.wl_1018680.isna()] = wl80_mean
data_2017_m6.wl_1018683[data_2017_m6.wl_1018683.isna()] = wl83_mean
data_2017_m6.wl_1019630[data_2017_m6.wl_1019630.isna()] = wl30_mean

In [420]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    233
fw_1018662     14
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    120
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      

In [421]:
# nan 값 mean으로 변경 (7월)

tide_level_mean = data_2017_m7.tide_level.mean()

fw62_mean = data_2017_m7.fw_1018662.mean()

data_2017_m7.tide_level[data_2017_m7.tide_level.isna()] = tide_level_mean

data_2017_m7.fw_1018662[data_2017_m7.fw_1018662.isna()] = fw62_mean

In [422]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level    120
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683

In [423]:
# nan 값 mean으로 변경 (8월)

tide_level_mean = data_2017_m8.tide_level.mean()

data_2017_m8.tide_level[data_2017_m8.tide_level.isna()] = tide_level_mean

In [424]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [425]:
# nan 값 mean으로 변경 (9월)

tide_level_mean = data_2017_m9.tide_level.mean()

fw62_mean = data_2017_m9.fw_1018662.mean()

data_2017_m9.tide_level[data_2017_m9.tide_level.isna()] = tide_level_mean

data_2017_m9.fw_1018662[data_2017_m9.fw_1018662.isna()] = fw62_mean

In [426]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [427]:
# nan 값 mean으로 변경 (10월)

swl_mean = data_2017_m10.swl.mean()
inf_mean = data_2017_m10.inf.mean()
sfw_mean = data_2017_m10.sfw.mean()
ecpc_mean = data_2017_m10.ecpc.mean()
tototf_mean = data_2017_m10.tototf.mean()
tide_level_mean = data_2017_m10.tide_level.mean()

data_2017_m10.swl[data_2017_m10.swl.isna()] = swl_mean
data_2017_m10.inf[data_2017_m10.inf.isna()] = inf_mean
data_2017_m10.sfw[data_2017_m10.sfw.isna()] = sfw_mean
data_2017_m10.ecpc[data_2017_m10.ecpc.isna()] = ecpc_mean
data_2017_m10.tototf[data_2017_m10.tototf.isna()] = tototf_mean
data_2017_m10.tide_level[data_2017_m10.tide_level.isna()] = tide_level_mean

In [428]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2017_m10.fw_1018662.mean()

data_2017_m10.fw_1018662[data_2017_m10.fw_1018662.isna()] = fw62_mean

In [429]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [430]:
# dataset 생성

dataset = pd.concat([dataset, data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2017-10-31 23:10,25.200,156.0,218.73,25.27,156.0,185.0,253.57,179.12,337.86,275.7,261.2,263.0,256.3,10
26492,2017-10-31 23:20,25.200,156.0,218.73,25.27,156.0,193.0,253.57,170.98,337.86,275.7,262.2,263.0,256.3,10
26493,2017-10-31 23:30,25.200,156.0,218.73,25.27,156.0,202.0,253.57,166.91,337.86,275.7,262.2,263.0,256.3,10
26494,2017-10-31 23:40,25.200,156.0,218.73,25.27,156.0,213.0,258.79,138.41,337.86,276.7,261.2,263.0,256.3,10


#### 7. 2018 data

In [431]:
data_2018 = datas[6].copy()

data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26401 non-null  float64
 2   inf         26401 non-null  float64
 3   sfw         26401 non-null  float64
 4   ecpc        26401 non-null  float64
 5   tototf      26401 non-null  float64
 6   tide_level  26366 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26290 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26380 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [432]:
data_2018 = data_2018[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2018.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2018-05-01 00:00,25.02,314.0,212.14,31.86,314.0,168.0,707.97,458.22,1107.57,340.7,324.2,327.0,314.3
1,2018-05-01 00:10,25.01,175.0,211.77,32.23,276.0,157.0,707.97,453.16,1073.46,340.7,323.2,326.0,312.3
2,2018-05-01 00:20,25.01,262.0,211.77,32.23,262.0,147.0,707.97,483.62,1039.9,340.7,322.2,325.0,310.3
3,2018-05-01 00:30,25.01,262.0,211.77,32.23,262.0,137.0,690.47,527.28,990.57,338.7,321.2,324.0,307.3
4,2018-05-01 00:40,25.01,262.0,211.77,32.23,262.0,127.0,690.47,561.97,958.36,338.7,320.2,323.0,305.3


In [433]:
data_2018.isna().sum()

ymdhm           0
swl            95
inf            95
sfw            95
ecpc           95
tototf         95
tide_level    130
fw_1018662    206
fw_1018683    116
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
dtype: int64

In [434]:
# mean 할당 (단, 월별 mean값 부여)

In [435]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2018)

months[:5]

array([5, 5, 5, 5, 5])

In [436]:
# month column 추가

months = months.reshape(-1)

data_2018['month'] = months

data_2018.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2018-05-01 00:00,25.02,314.0,212.14,31.86,314.0,168.0,707.97,458.22,1107.57,340.7,324.2,327.0,314.3,5
1,2018-05-01 00:10,25.01,175.0,211.77,32.23,276.0,157.0,707.97,453.16,1073.46,340.7,323.2,326.0,312.3,5
2,2018-05-01 00:20,25.01,262.0,211.77,32.23,262.0,147.0,707.97,483.62,1039.9,340.7,322.2,325.0,310.3,5
3,2018-05-01 00:30,25.01,262.0,211.77,32.23,262.0,137.0,690.47,527.28,990.57,338.7,321.2,324.0,307.3,5
4,2018-05-01 00:40,25.01,262.0,211.77,32.23,262.0,127.0,690.47,561.97,958.36,338.7,320.2,323.0,305.3,5


In [437]:
# data_2018 월별 split

data_2018_m5 = data_2018.groupby('month').get_group(5)
data_2018_m6 = data_2018.groupby('month').get_group(6)
data_2018_m7 = data_2018.groupby('month').get_group(7)
data_2018_m8 = data_2018.groupby('month').get_group(8)
data_2018_m9 = data_2018.groupby('month').get_group(9)
data_2018_m10 = data_2018.groupby('month').get_group(10)

In [438]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm          0
swl           27
inf           27
sfw           27
ecpc          27
tototf        27
tide_level     2
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

6월
ymdhm          0
swl           63
inf           63
sfw           63
ecpc          63
tototf        63
tide_level     1
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

7월
ymdhm          0
swl            3
inf            3
sfw            3
ecpc           3
tototf         3
tide_level    17
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             2
inf             2
sfw             2
ecpc            2
tototf          2
tide_level     39
fw_1018662    158
fw_1018683      0
fw_1019630      0
w

In [439]:
# nan 값 mean으로 변경 (5월)

swl_mean = data_2018_m5.swl.mean()
inf_mean = data_2018_m5.inf.mean()
sfw_mean = data_2018_m5.sfw.mean()
ecpc_mean = data_2018_m5.ecpc.mean()
tototf_mean = data_2018_m5.tototf.mean()
tide_level_mean = data_2018_m5.tide_level.mean()

data_2018_m5.swl[data_2018_m5.swl.isna()] = swl_mean
data_2018_m5.inf[data_2018_m5.inf.isna()] = inf_mean
data_2018_m5.sfw[data_2018_m5.sfw.isna()] = sfw_mean
data_2018_m5.ecpc[data_2018_m5.ecpc.isna()] = ecpc_mean
data_2018_m5.tototf[data_2018_m5.tototf.isna()] = tototf_mean
data_2018_m5.tide_level[data_2018_m5.tide_level.isna()] = tide_level_mean


In [440]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm          0
swl           63
inf           63
sfw           63
ecpc          63
tototf        63
tide_level     1
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

7월
ymdhm          0
swl            3
inf            3
sfw            3
ecpc           3
tototf         3
tide_level    17
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             2
inf             2
sfw             2
ecpc            2
tototf          2
tide_level     39
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      

In [441]:
# nan 값 mean으로 변경 (6월)

swl_mean = data_2018_m6.swl.mean()
inf_mean = data_2018_m6.inf.mean()
sfw_mean = data_2018_m6.sfw.mean()
ecpc_mean = data_2018_m6.ecpc.mean()
tototf_mean = data_2018_m6.tototf.mean()
tide_level_mean = data_2018_m6.tide_level.mean()

data_2018_m6.swl[data_2018_m6.swl.isna()] = swl_mean
data_2018_m6.inf[data_2018_m6.inf.isna()] = inf_mean
data_2018_m6.sfw[data_2018_m6.sfw.isna()] = sfw_mean
data_2018_m6.ecpc[data_2018_m6.ecpc.isna()] = ecpc_mean
data_2018_m6.tototf[data_2018_m6.tototf.isna()] = tototf_mean
data_2018_m6.tide_level[data_2018_m6.tide_level.isna()] = tide_level_mean

In [442]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
swl            3
inf            3
sfw            3
ecpc           3
tototf         3
tide_level    17
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             2
inf             2
sfw             2
ecpc            2
tototf          2
tide_level     39
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680   

In [443]:
# nan 값 mean으로 변경 (7월)

swl_mean = data_2018_m7.swl.mean()
inf_mean = data_2018_m7.inf.mean()
sfw_mean = data_2018_m7.sfw.mean()
ecpc_mean = data_2018_m7.ecpc.mean()
tototf_mean = data_2018_m7.tototf.mean()
tide_level_mean = data_2018_m7.tide_level.mean()

data_2018_m7.swl[data_2018_m7.swl.isna()] = swl_mean
data_2018_m7.inf[data_2018_m7.inf.isna()] = inf_mean
data_2018_m7.sfw[data_2018_m7.sfw.isna()] = sfw_mean
data_2018_m7.ecpc[data_2018_m7.ecpc.isna()] = ecpc_mean
data_2018_m7.tototf[data_2018_m7.tototf.isna()] = tototf_mean
data_2018_m7.tide_level[data_2018_m7.tide_level.isna()] = tide_level_mean

In [444]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
swl             2
inf             2
sfw             2
ecpc            2
tototf          2
tide_level     39
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683

In [445]:
# nan 값 mean으로 변경 (8월)

swl_mean = data_2018_m8.swl.mean()
inf_mean = data_2018_m8.inf.mean()
sfw_mean = data_2018_m8.sfw.mean()
ecpc_mean = data_2018_m8.ecpc.mean()
tototf_mean = data_2018_m8.tototf.mean()
tide_level_mean = data_2018_m8.tide_level.mean()

data_2018_m8.swl[data_2018_m8.swl.isna()] = swl_mean
data_2018_m8.inf[data_2018_m8.inf.isna()] = inf_mean
data_2018_m8.sfw[data_2018_m8.sfw.isna()] = sfw_mean
data_2018_m8.ecpc[data_2018_m8.ecpc.isna()] = ecpc_mean
data_2018_m8.tototf[data_2018_m8.tototf.isna()] = tototf_mean
data_2018_m8.tide_level[data_2018_m8.tide_level.isna()] = tide_level_mean

In [446]:
# nan 값 mean으로 변경 (8월)

fw_mean = data_2018_m8.fw_1018662.mean()

data_2018_m8.fw_1018662[data_2018_m8.fw_1018662.isna()] = fw_mean

In [447]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [448]:
# nan 값 mean으로 변경 (9월)

tide_level_mean = data_2018_m9.tide_level.mean()

fw_mean = data_2018_m9.fw_1018683.mean()

data_2018_m9.tide_level[data_2018_m9.tide_level.isna()] = tide_level_mean

data_2018_m9.fw_1018683[data_2018_m9.fw_1018683.isna()] = fw_mean

In [449]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [450]:
# nan 값 mean으로 변경 (10월)

tide_level_mean = data_2018_m10.tide_level.mean()

fw62_mean = data_2018_m10.fw_1018662.mean()
fw83_mean = data_2018_m10.fw_1018683.mean()

data_2018_m10.tide_level[data_2018_m10.tide_level.isna()] = tide_level_mean

data_2018_m10.fw_1018662[data_2018_m10.fw_1018662.isna()] = fw62_mean
data_2018_m10.fw_1018683[data_2018_m10.fw_1018683.isna()] = fw83_mean

In [451]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [452]:
# dataset 생성

dataset = pd.concat([dataset, data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2018-10-31 23:10,25.110,50.0,215.42,28.58,152.0,559.0,314.01,157.37,406.79,286.7,275.2,275.0,263.3,10
26492,2018-10-31 23:20,25.110,152.0,215.42,28.58,152.0,551.0,314.01,107.52,406.79,286.7,274.2,274.0,263.3,10
26493,2018-10-31 23:30,25.120,152.0,215.42,28.58,152.0,542.0,314.01,140.60,406.79,286.7,274.2,274.0,263.3,10
26494,2018-10-31 23:40,25.120,254.0,215.79,28.21,152.0,531.0,314.01,165.41,396.55,286.7,274.2,274.0,262.3,10


#### 8. 2019 data

In [453]:
data_2019 = datas[7].copy()

data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         25994 non-null  float64
 2   inf         25994 non-null  float64
 3   sfw         25994 non-null  float64
 4   ecpc        25994 non-null  float64
 5   tototf      25994 non-null  float64
 6   tide_level  26455 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  25829 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [454]:
data_2019 = data_2019[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2019.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2019-05-01 00:00,25.0,0.0,211.41,32.59,329.0,186.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
1,2019-05-01 00:10,25.0,0.0,211.41,32.59,198.0,199.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
2,2019-05-01 00:20,25.0,0.0,211.41,32.59,137.0,215.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
3,2019-05-01 00:30,25.0,0.0,211.41,32.59,136.0,233.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
4,2019-05-01 00:40,25.0,35.0,211.41,32.59,136.0,249.0,355.94,353.51,438.33,293.7,278.2,278.0,266.3


In [455]:
data_2019.isna().sum()

ymdhm           0
swl           502
inf           502
sfw           502
ecpc          502
tototf        502
tide_level     41
fw_1018662    667
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
dtype: int64

In [456]:
# mean 할당 (단, 월별 mean값 부여)

In [457]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2019)

months[:5]

array([5, 5, 5, 5, 5])

In [458]:
# month column 추가

months = months.reshape(-1)

data_2019['month'] = months

data_2019.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2019-05-01 00:00,25.0,0.0,211.41,32.59,329.0,186.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
1,2019-05-01 00:10,25.0,0.0,211.41,32.59,198.0,199.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
2,2019-05-01 00:20,25.0,0.0,211.41,32.59,137.0,215.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
3,2019-05-01 00:30,25.0,0.0,211.41,32.59,136.0,233.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
4,2019-05-01 00:40,25.0,35.0,211.41,32.59,136.0,249.0,355.94,353.51,438.33,293.7,278.2,278.0,266.3,5


In [459]:
# data_2014 월별 split

data_2019_m5 = data_2019.groupby('month').get_group(5)
data_2019_m6 = data_2019.groupby('month').get_group(6)
data_2019_m7 = data_2019.groupby('month').get_group(7)
data_2019_m8 = data_2019.groupby('month').get_group(8)
data_2019_m9 = data_2019.groupby('month').get_group(9)
data_2019_m10 = data_2019.groupby('month').get_group(10)

In [460]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm           0
swl           202
inf           202
sfw           202
ecpc          202
tototf        202
tide_level      5
fw_1018662    277
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
swl            36
inf            36
sfw            36
ecpc           36
tototf         36
tide_level     10
fw_1018662    339
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     7
fw_1018662    40
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    8
fw_1018662    7
fw_1018683    0
fw_1019

In [461]:
# nan 값 mean으로 변경 (5월)

swl_mean = data_2019_m5.swl.mean()
inf_mean = data_2019_m5.inf.mean()
sfw_mean = data_2019_m5.sfw.mean()
ecpc_mean = data_2019_m5.ecpc.mean()
tototf_mean = data_2019_m5.tototf.mean()
tide_level_mean = data_2019_m5.tide_level.mean()

data_2019_m5.swl[data_2019_m5.swl.isna()] = swl_mean
data_2019_m5.inf[data_2019_m5.inf.isna()] = inf_mean
data_2019_m5.sfw[data_2019_m5.sfw.isna()] = sfw_mean
data_2019_m5.ecpc[data_2019_m5.ecpc.isna()] = ecpc_mean
data_2019_m5.tototf[data_2019_m5.tototf.isna()] = tototf_mean
data_2019_m5.tide_level[data_2019_m5.tide_level.isna()] = tide_level_mean

In [462]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2019_m5.fw_1018662.mean()

data_2019_m5.fw_1018662[data_2019_m5.fw_1018662.isna()] = fw_mean

In [463]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
swl            36
inf            36
sfw            36
ecpc           36
tototf         36
tide_level     10
fw_1018662    339
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     7
fw_1018662    40
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    8
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_10

In [464]:
# nan 값 mean으로 변경 (6월)

swl_mean = data_2019_m6.swl.mean()
inf_mean = data_2019_m6.inf.mean()
sfw_mean = data_2019_m6.sfw.mean()
ecpc_mean = data_2019_m6.ecpc.mean()
tototf_mean = data_2019_m6.tototf.mean()
tide_level_mean = data_2019_m6.tide_level.mean()

data_2019_m6.swl[data_2019_m6.swl.isna()] = swl_mean
data_2019_m6.inf[data_2019_m6.inf.isna()] = inf_mean
data_2019_m6.sfw[data_2019_m6.sfw.isna()] = sfw_mean
data_2019_m6.ecpc[data_2019_m6.ecpc.isna()] = ecpc_mean
data_2019_m6.tototf[data_2019_m6.tototf.isna()] = tototf_mean
data_2019_m6.tide_level[data_2019_m6.tide_level.isna()] = tide_level_mean

In [465]:
# nan 값 mean으로 변경 (6월)

fw_mean = data_2019_m6.fw_1018662.mean()

data_2019_m6.fw_1018662[data_2019_m6.fw_1018662.isna()] = fw_mean

In [466]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     7
fw_1018662    40
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    8
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_

In [467]:
# nan 값 mean으로 변경 (7월)

tide_level_mean = data_2019_m7.tide_level.mean()

fw_mean = data_2019_m7.fw_1018662.mean()

data_2019_m7.tide_level[data_2019_m7.tide_level.isna()] = tide_level_mean

data_2019_m7.fw_1018662[data_2019_m7.fw_1018662.isna()] = fw_mean

In [468]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    8
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [469]:
# nan 값 mean으로 변경 (8월)

tide_level_mean = data_2019_m8.tide_level.mean()

fw_mean = data_2019_m8.fw_1018662.mean()

data_2019_m8.tide_level[data_2019_m8.tide_level.isna()] = tide_level_mean

data_2019_m8.fw_1018662[data_2019_m8.fw_1018662.isna()] = fw_mean

In [470]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [471]:
# nan 값 mean으로 변경 (9월)

swl_mean = data_2019_m9.swl.mean()
inf_mean = data_2019_m9.inf.mean()
sfw_mean = data_2019_m9.sfw.mean()
ecpc_mean = data_2019_m9.ecpc.mean()
tototf_mean = data_2019_m9.tototf.mean()
tide_level_mean = data_2019_m9.tide_level.mean()

data_2019_m9.swl[data_2019_m9.swl.isna()] = swl_mean
data_2019_m9.inf[data_2019_m9.inf.isna()] = inf_mean
data_2019_m9.sfw[data_2019_m9.sfw.isna()] = sfw_mean
data_2019_m9.ecpc[data_2019_m9.ecpc.isna()] = ecpc_mean
data_2019_m9.tototf[data_2019_m9.tototf.isna()] = tototf_mean
data_2019_m9.tide_level[data_2019_m9.tide_level.isna()] = tide_level_mean

In [472]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [473]:
# nan 값 mean으로 변경 (10월)

swl_mean = data_2019_m10.swl.mean()
inf_mean = data_2019_m10.inf.mean()
sfw_mean = data_2019_m10.sfw.mean()
ecpc_mean = data_2019_m10.ecpc.mean()
tototf_mean = data_2019_m10.tototf.mean()
tide_level_mean = data_2019_m10.tide_level.mean()

data_2019_m10.swl[data_2019_m10.swl.isna()] = swl_mean
data_2019_m10.inf[data_2019_m10.inf.isna()] = inf_mean
data_2019_m10.sfw[data_2019_m10.sfw.isna()] = sfw_mean
data_2019_m10.ecpc[data_2019_m10.ecpc.isna()] = ecpc_mean
data_2019_m10.tototf[data_2019_m10.tototf.isna()] = tototf_mean
data_2019_m10.tide_level[data_2019_m10.tide_level.isna()] = tide_level_mean

In [474]:
# nan 값 mean으로 변경 (10월)

fw_mean = data_2019_m10.fw_1018662.mean()

data_2019_m10.fw_1018662[data_2019_m10.fw_1018662.isna()] = fw_mean

In [475]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [476]:
# dataset 생성

dataset = pd.concat([dataset, data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2019-10-31 23:10,25.120,127.0,215.79,28.21,127.0,435.0,1689.09,-898.87,1996.24,427.7,405.2,400.0,358.3,10
26492,2019-10-31 23:20,25.120,133.0,215.79,28.21,133.0,417.0,1744.63,-573.68,1950.09,431.7,409.2,403.0,356.3,10
26493,2019-10-31 23:30,25.120,133.0,215.79,28.21,140.0,399.0,1758.66,14.75,1904.48,432.7,411.2,405.0,354.3,10
26494,2019-10-31 23:40,25.120,140.0,215.79,28.21,140.0,382.0,1730.66,403.06,1814.92,430.7,411.2,405.0,350.3,10


#### 9. 2020 data

In [477]:
data_2020 = datas[8].copy()

data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26487 non-null  float64
 2   inf         26487 non-null  float64
 3   sfw         26487 non-null  float64
 4   ecpc        26487 non-null  float64
 5   tototf      26487 non-null  float64
 6   tide_level  26453 non-null  float64
 7   wl_1018662  26495 non-null  float64
 8   fw_1018662  22813 non-null  float64
 9   wl_1018680  26495 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26495 non-null  float64
 12  fw_1018683  26495 non-null  float64
 13  wl_1019630  26495 non-null  float64
 14  fw_1019630  26495 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [478]:
data_2020 = data_2020[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2020.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2020-05-01 00:00,25.02,34.0,212.14,31.86,135.0,398.0,269.4,231.3,301.39,278.7,279.2,264.0,252.3
1,2020-05-01 00:10,25.01,0.0,211.77,32.23,135.0,386.0,269.4,219.12,301.39,278.7,279.2,264.0,252.3
2,2020-05-01 00:20,25.01,0.0,211.77,32.23,135.0,374.0,269.4,215.07,301.39,278.7,279.2,264.0,252.3
3,2020-05-01 00:30,25.01,0.0,211.77,32.23,135.0,363.0,269.4,219.12,292.61,278.7,279.2,264.0,251.3
4,2020-05-01 00:40,25.01,28.0,211.77,32.23,129.0,352.0,269.4,227.24,283.96,278.7,279.2,264.0,250.3


In [479]:
data_2020.isna().sum()

ymdhm            0
swl              9
inf              9
sfw              9
ecpc             9
tototf           9
tide_level      43
fw_1018662    3683
fw_1018683       1
fw_1019630       1
wl_1018662       1
wl_1018680       1
wl_1018683       1
wl_1019630       1
dtype: int64

In [480]:
# mean 할당 (단, 월별 mean값 부여)

In [481]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2020)

months[:5]

array([5, 5, 5, 5, 5])

In [482]:
# month column 추가

months = months.reshape(-1)

data_2020['month'] = months

data_2020.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2020-05-01 00:00,25.02,34.0,212.14,31.86,135.0,398.0,269.4,231.3,301.39,278.7,279.2,264.0,252.3,5
1,2020-05-01 00:10,25.01,0.0,211.77,32.23,135.0,386.0,269.4,219.12,301.39,278.7,279.2,264.0,252.3,5
2,2020-05-01 00:20,25.01,0.0,211.77,32.23,135.0,374.0,269.4,215.07,301.39,278.7,279.2,264.0,252.3,5
3,2020-05-01 00:30,25.01,0.0,211.77,32.23,135.0,363.0,269.4,219.12,292.61,278.7,279.2,264.0,251.3,5
4,2020-05-01 00:40,25.01,28.0,211.77,32.23,129.0,352.0,269.4,227.24,283.96,278.7,279.2,264.0,250.3,5


In [483]:
# data_2014 월별 split

data_2020_m5 = data_2020.groupby('month').get_group(5)
data_2020_m6 = data_2020.groupby('month').get_group(6)
data_2020_m7 = data_2020.groupby('month').get_group(7)
data_2020_m8 = data_2020.groupby('month').get_group(8)
data_2020_m9 = data_2020.groupby('month').get_group(9)
data_2020_m10 = data_2020.groupby('month').get_group(10)

In [484]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm           0
swl             8
inf             8
sfw             8
ecpc            8
tototf          8
tide_level      7
fw_1018662    563
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      7
fw_1018662    569
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level      4
fw_1018662    144
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm            0
swl              0
inf              0
sfw              0
ecpc             0
tototf           0
tide_level       6
fw_

In [485]:
# nan 값 mean으로 변경 (5월)

swl_mean = data_2020_m5.swl.mean()
inf_mean = data_2020_m5.inf.mean()
sfw_mean = data_2020_m5.sfw.mean()
ecpc_mean = data_2020_m5.ecpc.mean()
tototf_mean = data_2020_m5.tototf.mean()
tide_level_mean = data_2020_m5.tide_level.mean()

data_2020_m5.swl[data_2020_m5.swl.isna()] = swl_mean
data_2020_m5.inf[data_2020_m5.inf.isna()] = inf_mean
data_2020_m5.sfw[data_2020_m5.sfw.isna()] = sfw_mean
data_2020_m5.ecpc[data_2020_m5.ecpc.isna()] = ecpc_mean
data_2020_m5.tototf[data_2020_m5.tototf.isna()] = tototf_mean
data_2020_m5.tide_level[data_2020_m5.tide_level.isna()] = tide_level_mean

In [486]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2020_m5.fw_1018662.mean()

data_2020_m5.fw_1018662[data_2020_m5.fw_1018662.isna()] = fw_mean

In [487]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
swl             0
inf             0
sfw             0
ecpc            0
tototf          0
tide_level      7
fw_1018662    569
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level      4
fw_1018662    144
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm            0
swl              0
inf              0
sfw              0
ecpc             0
tototf           0
tide_level       6
fw_1018662    1213
fw_1018683    

In [488]:
# nan 값 mean으로 변경 (6월)

tide_level_mean = data_2020_m6.tide_level.mean()

fw_mean = data_2020_m6.fw_1018662.mean()

data_2020_m6.tide_level[data_2020_m6.tide_level.isna()] = tide_level_mean

data_2020_m6.fw_1018662[data_2020_m6.fw_1018662.isna()] = fw_mean

In [489]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level      4
fw_1018662    144
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm            0
swl              0
inf              0
sfw              0
ecpc             0
tototf           0
tide_level       6
fw_1018662    1213
fw_1018683       0
fw_1019630       0
wl_101

In [490]:
# nan 값 mean으로 변경 (7월)

swl_mean = data_2020_m7.swl.mean()
inf_mean = data_2020_m7.inf.mean()
sfw_mean = data_2020_m7.sfw.mean()
ecpc_mean = data_2020_m7.ecpc.mean()
tototf_mean = data_2020_m7.tototf.mean()
tide_level_mean = data_2020_m7.tide_level.mean()

data_2020_m7.swl[data_2020_m7.swl.isna()] = swl_mean
data_2020_m7.inf[data_2020_m7.inf.isna()] = inf_mean
data_2020_m7.sfw[data_2020_m7.sfw.isna()] = sfw_mean
data_2020_m7.ecpc[data_2020_m7.ecpc.isna()] = ecpc_mean
data_2020_m7.tototf[data_2020_m7.tototf.isna()] = tototf_mean
data_2020_m7.tide_level[data_2020_m7.tide_level.isna()] = tide_level_mean

In [491]:
# nan 값 mean으로 변경 (7월)

fw_mean = data_2020_m7.fw_1018662.mean()

data_2020_m7.fw_1018662[data_2020_m7.fw_1018662.isna()] = fw_mean

In [492]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm            0
swl              0
inf              0
sfw              0
ecpc             0
tototf           0
tide_level       6
fw_1018662    1213
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       

In [493]:
# nan 값 mean으로 변경 (8월)

tide_level_mean = data_2020_m8.tide_level.mean()

fw_mean = data_2020_m8.fw_1018662.mean()

data_2020_m8.tide_level[data_2020_m8.tide_level.isna()] = tide_level_mean

data_2020_m8.fw_1018662[data_2020_m8.fw_1018662.isna()] = fw_mean

In [494]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [495]:
# nan 값 mean으로 변경 (9월)

tide_level_mean = data_2020_m9.tide_level.mean()

data_2020_m9.tide_level[data_2020_m9.tide_level.isna()] = tide_level_mean

In [496]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [497]:
# nan 값 mean으로 변경 (10월)

tide_level_mean = data_2020_m10.tide_level.mean()

fw62_mean = data_2020_m10.fw_1018662.mean()
fw83_mean = data_2020_m10.fw_1018683.mean()
fw30_mean = data_2020_m10.fw_1019630.mean()

wl62_mean = data_2020_m10.wl_1018662.mean()
wl80_mean = data_2020_m10.wl_1018680.mean()
wl83_mean = data_2020_m10.wl_1018683.mean()
wl30_mean = data_2020_m10.wl_1019630.mean()


data_2020_m10.tide_level[data_2020_m10.tide_level.isna()] = tide_level_mean

data_2020_m10.fw_1018662[data_2020_m10.fw_1018662.isna()] = fw62_mean
data_2020_m10.fw_1018683[data_2020_m10.fw_1018683.isna()] = fw83_mean
data_2020_m10.fw_1019630[data_2020_m10.fw_1019630.isna()] = fw30_mean

data_2020_m10.wl_1018662[data_2020_m10.wl_1018662.isna()] = wl62_mean
data_2020_m10.wl_1018680[data_2020_m10.wl_1018680.isna()] = wl80_mean
data_2020_m10.wl_1018683[data_2020_m10.wl_1018683.isna()] = wl83_mean
data_2020_m10.wl_1019630[data_2020_m10.wl_1019630.isna()] = wl30_mean

In [498]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [499]:
# dataset 생성

dataset = pd.concat([dataset, data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2020-10-31 23:10,25.120,136.0,215.79,28.21,136.0,234.0,574.28,295.70,1142.22,324.7,340.2,324.0,316.3,10
26492,2020-10-31 23:20,25.110,34.0,215.42,28.58,136.0,223.0,582.21,189.27,1107.57,325.7,339.2,322.0,314.3,10
26493,2020-10-31 23:30,25.090,0.0,214.69,29.31,137.0,211.0,590.20,215.39,1039.90,326.7,338.2,321.0,310.3,10
26494,2020-10-31 23:40,25.090,0.0,214.69,29.31,137.0,200.0,598.25,228.26,990.57,327.7,338.2,320.0,307.3,10


#### 10. 2021 data

In [500]:
data_2021 = datas[9].copy()

data_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26458 non-null  float64
 2   inf         26458 non-null  float64
 3   sfw         26458 non-null  float64
 4   ecpc        26458 non-null  float64
 5   tototf      26458 non-null  float64
 6   tide_level  23895 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  24966 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26169 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [501]:
data_2021 = data_2021[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2021.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2021-05-01 00:00,24.93,0.0,208.88,35.12,138.0,311.0,905.32,983.18,895.58,361.7,346.2,340.0,301.3
1,2021-05-01 00:10,24.93,0.0,208.88,35.12,138.0,297.0,856.07,1285.8,1006.88,356.7,338.2,335.0,308.3
2,2021-05-01 00:20,24.92,0.0,208.52,35.48,138.0,283.0,770.98,1381.43,1090.45,347.7,331.2,331.0,313.3
3,2021-05-01 00:30,24.92,0.0,208.52,35.48,151.0,271.0,681.8,1359.95,1124.83,337.7,327.2,329.0,315.3
4,2021-05-01 00:40,24.92,0.0,208.52,35.48,151.0,259.0,606.35,1267.65,1142.22,328.7,323.2,327.0,316.3


In [502]:
data_2021.isna().sum()

ymdhm            0
swl             38
inf             38
sfw             38
ecpc            38
tototf          38
tide_level    2601
fw_1018662    1530
fw_1018683     327
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [503]:
# mean 할당 (단, 월별 mean값 부여)

In [504]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2021)

months[:5]

array([5, 5, 5, 5, 5])

In [505]:
# month column 추가

months = months.reshape(-1)

data_2021['month'] = months

data_2021.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2021-05-01 00:00,24.93,0.0,208.88,35.12,138.0,311.0,905.32,983.18,895.58,361.7,346.2,340.0,301.3,5
1,2021-05-01 00:10,24.93,0.0,208.88,35.12,138.0,297.0,856.07,1285.8,1006.88,356.7,338.2,335.0,308.3,5
2,2021-05-01 00:20,24.92,0.0,208.52,35.48,138.0,283.0,770.98,1381.43,1090.45,347.7,331.2,331.0,313.3,5
3,2021-05-01 00:30,24.92,0.0,208.52,35.48,151.0,271.0,681.8,1359.95,1124.83,337.7,327.2,329.0,315.3,5
4,2021-05-01 00:40,24.92,0.0,208.52,35.48,151.0,259.0,606.35,1267.65,1142.22,328.7,323.2,327.0,316.3,5


In [506]:
# data_2020 월별 split

data_2021_m5 = data_2021.groupby('month').get_group(5)
data_2021_m6 = data_2021.groupby('month').get_group(6)
data_2021_m7 = data_2021.groupby('month').get_group(7)
data_2021_m8 = data_2021.groupby('month').get_group(8)
data_2021_m9 = data_2021.groupby('month').get_group(9)
data_2021_m10 = data_2021.groupby('month').get_group(10)

In [507]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     8
fw_1018662    94
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

6월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level     12
fw_1018662      0
fw_1018683    102
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
swl           23
inf           23
sfw           23
ecpc          23
tototf        23
tide_level    27
fw_1018662     0
fw_1018683    26
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             5
inf             5
sfw             5
ecpc            5
tototf          5
tide_level      9
fw_1018662    438
fw_1018683      0
fw_1

In [508]:
# nan 값 mean으로 변경 (5월)

tide_level_mean = data_2021_m5.tide_level.mean()

fw_mean = data_2021_m5.fw_1018662.mean()

data_2021_m5.tide_level[data_2021_m5.tide_level.isna()] = tide_level_mean

data_2021_m5.fw_1018662[data_2021_m5.fw_1018662.isna()] = fw_mean

In [509]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level     12
fw_1018662      0
fw_1018683    102
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
swl           23
inf           23
sfw           23
ecpc          23
tototf        23
tide_level    27
fw_1018662     0
fw_1018683    26
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             5
inf             5
sfw             5
ecpc            5
tototf          5
tide_level      9
fw_1018662    438
fw_1018683      0
fw_1019630      0
w

In [510]:
# nan 값 mean으로 변경 (6월)

swl_mean = data_2021_m6.swl.mean()
inf_mean = data_2021_m6.inf.mean()
sfw_mean = data_2021_m6.sfw.mean()
ecpc_mean = data_2021_m6.ecpc.mean()
tototf_mean = data_2021_m6.tototf.mean()
tide_level_mean = data_2021_m6.tide_level.mean()

data_2021_m6.swl[data_2021_m6.swl.isna()] = swl_mean
data_2021_m6.inf[data_2021_m6.inf.isna()] = inf_mean
data_2021_m6.sfw[data_2021_m6.sfw.isna()] = sfw_mean
data_2021_m6.ecpc[data_2021_m6.ecpc.isna()] = ecpc_mean
data_2021_m6.tototf[data_2021_m6.tototf.isna()] = tototf_mean
data_2021_m6.tide_level[data_2021_m6.tide_level.isna()] = tide_level_mean

In [511]:
# nan 값 mean으로 변경 (6월)

fw_mean = data_2021_m6.fw_1018683.mean()

data_2021_m6.fw_1018683[data_2021_m6.fw_1018683.isna()] = fw_mean

In [512]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
swl           23
inf           23
sfw           23
ecpc          23
tototf        23
tide_level    27
fw_1018662     0
fw_1018683    26
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
swl             5
inf             5
sfw             5
ecpc            5
tototf          5
tide_level      9
fw_1018662    438
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680   

In [513]:
# nan 값 mean으로 변경 (7월)

swl_mean = data_2021_m7.swl.mean()
inf_mean = data_2021_m7.inf.mean()
sfw_mean = data_2021_m7.sfw.mean()
ecpc_mean = data_2021_m7.ecpc.mean()
tototf_mean = data_2021_m7.tototf.mean()
tide_level_mean = data_2021_m7.tide_level.mean()

data_2021_m7.swl[data_2021_m7.swl.isna()] = swl_mean
data_2021_m7.inf[data_2021_m7.inf.isna()] = inf_mean
data_2021_m7.sfw[data_2021_m7.sfw.isna()] = sfw_mean
data_2021_m7.ecpc[data_2021_m7.ecpc.isna()] = ecpc_mean
data_2021_m7.tototf[data_2021_m7.tototf.isna()] = tototf_mean
data_2021_m7.tide_level[data_2021_m7.tide_level.isna()] = tide_level_mean

In [514]:
# nan 값 mean으로 변경 (7월)

fw_mean = data_2021_m7.fw_1018683.mean()

data_2021_m7.fw_1018683[data_2021_m7.fw_1018683.isna()] = fw_mean

In [515]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
swl             5
inf             5
sfw             5
ecpc            5
tototf          5
tide_level      9
fw_1018662    438
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683

In [516]:
# nan 값 mean으로 변경 (8월)

swl_mean = data_2021_m8.swl.mean()
inf_mean = data_2021_m8.inf.mean()
sfw_mean = data_2021_m8.sfw.mean()
ecpc_mean = data_2021_m8.ecpc.mean()
tototf_mean = data_2021_m8.tototf.mean()
tide_level_mean = data_2021_m8.tide_level.mean()

data_2021_m8.swl[data_2021_m8.swl.isna()] = swl_mean
data_2021_m8.inf[data_2021_m8.inf.isna()] = inf_mean
data_2021_m8.sfw[data_2021_m8.sfw.isna()] = sfw_mean
data_2021_m8.ecpc[data_2021_m8.ecpc.isna()] = ecpc_mean
data_2021_m8.tototf[data_2021_m8.tototf.isna()] = tototf_mean
data_2021_m8.tide_level[data_2021_m8.tide_level.isna()] = tide_level_mean

In [517]:
# nan 값 mean으로 변경 (8월)

fw_mean = data_2021_m8.fw_1018662.mean()

data_2021_m8.fw_1018662[data_2021_m8.fw_1018662.isna()] = fw_mean

In [518]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [519]:
# nan 값 mean으로 변경 (9월)

swl_mean = data_2021_m9.swl.mean()
inf_mean = data_2021_m9.inf.mean()
sfw_mean = data_2021_m9.sfw.mean()
ecpc_mean = data_2021_m9.ecpc.mean()
tototf_mean = data_2021_m9.tototf.mean()
tide_level_mean = data_2021_m9.tide_level.mean()

data_2021_m9.swl[data_2021_m9.swl.isna()] = swl_mean
data_2021_m9.inf[data_2021_m9.inf.isna()] = inf_mean
data_2021_m9.sfw[data_2021_m9.sfw.isna()] = sfw_mean
data_2021_m9.ecpc[data_2021_m9.ecpc.isna()] = ecpc_mean
data_2021_m9.tototf[data_2021_m9.tototf.isna()] = tototf_mean
data_2021_m9.tide_level[data_2021_m9.tide_level.isna()] = tide_level_mean

In [520]:
# nan 값 mean으로 변경 (9월)

fw_mean = data_2021_m9.fw_1018662.mean()

data_2021_m9.fw_1018662[data_2021_m9.fw_1018662.isna()] = fw_mean

In [521]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [522]:
# nan 값 mean으로 변경 (10월)

swl_mean = data_2021_m10.swl.mean()
inf_mean = data_2021_m10.inf.mean()
sfw_mean = data_2021_m10.sfw.mean()
ecpc_mean = data_2021_m10.ecpc.mean()
tototf_mean = data_2021_m10.tototf.mean()
tide_level_mean = data_2021_m10.tide_level.mean()

data_2021_m10.swl[data_2021_m10.swl.isna()] = swl_mean
data_2021_m10.inf[data_2021_m10.inf.isna()] = inf_mean
data_2021_m10.sfw[data_2021_m10.sfw.isna()] = sfw_mean
data_2021_m10.ecpc[data_2021_m10.ecpc.isna()] = ecpc_mean
data_2021_m10.tototf[data_2021_m10.tototf.isna()] = tototf_mean
data_2021_m10.tide_level[data_2021_m10.tide_level.isna()] = tide_level_mean

In [523]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2021_m10.fw_1018662.mean()
fw83_mean = data_2021_m10.fw_1018683.mean()

data_2021_m10.fw_1018662[data_2021_m10.fw_1018662.isna()] = fw62_mean
data_2021_m10.fw_1018683[data_2021_m10.fw_1018683.isna()] = fw83_mean

In [524]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
mo

In [525]:
# dataset 생성

dataset = pd.concat([dataset, data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.0,219.07,24.93,555.0,445.000000,469.050000,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.000000,498.000000,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.000000,490.680000,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.000000,476.210000,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.1,218.69,25.31,576.4,450.000000,476.210000,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26491,2021-10-31 23:10,25.040,56.0,212.86,31.14,157.0,364.119875,438.147794,207.27,310.31,269.7,258.2,265.0,253.3,10
26492,2021-10-31 23:20,25.040,56.0,212.86,31.14,157.0,364.119875,438.147794,211.33,319.36,270.7,258.2,265.0,254.3,10
26493,2021-10-31 23:30,25.040,56.0,212.86,31.14,157.0,364.119875,438.147794,219.46,319.36,270.7,258.2,265.0,254.3,10
26494,2021-10-31 23:40,25.040,56.0,212.86,31.14,157.0,364.119875,438.147794,227.59,319.36,271.7,258.2,265.0,254.3,10


#### 11. 2022 data

In [6]:
data_2022 = datas[-1].copy()

data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11376 entries, 0 to 11375
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       11376 non-null  object 
 1   swl         11340 non-null  float64
 2   inf         11340 non-null  float64
 3   sfw         11340 non-null  float64
 4   ecpc        11340 non-null  float64
 5   tototf      11340 non-null  float64
 6   tide_level  11363 non-null  float64
 7   wl_1018662  11376 non-null  float64
 8   fw_1018662  8191 non-null   float64
 9   wl_1018680  11376 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  11376 non-null  float64
 12  fw_1018683  11376 non-null  float64
 13  wl_1019630  11376 non-null  float64
 14  fw_1019630  11376 non-null  float64
dtypes: float64(14), object(1)
memory usage: 1.3+ MB


In [7]:
data_2022 = data_2022[['ymdhm', 'swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2022.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-05-01 00:00,25.2,739.36,218.73,25.27,124.36,81.0,337.64,0.0,471.08,290.7,284.2,279.0,269.3
1,2022-05-01 00:10,25.2,124.48,218.73,25.27,124.48,72.0,343.69,145.14,438.33,291.7,284.2,279.0,266.3
2,2022-05-01 00:20,25.2,124.2,218.73,25.27,124.2,64.0,337.64,232.22,417.17,290.7,284.2,279.0,264.3
3,2022-05-01 00:30,25.2,124.35,218.73,25.27,124.35,58.0,331.65,302.71,406.79,289.7,284.2,279.0,263.3
4,2022-05-01 00:40,25.19,0.0,218.36,25.64,124.42,58.0,319.84,340.03,417.17,287.7,284.2,279.0,264.3


In [528]:
data_2022.isna().sum()

ymdhm            0
swl             36
inf             36
sfw             36
ecpc            36
tototf          36
tide_level      13
fw_1018662    3185
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [529]:
# mean 할당 (단, 월별 mean값 부여)

In [10]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2022)

months[:5]

array([5, 5, 5, 5, 5])

In [11]:
# month column 추가

months = months.reshape(-1)

data_2022['month'] = months

data_2022.head()

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2022-05-01 00:00,25.2,739.36,218.73,25.27,124.36,81.0,337.64,0.0,471.08,290.7,284.2,279.0,269.3,5
1,2022-05-01 00:10,25.2,124.48,218.73,25.27,124.48,72.0,343.69,145.14,438.33,291.7,284.2,279.0,266.3,5
2,2022-05-01 00:20,25.2,124.2,218.73,25.27,124.2,64.0,337.64,232.22,417.17,290.7,284.2,279.0,264.3,5
3,2022-05-01 00:30,25.2,124.35,218.73,25.27,124.35,58.0,331.65,302.71,406.79,289.7,284.2,279.0,263.3,5
4,2022-05-01 00:40,25.19,0.0,218.36,25.64,124.42,58.0,319.84,340.03,417.17,287.7,284.2,279.0,264.3,5


In [12]:
# data_2022 월별 split

data_2022_m5 = data_2022.groupby('month').get_group(5)
data_2022_m6 = data_2022.groupby('month').get_group(6)
data_2022_m7 = data_2022.groupby('month').get_group(7)

In [13]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])

5월
ymdhm            0
swl              0
inf              0
sfw              0
ecpc             0
tototf           0
tide_level       9
fw_1018662    1969
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

6월
ymdhm            0
swl             36
inf             36
sfw             36
ecpc            36
tototf          36
tide_level       3
fw_1018662    1180
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     1
fw_1018662    36
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64



In [14]:
# nan 값 mean으로 변경 (5월)

tide_level_mean = data_2022_m5.tide_level.mean()

fw_mean = data_2022_m5.fw_1018662.mean()

data_2022_m5.tide_level[data_2022_m5.tide_level.isna()] = tide_level_mean

data_2022_m5.fw_1018662[data_2022_m5.fw_1018662.isna()] = fw_mean

In [15]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm            0
swl             36
inf             36
sfw             36
ecpc            36
tototf          36
tide_level       3
fw_1018662    1180
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     1
fw_1018662    36
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64



In [16]:
# dataset 생성 (단, 5월까지만 dataset이고, 6,7월은 submission이기 때문에 5월 data만 datset에 추가)

dataset = pd.concat([dataset, data_2022_m5], axis=0)

dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.00,219.07,24.93,555.00,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.60,218.86,25.15,562.90,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.10,218.69,25.31,576.40,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.80,218.69,25.31,563.10,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.10,218.69,25.31,576.40,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4459,2022-05-31 23:10,25.180,140.94,217.99,26.01,140.94,220.0,285.72,575.57,493.58,281.7,281.2,278.0,271.3,5
4460,2022-05-31 23:20,25.180,141.07,217.99,26.01,141.07,208.0,274.78,501.04,505.03,279.7,279.2,278.0,272.3,5
4461,2022-05-31 23:30,25.180,141.01,217.99,26.01,141.01,196.0,269.40,425.89,505.03,278.7,277.2,277.0,272.3,5
4462,2022-05-31 23:40,25.190,755.75,218.36,25.64,140.75,184.0,264.07,198.19,493.58,277.7,276.2,276.0,271.3,5


In [537]:
# 2022-06, 07은 Data Leakage를 피하기 위해 해당 달의 mean이 아닌 dataset의 mean(6, 7)으로 변경

In [17]:
dataset_m6 = dataset.groupby('month').get_group(6)
dataset_m7 = dataset.groupby('month').get_group(7)

In [18]:
# nan 값 mean으로 변경 (6월)

swl_mean = dataset_m6.swl.mean()
inf_mean = dataset_m6.inf.mean()
sfw_mean = dataset_m6.sfw.mean()
ecpc_mean = dataset_m6.ecpc.mean()
tototf_mean = dataset_m6.tototf.mean()
tide_level_mean = dataset_m6.tide_level.mean()

data_2022_m6.swl[data_2022_m6.swl.isna()] = swl_mean
data_2022_m6.inf[data_2022_m6.inf.isna()] = inf_mean
data_2022_m6.sfw[data_2022_m6.sfw.isna()] = sfw_mean
data_2022_m6.ecpc[data_2022_m6.ecpc.isna()] = ecpc_mean
data_2022_m6.tototf[data_2022_m6.tototf.isna()] = tototf_mean
data_2022_m6.tide_level[data_2022_m6.tide_level.isna()] = tide_level_mean

In [19]:
# nan 값 mean으로 변경 (6월)

fw_mean = dataset_m6.fw_1018662.mean()

data_2022_m6.fw_1018662[data_2022_m6.fw_1018662.isna()] = fw_mean

In [20]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     1
fw_1018662    36
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64



In [21]:
# nan 값 mean으로 변경 (7월)

tide_level_mean = dataset_m7.tide_level.mean()

fw_mean = dataset_m7.fw_1018662.mean()

data_2022_m7.tide_level[data_2022_m7.tide_level.isna()] = tide_level_mean

data_2022_m7.fw_1018662[data_2022_m7.fw_1018662.isna()] = fw_mean

In [22]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])

5월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
swl           0
inf           0
sfw           0
ecpc          0
tototf        0
tide_level    0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64



### Dataset save

In [544]:
dataset.to_csv('./data/dataset.csv', index=False)

### Dataset load

In [3]:
dataset = pd.read_csv('./data/dataset.csv')

dataset.shape

(269424, 15)

### Sub_mission

In [23]:
sub_mission = pd.concat([data_2022_m6, data_2022_m7], axis=0)

sub_mission

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
4464,2022-06-01 00:00,25.19,140.60,218.36,25.64,140.60,162.0,269.40,8.23,471.08,0.0,0.0,0.0,0.0,6
4465,2022-06-01 00:10,25.19,140.78,218.36,25.64,140.78,151.0,280.22,28.82,449.12,0.0,0.0,0.0,0.0,6
4466,2022-06-01 00:20,25.20,755.90,218.73,25.27,140.90,141.0,296.87,12.35,417.17,0.0,0.0,0.0,0.0,6
4467,2022-06-01 00:30,25.19,0.00,218.36,25.64,140.94,130.0,302.53,53.52,386.43,0.0,0.0,0.0,0.0,6
4468,2022-06-01 00:40,25.19,140.63,218.36,25.64,140.63,120.0,296.87,107.04,366.60,0.0,0.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11371,2022-07-18 23:10,25.04,259.23,212.86,31.14,259.23,510.0,319.84,-456.41,974.40,0.0,0.0,0.0,0.0,7
11372,2022-07-18 23:20,25.04,260.46,212.86,31.14,260.46,492.0,314.01,-717.30,1006.88,0.0,0.0,0.0,0.0,7
11373,2022-07-18 23:30,25.04,259.37,212.86,31.14,259.37,475.0,387.55,-843.37,1039.90,0.0,0.0,0.0,0.0,7
11374,2022-07-18 23:40,25.04,259.13,212.86,31.14,259.13,458.0,454.91,-1023.37,1073.46,0.0,0.0,0.0,0.0,7


In [57]:
X_sub = sub_mission.drop(['ymdhm', 'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630'], axis=1)

X_sub

Unnamed: 0,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,month
4464,25.19,140.60,218.36,25.64,140.60,162.0,269.40,8.23,471.08,6
4465,25.19,140.78,218.36,25.64,140.78,151.0,280.22,28.82,449.12,6
4466,25.20,755.90,218.73,25.27,140.90,141.0,296.87,12.35,417.17,6
4467,25.19,0.00,218.36,25.64,140.94,130.0,302.53,53.52,386.43,6
4468,25.19,140.63,218.36,25.64,140.63,120.0,296.87,107.04,366.60,6
...,...,...,...,...,...,...,...,...,...,...
11371,25.04,259.23,212.86,31.14,259.23,510.0,319.84,-456.41,974.40,7
11372,25.04,260.46,212.86,31.14,260.46,492.0,314.01,-717.30,1006.88,7
11373,25.04,259.37,212.86,31.14,259.37,475.0,387.55,-843.37,1039.90,7
11374,25.04,259.13,212.86,31.14,259.13,458.0,454.91,-1023.37,1073.46,7


### Train, Val, Test split

#### Train, Test split

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
dataset

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,24.800,555.00,219.07,24.93,555.00,445.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,24.794,464.60,218.86,25.15,562.90,449.0,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,24.789,478.10,218.69,25.31,576.40,451.0,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,24.789,464.80,218.69,25.31,563.10,452.0,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,24.789,478.10,218.69,25.31,576.40,450.0,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4459,2022-05-31 23:10,25.180,140.94,217.99,26.01,140.94,220.0,285.72,575.57,493.58,281.7,281.2,278.0,271.3,5
4460,2022-05-31 23:20,25.180,141.07,217.99,26.01,141.07,208.0,274.78,501.04,505.03,279.7,279.2,278.0,272.3,5
4461,2022-05-31 23:30,25.180,141.01,217.99,26.01,141.01,196.0,269.40,425.89,505.03,278.7,277.2,277.0,272.3,5
4462,2022-05-31 23:40,25.190,755.75,218.36,25.64,140.75,184.0,264.07,198.19,493.58,277.7,276.2,276.0,271.3,5


In [55]:
# X, y 분류

X = dataset[['swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level', 'fw_1018662', 'fw_1018683', 'fw_1019630', 'month']]
y = dataset.iloc[:, -5:]

X.shape, y.shape

((273888, 10), (273888, 5))

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=2022)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((219110, 10), (54778, 10), (219110, 5), (54778, 5))

In [59]:
X_train.month.value_counts(), y_train.month.value_counts()

(5     42801
 8     35800
 7     35738
 10    35732
 9     34588
 6     34451
 Name: month, dtype: int64,
 5     42801
 8     35800
 7     35738
 10    35732
 9     34588
 6     34451
 Name: month, dtype: int64)

In [35]:
X_test.month.value_counts(), y_test.month.value_counts()

(5     10767
 10     8908
 7      8902
 8      8840
 6      8749
 9      8612
 Name: month, dtype: int64,
 5     10767
 10     8908
 7      8902
 8      8840
 6      8749
 9      8612
 Name: month, dtype: int64)

#### Train, Val split

In [471]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=X_train.month, shuffle=True, random_state=2022)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((172431, 6), (43108, 6), (172431, 5), (43108, 5))

In [231]:
X_train.month.value_counts(), y_train.month.value_counts()

(5     31426
 8     28570
 10    28570
 7     28569
 9     27648
 6     27648
 Name: month, dtype: int64,
 5     31426
 8     28570
 10    28570
 7     28569
 9     27648
 6     27648
 Name: month, dtype: int64)

In [232]:
X_val.month.value_counts(), y_val.month.value_counts()

(5     7857
 7     7143
 10    7142
 8     7142
 9     6912
 6     6912
 Name: month, dtype: int64,
 5     7857
 7     7143
 10    7142
 8     7142
 9     6912
 6     6912
 Name: month, dtype: int64)

### Scaling

In [450]:
from sklearn.preprocessing import StandardScaler

In [51]:
# months col 제거

X_train = X_train.drop(['month'], axis=1)
y_train = y_train.drop(['month'], axis=1)

# X_val = X_val.drop(['month'], axis=1)
# y_val = y_val.drop(['month'], axis=1)

X_test = X_test.drop(['month'], axis=1)
y_test = y_test.drop(['month'], axis=1)

In [73]:
y_train = y_train.drop(['month'], axis=1)
y_test = y_test.drop(['month'], axis=1)

In [74]:
X_train.shape, X_test.shape

((219110, 10), (54778, 10))

In [492]:
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
# X_val = ss.transform(X_val)
X_test = ss.transform(X_test)
X_sub = ss.transform(X_sub)

### Learning

In [36]:
from sklearn.ensemble import RandomForestRegressor

In [75]:
rfr = RandomForestRegressor(n_jobs=4)

rfr.fit(X_train, y_train)

In [76]:
y_pred = rfr.predict(X_test)

In [None]:
rfr.

In [39]:
from sklearn.metrics import mean_squared_error, r2_score

In [77]:
np.sqrt(mean_squared_error(y_test, y_pred)) / r2_score(y_test, y_pred)

2.5163298121618505

In [78]:
rfr.n_estimators

100

In [90]:
# Grid Search

from sklearn.model_selection import GridSearchCV, KFold

In [96]:
param = {
    'n_estimators' : range(100, 200, 5),
    'max_depth' : range(10, 15)
}

In [97]:
kfold = KFold(shuffle=True, random_state=2022)

In [99]:
rfr2 = RandomForestRegressor(n_jobs=4, verbose=True)

In [100]:
grid = GridSearchCV(rfr2, param, n_jobs=4, cv=kfold, refit=True, verbose=True)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   20.4s
[Parallel(n_jobs=4)]: Done 140 out of 140 | elapsed:  1.1min finished


In [101]:
grid.best_params_

{'max_depth': 14, 'n_estimators': 140}

In [102]:
y_pred = grid.best_estimator_.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 140 out of 140 | elapsed:    0.4s finished


In [103]:
np.sqrt(mean_squared_error(y_test, y_pred)) / r2_score(y_test, y_pred)

3.05557839646147

In [None]:
'''
n_estimators: 100 = 3.317265
n_estimators: 190 = 3.310137

'''

### Model save

In [43]:
import pickle

In [79]:
saved_model = pickle.dump(rfr, open('./model/RFR_n_100.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

In [104]:
saved_model = pickle.dump(grid.best_estimator_, open('./model/RFR_n_140_d_14.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

### Model load

In [105]:
loaded_model = pickle.load(open('./model/RFR_n_140_d_14.pkl', 'rb'))

In [106]:
y_pred = loaded_model.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 140 out of 140 | elapsed:    0.4s finished


In [107]:
np.sqrt(mean_squared_error(y_test, y_pred)) / r2_score(y_test, y_pred)

3.0555783964614704

### Make Sub_mission

In [67]:
X_sub = X_sub.to_numpy()

In [108]:
y_pred = loaded_model.predict(X_sub)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 140 out of 140 | elapsed:    0.0s finished


In [109]:
y_pred

array([[277.71162414, 267.02988588, 269.15780949, 268.7784851 ],
       [280.44548241, 267.80197539, 268.17745816, 266.31792903],
       [284.30447244, 274.06840915, 271.75850388, 265.04088874],
       ...,
       [297.86982872, 304.32174915, 306.76995855, 310.24194618],
       [309.96727092, 308.13969815, 310.45471647, 311.91202518],
       [325.98825049, 313.63132987, 315.13579553, 315.04426702]])

In [110]:
sub_mission = pd.read_csv('./data/sample_submission.csv')

In [111]:
sub_mission

Unnamed: 0,ymdhm,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-06-01 00:00,0,0,0,0
1,2022-06-01 00:10,0,0,0,0
2,2022-06-01 00:20,0,0,0,0
3,2022-06-01 00:30,0,0,0,0
4,2022-06-01 00:40,0,0,0,0
...,...,...,...,...,...
6907,2022-07-18 23:10,0,0,0,0
6908,2022-07-18 23:20,0,0,0,0
6909,2022-07-18 23:30,0,0,0,0
6910,2022-07-18 23:40,0,0,0,0


In [112]:
sub_mission[['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']] = y_pred

In [113]:
sub_mission

Unnamed: 0,ymdhm,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-06-01 00:00,277.711624,267.029886,269.157809,268.778485
1,2022-06-01 00:10,280.445482,267.801975,268.177458,266.317929
2,2022-06-01 00:20,284.304472,274.068409,271.758504,265.040889
3,2022-06-01 00:30,284.452746,273.045440,271.363350,262.163577
4,2022-06-01 00:40,284.766017,274.061326,271.543797,258.314175
...,...,...,...,...,...
6907,2022-07-18 23:10,286.782483,292.151038,296.195436,308.356015
6908,2022-07-18 23:20,286.684645,298.850694,300.045765,308.912993
6909,2022-07-18 23:30,297.869829,304.321749,306.769959,310.241946
6910,2022-07-18 23:40,309.967271,308.139698,310.454716,311.912025


In [114]:
sub_mission.to_csv('./result/sample_submission.csv', index=False)