# Flood Prediction

In [218]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data load

In [219]:
datas = []

for rep in range(2012, 2023):
    data = pd.read_csv('./data/water_data/data_' + str(rep) + '.csv')
    datas.append(data)

In [220]:
for data in datas:
    print(data.shape) # 2012 ~ 2021: (26496, 15), 2022: (11376, 15)

(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(26496, 15)
(11376, 15)


In [221]:
for data in datas:
    print(data.duplicated().sum()) # 중복 데이터 x

0
0
0
0
0
0
0
0
0
0
0


In [222]:
# 모든 년도 data의 column corr 추출()

data_all = pd.concat([data for data in datas], axis=0)

data_corr = data_all.corr()

data_corr['wl_1018662'].sort_values(ascending=False) # wl(target) 기준 tototf, inf, fw이 상관관계 있음

wl_1018662    1.000000
wl_1018680    0.993364
wl_1018683    0.992271
wl_1019630    0.958617
fw_1019630    0.752340
fw_1018662    0.741266
tototf        0.709398
inf           0.699971
fw_1018683    0.655167
ecpc          0.063409
tide_level   -0.002609
sfw          -0.063405
swl          -0.066725
fw_1018680         NaN
Name: wl_1018662, dtype: float64

In [223]:
data_all.fw_1018680.value_counts() # fw_1018680: 0 or nan

0.0    79487
Name: fw_1018680, dtype: int64

In [224]:
data_all.isna().sum() # nan 개수 추출

ymdhm              0
swl              743
inf              743
sfw              743
ecpc             743
tototf           743
tide_level      4927
wl_1018662        59
fw_1018662     16380
wl_1018680        59
fw_1018680    196849
wl_1018683        59
fw_1018683      1279
wl_1019630        59
fw_1019630        59
dtype: int64

In [225]:
data_all.isin([0]).sum() # 0 개수 추출

ymdhm             0
swl              83
inf           10122
sfw              82
ecpc              0
tototf          447
tide_level        0
wl_1018662     6912
fw_1018662        0
wl_1018680     6912
fw_1018680    79487
wl_1018683     6912
fw_1018683      183
wl_1019630     6912
fw_1019630        0
dtype: int64

In [226]:
# sns.pairplot(data_corr)
# plt.show()

In [227]:
# wl과 corr >= 0.5인 totof, inf, fw를 feature selection

### Data Preprocessing

#### 1. 2012 data

In [228]:
data_2012 = datas[0].copy()

data_2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26495 non-null  float64
 2   inf         26495 non-null  float64
 3   sfw         26495 non-null  float64
 4   ecpc        26495 non-null  float64
 5   tototf      26495 non-null  float64
 6   tide_level  25720 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26496 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  26496 non-null  float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [229]:
data_2012.isna().sum() # data_2012 nan 개수

ymdhm           0
swl             1
inf             1
sfw             1
ecpc            1
tototf          1
tide_level    776
wl_1018662      0
fw_1018662      0
wl_1018680      0
fw_1018680      0
wl_1018683      0
fw_1018683      0
wl_1019630      0
fw_1019630      0
dtype: int64

In [230]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2012 = data_2012[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2012.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2012-05-01 00:00,555.0,555.0,469.05,729.8,540.18,310.7,300.2,290.0,275.3
1,2012-05-01 00:10,464.6,562.9,498.0,731.48,540.18,314.7,300.2,290.0,275.3
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3


In [231]:
data_2012.isna().sum() # inf: 1, tototf: 1

ymdhm         0
inf           1
tototf        1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
dtype: int64

In [232]:
# mean 할당 (단, 월별 mean값 부여)

In [233]:
# row 별 month 추출 method

def get_month(data):
    idx = data.iloc[0, 0].split('-')[1]
    months = []

    for idx in range(len(data)):
        idx = data.iloc[idx, 0].split('-')[1]
        
        months.append(int(idx))

    months = np.array(months)
    
    return months

In [234]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2012)

months[:5]

array([5, 5, 5, 5, 5])

In [235]:
# month column 추가

months = months.reshape(-1)

data_2012['month'] = months

data_2012.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.8,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.0,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5


In [236]:
# data_2012 월별 split

data_2012_m5 = data_2012.groupby('month').get_group(5)
data_2012_m6 = data_2012.groupby('month').get_group(6)
data_2012_m7 = data_2012.groupby('month').get_group(7)
data_2012_m8 = data_2012.groupby('month').get_group(8)
data_2012_m9 = data_2012.groupby('month').get_group(9)
data_2012_m10 = data_2012.groupby('month').get_group(10)

In [237]:
# 월별 nan값 찾는 method

def find_nan(datas):
    month = 5
    
    for data in datas:
        print(str(month) + '월')
        print(data.isna().sum())
        print()
        
        month += 1

In [238]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10]) # 7월 nan 탐색

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           1
tototf        1
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [239]:
# nan 값 mean으로 변경

inf_mean = data_2012_m7.inf.mean()
tototf_mean = data_2012_m7.tototf.mean()

data_2012_m7.inf[data_2012_m7.inf.isna()] = inf_mean
data_2012_m7.tototf[data_2012_m7.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2012_m7.inf[data_2012_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2012_m7.inf[data_2012_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2012_m7.tototf[data_2012_m7.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [240]:
find_nan([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [241]:
# dataset 생성

dataset = pd.concat([data_2012_m5, data_2012_m6, data_2012_m7, data_2012_m8, data_2012_m9, data_2012_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2012-10-31 23:10,270.2,270.2,1018.67,1395.30,1837.11,372.7,364.2,365.0,351.3,10
26492,2012-10-31 23:20,264.1,264.1,945.75,1154.21,1814.92,365.7,359.2,362.0,350.3,10
26493,2012-10-31 23:30,257.9,257.9,915.35,968.58,1792.88,362.7,356.2,359.0,349.3,10
26494,2012-10-31 23:40,264.1,264.1,846.39,776.99,1727.56,355.7,352.2,356.0,346.3,10


#### 2. 2013 data

In [242]:
data_2013 = datas[1].copy()

data_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26496 non-null  float64
 2   inf         26496 non-null  float64
 3   sfw         26496 non-null  float64
 4   ecpc        26496 non-null  float64
 5   tototf      26496 non-null  float64
 6   tide_level  26481 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26496 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  26496 non-null  float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [243]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2013 = data_2013[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2013.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2013-05-01 00:00,151.0,151.0,639.32,-993.41,958.36,332.7,317.2,314.0,305.3
1,2013-05-01 00:10,173.8,173.8,690.47,-775.22,911.07,338.7,327.2,316.0,302.3
2,2013-05-01 00:20,173.8,173.8,734.64,-380.85,834.96,343.7,334.2,324.0,297.3
3,2013-05-01 00:30,173.8,173.8,770.98,-52.67,762.23,347.7,337.2,326.0,292.3
4,2013-05-01 00:40,73.9,173.9,789.5,237.34,706.48,349.7,338.2,327.0,288.3


In [244]:
data_2013.isna().sum() # inf: 0, tototf: 0

ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
dtype: int64

In [245]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2013)

months[:5]

array([5, 5, 5, 5, 5])

In [246]:
# month column 추가

months = months.reshape(-1)

data_2013['month'] = months

data_2013.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2013-05-01 00:00,151.0,151.0,639.32,-993.41,958.36,332.7,317.2,314.0,305.3,5
1,2013-05-01 00:10,173.8,173.8,690.47,-775.22,911.07,338.7,327.2,316.0,302.3,5
2,2013-05-01 00:20,173.8,173.8,734.64,-380.85,834.96,343.7,334.2,324.0,297.3,5
3,2013-05-01 00:30,173.8,173.8,770.98,-52.67,762.23,347.7,337.2,326.0,292.3,5
4,2013-05-01 00:40,73.9,173.9,789.5,237.34,706.48,349.7,338.2,327.0,288.3,5


In [247]:
# dataset 생성

dataset = pd.concat([dataset, data_2013], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2013-10-31 23:10,171.9,171.9,280.22,372.96,366.60,280.7,274.2,268.0,259.3,10
26492,2013-10-31 23:20,71.0,171.0,280.22,348.63,366.60,280.7,274.2,268.0,259.3,10
26493,2013-10-31 23:30,70.6,170.6,280.22,315.76,376.45,280.7,273.2,267.0,260.3,10
26494,2013-10-31 23:40,70.2,170.2,280.22,263.13,376.45,280.7,273.2,267.0,260.3,10


#### 3. 2014 data

In [248]:
data_2014 = datas[2].copy()

data_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26476 non-null  float64
 2   inf         26476 non-null  float64
 3   sfw         26476 non-null  float64
 4   ecpc        26476 non-null  float64
 5   tototf      26476 non-null  float64
 6   tide_level  26492 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26496 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  26495 non-null  float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [249]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2014 = data_2014[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2014.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2014-05-01 00:00,132.0,132.0,582.21,350.33,1056.61,325.7,316.2,315.0,311.3
1,2014-05-01 00:10,126.3,126.3,590.2,249.84,1006.88,326.7,315.2,312.0,308.3
2,2014-05-01 00:20,126.3,126.3,590.2,206.48,958.36,326.7,315.2,311.0,305.3
3,2014-05-01 00:30,132.2,132.2,598.25,249.5,911.07,327.7,315.2,311.0,302.3
4,2014-05-01 00:40,126.2,126.2,598.25,305.01,849.91,327.7,314.2,310.0,298.3


In [250]:
data_2014.isna().sum() # inf: 20, tototf: 20

ymdhm          0
inf           20
tototf        20
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
dtype: int64

In [251]:
# mean 할당 (단, 월별 mean값 부여)

In [252]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2014)

months[:5]

array([5, 5, 5, 5, 5])

In [253]:
# month column 추가

months = months.reshape(-1)

data_2014['month'] = months

data_2014.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2014-05-01 00:00,132.0,132.0,582.21,350.33,1056.61,325.7,316.2,315.0,311.3,5
1,2014-05-01 00:10,126.3,126.3,590.2,249.84,1006.88,326.7,315.2,312.0,308.3,5
2,2014-05-01 00:20,126.3,126.3,590.2,206.48,958.36,326.7,315.2,311.0,305.3,5
3,2014-05-01 00:30,132.2,132.2,598.25,249.5,911.07,327.7,315.2,311.0,302.3,5
4,2014-05-01 00:40,126.2,126.2,598.25,305.01,849.91,327.7,314.2,310.0,298.3,5


In [254]:
# data_2014 월별 split

data_2014_m5 = data_2014.groupby('month').get_group(5)
data_2014_m6 = data_2014.groupby('month').get_group(6)
data_2014_m7 = data_2014.groupby('month').get_group(7)
data_2014_m8 = data_2014.groupby('month').get_group(8)
data_2014_m9 = data_2014.groupby('month').get_group(9)
data_2014_m10 = data_2014.groupby('month').get_group(10)

In [255]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10]) # 8, 10월 nan 탐색

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm          0
inf           20
tototf        20
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf 

In [256]:
# nan 값 mean으로 변경 (8월)

inf_mean = data_2014_m8.inf.mean()
tototf_mean = data_2014_m8.tototf.mean()

data_2014_m8.inf[data_2014_m8.inf.isna()] = inf_mean
data_2014_m8.tototf[data_2014_m8.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2014_m8.inf[data_2014_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2014_m8.inf[data_2014_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2014_m8.tototf[data_2014_m8.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [257]:
find_nan([data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [258]:
# dataset 생성

dataset = pd.concat([dataset, data_2014_m5, data_2014_m6, data_2014_m7, data_2014_m8, data_2014_m9, data_2014_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2014-10-31 23:10,145.4,145.4,319.84,257.19,471.08,287.7,275.2,273.0,269.3,10
26492,2014-10-31 23:20,145.3,145.3,319.84,256.83,471.08,287.7,275.2,272.0,269.3,10
26493,2014-10-31 23:30,145.2,145.2,319.84,252.76,460.03,287.7,275.2,272.0,268.3,10
26494,2014-10-31 23:40,245.1,145.1,319.84,252.76,460.03,287.7,275.2,272.0,268.3,10


#### 4. 2015 data

In [259]:
data_2015 = datas[3].copy()

data_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26483 non-null  float64
 2   inf         26483 non-null  float64
 3   sfw         26483 non-null  float64
 4   ecpc        26483 non-null  float64
 5   tototf      26483 non-null  float64
 6   tide_level  26483 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  22709 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26093 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [260]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2015 = data_2015[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2015.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2015-05-01 00:00,47.0,147.0,302.53,227.02,427.69,284.7,268.2,268.0,265.3
1,2015-05-01 00:10,151.2,151.2,302.53,267.56,427.69,284.7,268.2,268.0,265.3
2,2015-05-01 00:20,151.1,151.1,308.24,320.26,427.69,285.7,268.2,268.0,265.3
3,2015-05-01 00:30,51.2,151.2,314.01,271.61,427.69,286.7,268.2,268.0,265.3
4,2015-05-01 00:40,51.2,151.2,314.01,312.15,417.17,286.7,268.2,268.0,264.3


In [261]:
data_2015.isna().sum() # inf: 13, tototf: 13, fw_1018662: 3787, fw_1018683: 403

ymdhm            0
inf             13
tototf          13
fw_1018662    3787
fw_1018683     403
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [262]:
# mean, 0 할당 (단, 월별 mean값 부여) (전체 nan인 경우는 0값 부여)

In [263]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2015)

months[:5]

array([5, 5, 5, 5, 5])

In [264]:
# month column 추가

months = months.reshape(-1)

data_2015['month'] = months

data_2015.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2015-05-01 00:00,47.0,147.0,302.53,227.02,427.69,284.7,268.2,268.0,265.3,5
1,2015-05-01 00:10,151.2,151.2,302.53,267.56,427.69,284.7,268.2,268.0,265.3,5
2,2015-05-01 00:20,151.1,151.1,308.24,320.26,427.69,285.7,268.2,268.0,265.3,5
3,2015-05-01 00:30,51.2,151.2,314.01,271.61,427.69,286.7,268.2,268.0,265.3,5
4,2015-05-01 00:40,51.2,151.2,314.01,312.15,417.17,286.7,268.2,268.0,264.3,5


In [265]:
# data_2015 월별 split

data_2015_m5 = data_2015.groupby('month').get_group(5)
data_2015_m6 = data_2015.groupby('month').get_group(6)
data_2015_m7 = data_2015.groupby('month').get_group(7)
data_2015_m8 = data_2015.groupby('month').get_group(8)
data_2015_m9 = data_2015.groupby('month').get_group(9)
data_2015_m10 = data_2015.groupby('month').get_group(10)

In [266]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 5, 6, 7, 8, 9, 10월 nan 탐색

5월
ymdhm           0
inf             9
tototf          9
fw_1018662    537
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
inf             3
tototf          3
fw_1018662    743
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
inf             0
tototf          0
fw_1018662     54
fw_1018683    188
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
inf             0
tototf          0
fw_1018662    487
fw_1018683    178
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             1
tototf          1
fw_1018662    933
fw_1018683     22
fw_1019630      0
wl_1018662      0
wl_1018680 

In [267]:
# nan 값 mean으로 변경 (5월)

inf_mean = data_2015_m5.inf.mean()
tototf_mean = data_2015_m5.tototf.mean()

data_2015_m5.inf[data_2015_m5.inf.isna()] = inf_mean
data_2015_m5.tototf[data_2015_m5.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m5.inf[data_2015_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m5.inf[data_2015_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m5.tototf[data_2015_m5.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [268]:
# nan 값 mean으로 변경 (5월)

fw62_mean = data_2015_m5.fw_1018662.mean()

data_2015_m5.fw_1018662[data_2015_m5.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m5.fw_1018662[data_2015_m5.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m5.fw_1018662[data_2015_m5.fw_1018662.isna()] = fw62_mean


In [269]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
inf             3
tototf          3
fw_1018662    743
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
inf             0
tototf          0
fw_1018662     54
fw_1018683    188
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
inf             0
tototf          0
fw_1018662    487
fw_1018683    178
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             1
tototf          1
fw_1018662    933
fw_1018683     22
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683     

In [270]:
# nan 값 mean으로 변경 (6월)

inf_mean = data_2015_m6.inf.mean()
tototf_mean = data_2015_m6.tototf.mean()

data_2015_m6.inf[data_2015_m6.inf.isna()] = inf_mean
data_2015_m6.tototf[data_2015_m6.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m6.inf[data_2015_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m6.inf[data_2015_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m6.tototf[data_2015_m6.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [271]:
# nan 값 mean으로 변경 (6월)

fw62_mean = data_2015_m6.fw_1018662.mean()

data_2015_m6.fw_1018662[data_2015_m6.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m6.fw_1018662[data_2015_m6.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m6.fw_1018662[data_2015_m6.fw_1018662.isna()] = fw62_mean


In [272]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm           0
inf             0
tototf          0
fw_1018662     54
fw_1018683    188
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm           0
inf             0
tototf          0
fw_1018662    487
fw_1018683    178
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             1
tototf          1
fw_1018662    933
fw_1018683     22
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
m

In [273]:
# nan 값 mean으로 변경 (7월)

fw62_mean = data_2015_m7.fw_1018662.mean()
fw83_mean = data_2015_m7.fw_1018683.mean()

data_2015_m7.fw_1018662[data_2015_m7.fw_1018662.isna()] = fw62_mean
data_2015_m7.fw_1018683[data_2015_m7.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m7.fw_1018662[data_2015_m7.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m7.fw_1018662[data_2015_m7.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m7.fw_1018683[data_2015_m7.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [274]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             0
tototf          0
fw_1018662    487
fw_1018683    178
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             1
tototf          1
fw_1018662    933
fw_1018683     22
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype

In [275]:
# nan 값 mean으로 변경 (8월)

fw62_mean = data_2015_m8.fw_1018662.mean()
fw83_mean = data_2015_m8.fw_1018683.mean()

data_2015_m8.fw_1018662[data_2015_m8.fw_1018662.isna()] = fw62_mean
data_2015_m8.fw_1018683[data_2015_m8.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m8.fw_1018662[data_2015_m8.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m8.fw_1018662[data_2015_m8.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m8.fw_1018683[data_2015_m8.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [276]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf             1
tototf          1
fw_1018662    933
fw_1018683     22
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10월
ymdhm    

In [277]:
# nan 값 mean으로 변경 (9월)

inf_mean = data_2015_m9.inf.mean()
tototf_mean = data_2015_m9.tototf.mean()

data_2015_m9.inf[data_2015_m9.inf.isna()] = inf_mean
data_2015_m9.tototf[data_2015_m9.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m9.inf[data_2015_m9.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m9.inf[data_2015_m9.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m9.tototf[data_2015_m9.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [278]:
# nan 값 mean으로 변경 (9월)

fw62_mean = data_2015_m9.fw_1018662.mean()
fw83_mean = data_2015_m9.fw_1018683.mean()

data_2015_m9.fw_1018662[data_2015_m9.fw_1018662.isna()] = fw62_mean
data_2015_m9.fw_1018683[data_2015_m9.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m9.fw_1018662[data_2015_m9.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m9.fw_1018662[data_2015_m9.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m9.fw_1018683[data_2015_m9.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [279]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm            0
inf         

In [280]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2015_m10.fw_1018662.mean()
fw83_mean = data_2015_m10.fw_1018683.mean()

data_2015_m10.fw_1018662[data_2015_m10.fw_1018662.isna()] = fw62_mean
data_2015_m10.fw_1018683[data_2015_m10.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m10.fw_1018662[data_2015_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m10.fw_1018662[data_2015_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2015_m10.fw_1018683[data_2015_m10.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [281]:
find_nan([data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [282]:
# dataset 생성

dataset = pd.concat([dataset, data_2015_m5, data_2015_m6, data_2015_m7, data_2015_m8, data_2015_m9, data_2015_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.00,555.00,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.60,562.90,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.10,576.40,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.80,563.10,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.10,576.40,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2015-10-31 23:10,-20.88,83.32,875.60,-1845.96,1579.97,358.7,340.2,336.0,339.3,10
26492,2015-10-31 23:20,-20.87,83.33,955.99,-1338.61,1579.97,366.7,349.2,346.0,339.3,10
26493,2015-10-31 23:30,-125.03,83.37,1040.02,-967.58,1539.03,374.7,355.2,352.0,337.3,10
26494,2015-10-31 23:40,-332.12,83.48,1127.69,-791.01,1458.80,382.7,360.2,357.0,333.3,10


#### 5. 2016 data

In [283]:
data_2016 = datas[4].copy()

data_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26488 non-null  float64
 2   inf         26488 non-null  float64
 3   sfw         26488 non-null  float64
 4   ecpc        26488 non-null  float64
 5   tototf      26488 non-null  float64
 6   tide_level  25840 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  25247 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26122 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [284]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2016 = data_2016[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2016.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2016-05-01 00:00,451.0,654.0,512.81,677.62,748.09,316.7,304.2,300.0,291.3
1,2016-05-01 00:10,230.64,128.94,512.81,681.91,762.23,316.7,304.2,300.0,292.3
2,2016-05-01 00:20,128.98,128.98,505.38,669.04,762.23,315.7,303.2,300.0,292.3
3,2016-05-01 00:30,27.64,129.04,483.42,514.65,762.23,312.7,303.2,300.0,292.3
4,2016-05-01 00:40,27.32,129.02,483.42,493.2,762.23,312.7,302.2,300.0,292.3


In [285]:
data_2016.isna().sum() # inf: 8, tototf: 8, fw_1018662: 1249, fw_1018683: 374

ymdhm            0
inf              8
tototf           8
fw_1018662    1249
fw_1018683     374
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [286]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2016)

months[:5]

array([5, 5, 5, 5, 5])

In [287]:
# month column 추가

months = months.reshape(-1)

data_2016['month'] = months

data_2016.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2016-05-01 00:00,451.0,654.0,512.81,677.62,748.09,316.7,304.2,300.0,291.3,5
1,2016-05-01 00:10,230.64,128.94,512.81,681.91,762.23,316.7,304.2,300.0,292.3,5
2,2016-05-01 00:20,128.98,128.98,505.38,669.04,762.23,315.7,303.2,300.0,292.3,5
3,2016-05-01 00:30,27.64,129.04,483.42,514.65,762.23,312.7,303.2,300.0,292.3,5
4,2016-05-01 00:40,27.32,129.02,483.42,493.2,762.23,312.7,302.2,300.0,292.3,5


In [288]:
# data_2016 월별 split

data_2016_m5 = data_2016.groupby('month').get_group(5)
data_2016_m6 = data_2016.groupby('month').get_group(6)
data_2016_m7 = data_2016.groupby('month').get_group(7)
data_2016_m8 = data_2016.groupby('month').get_group(8)
data_2016_m9 = data_2016.groupby('month').get_group(9)
data_2016_m10 = data_2016.groupby('month').get_group(10)

In [289]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 5,6, 7, 8, 9, 10월 nan 탐색

5월
ymdhm           0
inf             0
tototf          0
fw_1018662      0
fw_1018683    101
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    97
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
inf             8
tototf          8
fw_1018662    126
fw_1018683    273
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    372
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_10196

In [290]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2016_m5.fw_1018683.mean()

data_2016_m5.fw_1018683[data_2016_m5.fw_1018683.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m5.fw_1018683[data_2016_m5.fw_1018683.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m5.fw_1018683[data_2016_m5.fw_1018683.isna()] = fw_mean


In [291]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    97
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
inf             8
tototf          8
fw_1018662    126
fw_1018683    273
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    372
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month       

In [292]:
# nan 값 mean으로 변경 (7월)

fw_mean = data_2016_m7.fw_1018662.mean()

data_2016_m7.fw_1018662[data_2016_m7.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m7.fw_1018662[data_2016_m7.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m7.fw_1018662[data_2016_m7.fw_1018662.isna()] = fw_mean


In [293]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             8
tototf          8
fw_1018662    126
fw_1018683    273
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    372
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype

In [294]:
# nan 값 mean으로 변경 (8월)

inf_mean = data_2016_m8.inf.mean()
tototf_mean = data_2016_m8.tototf.mean()

data_2016_m8.inf[data_2016_m8.inf.isna()] = inf_mean
data_2016_m8.tototf[data_2016_m8.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m8.inf[data_2016_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m8.inf[data_2016_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m8.tototf[data_2016_m8.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [295]:
# nan 값 mean으로 변경 (8월)

fw62_mean = data_2016_m8.fw_1018662.mean()
fw83_mean = data_2016_m8.fw_1018683.mean()

data_2016_m8.fw_1018662[data_2016_m8.fw_1018662.isna()] = fw62_mean
data_2016_m8.fw_1018683[data_2016_m8.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m8.fw_1018662[data_2016_m8.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m8.fw_1018662[data_2016_m8.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m8.fw_1018683[data_2016_m8.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [296]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    372
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10월
ymdhm    

In [297]:
# nan 값 mean으로 변경 (9월)

fw62_mean = data_2016_m9.fw_1018662.mean()

data_2016_m9.fw_1018662[data_2016_m9.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m9.fw_1018662[data_2016_m9.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m9.fw_1018662[data_2016_m9.fw_1018662.isna()] = fw62_mean


In [298]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm           0
inf          

In [299]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2016_m10.fw_1018662.mean()

data_2016_m10.fw_1018662[data_2016_m10.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m10.fw_1018662[data_2016_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2016_m10.fw_1018662[data_2016_m10.fw_1018662.isna()] = fw62_mean


In [300]:
find_nan([data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [301]:
# dataset 생성

dataset = pd.concat([dataset, data_2016_m5, data_2016_m6, data_2016_m7, data_2016_m8, data_2016_m9, data_2016_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.00,555.00,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.60,562.90,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.10,576.40,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.80,563.10,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.10,576.40,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2016-10-31 23:10,191.43,87.53,690.47,937.43,1159.75,338.7,325.2,326.0,317.3,10
26492,2016-10-31 23:20,87.58,87.58,631.00,776.45,1142.22,331.7,322.2,325.0,316.3,10
26493,2016-10-31 23:30,87.58,87.58,598.25,584.09,1142.22,327.7,320.2,323.0,316.3,10
26494,2016-10-31 23:40,87.58,87.58,582.21,441.31,1124.83,325.7,318.2,321.0,315.3,10


#### 6. 2017 data

In [302]:
data_2017 = datas[5].copy()

data_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26475 non-null  float64
 2   inf         26475 non-null  float64
 3   sfw         26475 non-null  float64
 4   ecpc        26475 non-null  float64
 5   tototf      26475 non-null  float64
 6   tide_level  25861 non-null  float64
 7   wl_1018662  26438 non-null  float64
 8   fw_1018662  24423 non-null  float64
 9   wl_1018680  26438 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26438 non-null  float64
 12  fw_1018683  26438 non-null  float64
 13  wl_1019630  26438 non-null  float64
 14  fw_1019630  26438 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [303]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2017 = data_2017[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2017.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2017-05-01 00:00,11.0,114.0,622.73,-944.41,805.46,330.7,309.2,304.0,295.3
1,2017-05-01 00:10,16.6,119.9,647.71,-523.16,748.09,333.7,314.2,312.0,291.3
2,2017-05-01 00:20,16.61,119.91,673.2,-131.32,692.88,336.7,317.2,315.0,287.3
3,2017-05-01 00:30,0.0,119.95,681.8,157.8,639.83,337.7,317.2,316.0,283.3
4,2017-05-01 00:40,0.0,119.97,673.2,403.27,614.11,336.7,317.2,316.0,281.3


In [304]:
data_2017.isna().sum() # inf: 20, tototf: 20, fw_1018680: 1

ymdhm            0
inf             21
tototf          21
fw_1018662    2073
fw_1018683      58
fw_1019630      58
wl_1018662      58
wl_1018680      58
wl_1018683      58
wl_1019630      58
dtype: int64

In [305]:
data_2017.isna().sum()

ymdhm            0
inf             21
tototf          21
fw_1018662    2073
fw_1018683      58
fw_1019630      58
wl_1018662      58
wl_1018680      58
wl_1018683      58
wl_1019630      58
dtype: int64

In [306]:
# mean 할당 (단, 월별 mean값 부여)

In [307]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2017)

months[:5]

array([5, 5, 5, 5, 5])

In [308]:
# month column 추가

months = months.reshape(-1)

data_2017['month'] = months

data_2017.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2017-05-01 00:00,11.0,114.0,622.73,-944.41,805.46,330.7,309.2,304.0,295.3,5
1,2017-05-01 00:10,16.6,119.9,647.71,-523.16,748.09,333.7,314.2,312.0,291.3,5
2,2017-05-01 00:20,16.61,119.91,673.2,-131.32,692.88,336.7,317.2,315.0,287.3,5
3,2017-05-01 00:30,0.0,119.95,681.8,157.8,639.83,337.7,317.2,316.0,283.3,5
4,2017-05-01 00:40,0.0,119.97,673.2,403.27,614.11,336.7,317.2,316.0,281.3,5


In [309]:
# data_2014 월별 split

data_2017_m5 = data_2017.groupby('month').get_group(5)
data_2017_m6 = data_2017.groupby('month').get_group(6)
data_2017_m7 = data_2017.groupby('month').get_group(7)
data_2017_m8 = data_2017.groupby('month').get_group(8)
data_2017_m9 = data_2017.groupby('month').get_group(9)
data_2017_m10 = data_2017.groupby('month').get_group(10)

In [310]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10]) # 5, 6, 7, 9, 10월 nan 탐색

5월
ymdhm           0
inf            14
tototf         14
fw_1018662    992
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
inf             1
tototf          1
fw_1018662    838
fw_1018683     58
fw_1019630     58
wl_1018662     58
wl_1018680     58
wl_1018683     58
wl_1019630     58
month           0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    14
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    106
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_10196

In [311]:
# nan 값 mean으로 변경 (5월)

inf_mean = data_2017_m5.inf.mean()
tototf_mean = data_2017_m5.tototf.mean()

data_2017_m5.inf[data_2017_m5.inf.isna()] = inf_mean
data_2017_m5.tototf[data_2017_m5.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m5.inf[data_2017_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m5.inf[data_2017_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m5.tototf[data_2017_m5.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [312]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2017_m5.fw_1018662.mean()

data_2017_m5.fw_1018662[data_2017_m5.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m5.fw_1018662[data_2017_m5.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m5.fw_1018662[data_2017_m5.fw_1018662.isna()] = fw_mean


In [313]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
inf             1
tototf          1
fw_1018662    838
fw_1018683     58
fw_1019630     58
wl_1018662     58
wl_1018680     58
wl_1018683     58
wl_1019630     58
month           0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    14
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    106
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month       

In [314]:
# nan 값 mean으로 변경 (6월)

inf_mean = data_2017_m6.inf.mean()
tototf_mean = data_2017_m6.tototf.mean()

data_2017_m6.inf[data_2017_m6.inf.isna()] = inf_mean
data_2017_m6.tototf[data_2017_m6.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m6.inf[data_2017_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m6.inf[data_2017_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m6.tototf[data_2017_m6.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [315]:
# nan 값 mean으로 변경 (6월)

fw62_mean = data_2017_m6.fw_1018662.mean()
fw83_mean = data_2017_m6.fw_1018683.mean()
fw30_mean = data_2017_m6.fw_1019630.mean()

wl62_mean = data_2017_m6.wl_1018662.mean()
wl80_mean = data_2017_m6.wl_1018680.mean()
wl83_mean = data_2017_m6.wl_1018683.mean()
wl30_mean = data_2017_m6.wl_1019630.mean()

data_2017_m6.fw_1018662[data_2017_m6.fw_1018662.isna()] = fw62_mean
data_2017_m6.fw_1018683[data_2017_m6.fw_1018683.isna()] = fw83_mean
data_2017_m6.fw_1019630[data_2017_m6.fw_1019630.isna()] = fw30_mean

data_2017_m6.wl_1018662[data_2017_m6.wl_1018662.isna()] = wl62_mean
data_2017_m6.wl_1018680[data_2017_m6.wl_1018680.isna()] = wl80_mean
data_2017_m6.wl_1018683[data_2017_m6.wl_1018683.isna()] = wl83_mean
data_2017_m6.wl_1019630[data_2017_m6.wl_1019630.isna()] = wl30_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m6.fw_1018662[data_2017_m6.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m6.fw_1018662[data_2017_m6.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m6.fw_1018683[data_2017_m6.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [316]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    14
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    106
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10

In [317]:
# nan 값 mean으로 변경 (7월)

fw62_mean = data_2017_m7.fw_1018662.mean()

data_2017_m7.fw_1018662[data_2017_m7.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m7.fw_1018662[data_2017_m7.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m7.fw_1018662[data_2017_m7.fw_1018662.isna()] = fw62_mean


In [318]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf             0
tototf          0
fw_1018662    106
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10월
ymdhm    

In [319]:
# nan 값 mean으로 변경 (9월)

fw62_mean = data_2017_m9.fw_1018662.mean()

data_2017_m9.fw_1018662[data_2017_m9.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m9.fw_1018662[data_2017_m9.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m9.fw_1018662[data_2017_m9.fw_1018662.isna()] = fw62_mean


In [320]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm           0
inf          

In [321]:
# nan 값 mean으로 변경 (10월)

inf_mean = data_2017_m10.inf.mean()
tototf_mean = data_2017_m10.tototf.mean()

data_2017_m10.inf[data_2017_m10.inf.isna()] = inf_mean
data_2017_m10.tototf[data_2017_m10.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m10.inf[data_2017_m10.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m10.inf[data_2017_m10.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m10.tototf[data_2017_m10.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [322]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2017_m10.fw_1018662.mean()

data_2017_m10.fw_1018662[data_2017_m10.fw_1018662.isna()] = fw62_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m10.fw_1018662[data_2017_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2017_m10.fw_1018662[data_2017_m10.fw_1018662.isna()] = fw62_mean


In [323]:
find_nan([data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [324]:
# dataset 생성

dataset = pd.concat([dataset, data_2017_m5, data_2017_m6, data_2017_m7, data_2017_m8, data_2017_m9, data_2017_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2017-10-31 23:10,156.0,156.0,253.57,179.12,337.86,275.7,261.2,263.0,256.3,10
26492,2017-10-31 23:20,156.0,156.0,253.57,170.98,337.86,275.7,262.2,263.0,256.3,10
26493,2017-10-31 23:30,156.0,156.0,253.57,166.91,337.86,275.7,262.2,263.0,256.3,10
26494,2017-10-31 23:40,156.0,156.0,258.79,138.41,337.86,276.7,261.2,263.0,256.3,10


#### 7. 2018 data

In [325]:
data_2018 = datas[6].copy()

data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26401 non-null  float64
 2   inf         26401 non-null  float64
 3   sfw         26401 non-null  float64
 4   ecpc        26401 non-null  float64
 5   tototf      26401 non-null  float64
 6   tide_level  26366 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  26290 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26380 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [326]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2018 = data_2018[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2018.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2018-05-01 00:00,314.0,314.0,707.97,458.22,1107.57,340.7,324.2,327.0,314.3
1,2018-05-01 00:10,175.0,276.0,707.97,453.16,1073.46,340.7,323.2,326.0,312.3
2,2018-05-01 00:20,262.0,262.0,707.97,483.62,1039.9,340.7,322.2,325.0,310.3
3,2018-05-01 00:30,262.0,262.0,690.47,527.28,990.57,338.7,321.2,324.0,307.3
4,2018-05-01 00:40,262.0,262.0,690.47,561.97,958.36,338.7,320.2,323.0,305.3


In [327]:
data_2018.isna().sum() # inf: 95, tototf: 95, fw_1018662: 206, fw_1018683: 116

ymdhm           0
inf            95
tototf         95
fw_1018662    206
fw_1018683    116
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
dtype: int64

In [328]:
# mean 할당 (단, 월별 mean값 부여)

In [329]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2018)

months[:5]

array([5, 5, 5, 5, 5])

In [330]:
# month column 추가

months = months.reshape(-1)

data_2018['month'] = months

data_2018.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2018-05-01 00:00,314.0,314.0,707.97,458.22,1107.57,340.7,324.2,327.0,314.3,5
1,2018-05-01 00:10,175.0,276.0,707.97,453.16,1073.46,340.7,323.2,326.0,312.3,5
2,2018-05-01 00:20,262.0,262.0,707.97,483.62,1039.9,340.7,322.2,325.0,310.3,5
3,2018-05-01 00:30,262.0,262.0,690.47,527.28,990.57,338.7,321.2,324.0,307.3,5
4,2018-05-01 00:40,262.0,262.0,690.47,561.97,958.36,338.7,320.2,323.0,305.3,5


In [331]:
# data_2018 월별 split

data_2018_m5 = data_2018.groupby('month').get_group(5)
data_2018_m6 = data_2018.groupby('month').get_group(6)
data_2018_m7 = data_2018.groupby('month').get_group(7)
data_2018_m8 = data_2018.groupby('month').get_group(8)
data_2018_m9 = data_2018.groupby('month').get_group(9)
data_2018_m10 = data_2018.groupby('month').get_group(10)

In [332]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 5, 6, 7, 8, 9, 10월 nan 탐색

5월
ymdhm          0
inf           27
tototf        27
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

6월
ymdhm          0
inf           63
tototf        63
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

7월
ymdhm         0
inf           3
tototf        3
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             2
tototf          2
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            0
tototf         0
fw_1018662     0
fw_1018683    57
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month      

In [333]:
# nan 값 mean으로 변경 (5월)

inf_mean = data_2018_m5.inf.mean()
tototf_mean = data_2018_m5.tototf.mean()

data_2018_m5.inf[data_2018_m5.inf.isna()] = inf_mean
data_2018_m5.tototf[data_2018_m5.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m5.inf[data_2018_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m5.inf[data_2018_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m5.tototf[data_2018_m5.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [334]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm          0
inf           63
tototf        63
fw_1018662     0
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

7월
ymdhm         0
inf           3
tototf        3
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             2
tototf          2
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            0
tototf         0
fw_1018662     0
fw_1018683    57
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype

In [335]:
# nan 값 mean으로 변경 (6월)

inf_mean = data_2018_m6.inf.mean()
tototf_mean = data_2018_m6.tototf.mean()

data_2018_m6.inf[data_2018_m6.inf.isna()] = inf_mean
data_2018_m6.tototf[data_2018_m6.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m6.inf[data_2018_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m6.inf[data_2018_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m6.tototf[data_2018_m6.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [336]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           3
tototf        3
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             2
tototf          2
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            0
tototf         0
fw_1018662     0
fw_1018683    57
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

10

In [337]:
# nan 값 mean으로 변경 (7월)

inf_mean = data_2018_m7.inf.mean()
tototf_mean = data_2018_m7.tototf.mean()

data_2018_m7.inf[data_2018_m7.inf.isna()] = inf_mean
data_2018_m7.tototf[data_2018_m7.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m7.inf[data_2018_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m7.inf[data_2018_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m7.tototf[data_2018_m7.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [338]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             2
tototf          2
fw_1018662    158
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            0
tototf         0
fw_1018662     0
fw_1018683    57
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

10

In [339]:
# nan 값 mean으로 변경 (8월)

inf_mean = data_2018_m8.inf.mean()
tototf_mean = data_2018_m8.tototf.mean()

data_2018_m8.inf[data_2018_m8.inf.isna()] = inf_mean
data_2018_m8.tototf[data_2018_m8.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m8.inf[data_2018_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m8.inf[data_2018_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m8.tototf[data_2018_m8.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [340]:
# nan 값 mean으로 변경 (8월)

fw_mean = data_2018_m8.fw_1018662.mean()

data_2018_m8.fw_1018662[data_2018_m8.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m8.fw_1018662[data_2018_m8.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m8.fw_1018662[data_2018_m8.fw_1018662.isna()] = fw_mean


In [341]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm          0
inf            0
tototf         0
fw_1018662     0
fw_1018683    57
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

10월
ymdhm          0
inf

In [342]:
# nan 값 mean으로 변경 (9월)

fw_mean = data_2018_m9.fw_1018683.mean()

data_2018_m9.fw_1018683[data_2018_m9.fw_1018683.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m9.fw_1018683[data_2018_m9.fw_1018683.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m9.fw_1018683[data_2018_m9.fw_1018683.isna()] = fw_mean


In [343]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm          0
inf           

In [344]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2018_m10.fw_1018662.mean()
fw83_mean = data_2018_m10.fw_1018683.mean()

data_2018_m10.fw_1018662[data_2018_m10.fw_1018662.isna()] = fw62_mean
data_2018_m10.fw_1018683[data_2018_m10.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m10.fw_1018662[data_2018_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m10.fw_1018662[data_2018_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2018_m10.fw_1018683[data_2018_m10.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [345]:
find_nan([data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [346]:
# dataset 생성

dataset = pd.concat([dataset, data_2018_m5, data_2018_m6, data_2018_m7, data_2018_m8, data_2018_m9, data_2018_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2018-10-31 23:10,50.0,152.0,314.01,157.37,406.79,286.7,275.2,275.0,263.3,10
26492,2018-10-31 23:20,152.0,152.0,314.01,107.52,406.79,286.7,274.2,274.0,263.3,10
26493,2018-10-31 23:30,152.0,152.0,314.01,140.60,406.79,286.7,274.2,274.0,263.3,10
26494,2018-10-31 23:40,254.0,152.0,314.01,165.41,396.55,286.7,274.2,274.0,262.3,10


#### 8. 2019 data

In [347]:
data_2019 = datas[7].copy()

data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         25994 non-null  float64
 2   inf         25994 non-null  float64
 3   sfw         25994 non-null  float64
 4   ecpc        25994 non-null  float64
 5   tototf      25994 non-null  float64
 6   tide_level  26455 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  25829 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26496 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [348]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2019 = data_2019[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2019.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2019-05-01 00:00,0.0,329.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
1,2019-05-01 00:10,0.0,198.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
2,2019-05-01 00:20,0.0,137.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
3,2019-05-01 00:30,0.0,136.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3
4,2019-05-01 00:40,35.0,136.0,355.94,353.51,438.33,293.7,278.2,278.0,266.3


In [349]:
data_2019.isna().sum() # inf: 502, tototf: 502, fw_1018662: 667

ymdhm           0
inf           502
tototf        502
fw_1018662    667
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
dtype: int64

In [350]:
# mean 할당 (단, 월별 mean값 부여)

In [351]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2019)

months[:5]

array([5, 5, 5, 5, 5])

In [352]:
# month column 추가

months = months.reshape(-1)

data_2019['month'] = months

data_2019.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2019-05-01 00:00,0.0,329.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
1,2019-05-01 00:10,0.0,198.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
2,2019-05-01 00:20,0.0,137.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
3,2019-05-01 00:30,0.0,136.0,355.94,374.3,438.33,293.7,278.2,278.0,266.3,5
4,2019-05-01 00:40,35.0,136.0,355.94,353.51,438.33,293.7,278.2,278.0,266.3,5


In [353]:
# data_2014 월별 split

data_2019_m5 = data_2019.groupby('month').get_group(5)
data_2019_m6 = data_2019.groupby('month').get_group(6)
data_2019_m7 = data_2019.groupby('month').get_group(7)
data_2019_m8 = data_2019.groupby('month').get_group(8)
data_2019_m9 = data_2019.groupby('month').get_group(9)
data_2019_m10 = data_2019.groupby('month').get_group(10)

In [354]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 5, 6, 7, 8, 9, 10월 nan 탐색

5월
ymdhm           0
inf           202
tototf        202
fw_1018662    277
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
inf            36
tototf         36
fw_1018662    339
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    40
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf           157
tototf        157
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_10196

In [355]:
# nan 값 mean으로 변경 (5월)

inf_mean = data_2019_m5.inf.mean()
tototf_mean = data_2019_m5.tototf.mean()

data_2019_m5.inf[data_2019_m5.inf.isna()] = inf_mean
data_2019_m5.tototf[data_2019_m5.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m5.inf[data_2019_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m5.inf[data_2019_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m5.tototf[data_2019_m5.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [356]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2019_m5.fw_1018662.mean()

data_2019_m5.fw_1018662[data_2019_m5.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m5.fw_1018662[data_2019_m5.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m5.fw_1018662[data_2019_m5.fw_1018662.isna()] = fw_mean


In [357]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
inf            36
tototf         36
fw_1018662    339
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    40
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf           157
tototf        157
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month       

In [358]:
# nan 값 mean으로 변경 (6월)

inf_mean = data_2019_m6.inf.mean()
tototf_mean = data_2019_m6.tototf.mean()

data_2019_m6.inf[data_2019_m6.inf.isna()] = inf_mean
data_2019_m6.tototf[data_2019_m6.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m6.inf[data_2019_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m6.inf[data_2019_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m6.tototf[data_2019_m6.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [359]:
# nan 값 mean으로 변경 (6월)

fw_mean = data_2019_m6.fw_1018662.mean()

data_2019_m6.fw_1018662[data_2019_m6.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m6.fw_1018662[data_2019_m6.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m6.fw_1018662[data_2019_m6.fw_1018662.isna()] = fw_mean


In [360]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    40
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf           157
tototf        157
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10

In [361]:
# nan 값 mean으로 변경 (7월)

fw_mean = data_2019_m7.fw_1018662.mean()

data_2019_m7.fw_1018662[data_2019_m7.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m7.fw_1018662[data_2019_m7.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m7.fw_1018662[data_2019_m7.fw_1018662.isna()] = fw_mean


In [362]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    7
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf           157
tototf        157
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10월
ymdhm    

In [363]:
# nan 값 mean으로 변경 (8월)

fw_mean = data_2019_m8.fw_1018662.mean()

data_2019_m8.fw_1018662[data_2019_m8.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m8.fw_1018662[data_2019_m8.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m8.fw_1018662[data_2019_m8.fw_1018662.isna()] = fw_mean


In [364]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm           0
inf           157
tototf        157
fw_1018662      0
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

10월
ymdhm    

In [365]:
# nan 값 mean으로 변경 (9월)

inf_mean = data_2019_m9.inf.mean()
tototf_mean = data_2019_m9.tototf.mean()

data_2019_m9.inf[data_2019_m9.inf.isna()] = inf_mean
data_2019_m9.tototf[data_2019_m9.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m9.inf[data_2019_m9.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m9.inf[data_2019_m9.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m9.tototf[data_2019_m9.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [366]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm           0
inf          

In [367]:
# nan 값 mean으로 변경 (10월)

inf_mean = data_2019_m10.inf.mean()
tototf_mean = data_2019_m10.tototf.mean()

data_2019_m10.inf[data_2019_m10.inf.isna()] = inf_mean
data_2019_m10.tototf[data_2019_m10.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m10.inf[data_2019_m10.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m10.inf[data_2019_m10.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m10.tototf[data_2019_m10.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [368]:
# nan 값 mean으로 변경 (10월)

fw_mean = data_2019_m10.fw_1018662.mean()

data_2019_m10.fw_1018662[data_2019_m10.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m10.fw_1018662[data_2019_m10.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2019_m10.fw_1018662[data_2019_m10.fw_1018662.isna()] = fw_mean


In [369]:
find_nan([data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [370]:
# dataset 생성

dataset = pd.concat([dataset, data_2019_m5, data_2019_m6, data_2019_m7, data_2019_m8, data_2019_m9, data_2019_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2019-10-31 23:10,127.0,127.0,1689.09,-898.87,1996.24,427.7,405.2,400.0,358.3,10
26492,2019-10-31 23:20,133.0,133.0,1744.63,-573.68,1950.09,431.7,409.2,403.0,356.3,10
26493,2019-10-31 23:30,133.0,140.0,1758.66,14.75,1904.48,432.7,411.2,405.0,354.3,10
26494,2019-10-31 23:40,140.0,140.0,1730.66,403.06,1814.92,430.7,411.2,405.0,350.3,10


#### 9. 2020 data

In [371]:
data_2020 = datas[8].copy()

data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26487 non-null  float64
 2   inf         26487 non-null  float64
 3   sfw         26487 non-null  float64
 4   ecpc        26487 non-null  float64
 5   tototf      26487 non-null  float64
 6   tide_level  26453 non-null  float64
 7   wl_1018662  26495 non-null  float64
 8   fw_1018662  22813 non-null  float64
 9   wl_1018680  26495 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26495 non-null  float64
 12  fw_1018683  26495 non-null  float64
 13  wl_1019630  26495 non-null  float64
 14  fw_1019630  26495 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [372]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2020 = data_2020[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2020.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2020-05-01 00:00,34.0,135.0,269.4,231.3,301.39,278.7,279.2,264.0,252.3
1,2020-05-01 00:10,0.0,135.0,269.4,219.12,301.39,278.7,279.2,264.0,252.3
2,2020-05-01 00:20,0.0,135.0,269.4,215.07,301.39,278.7,279.2,264.0,252.3
3,2020-05-01 00:30,0.0,135.0,269.4,219.12,292.61,278.7,279.2,264.0,251.3
4,2020-05-01 00:40,28.0,129.0,269.4,227.24,283.96,278.7,279.2,264.0,250.3


In [373]:
data_2020.isna().sum() # inf: 9, tototf: 9, fw_1018662: 3683, fw_1018680: 26496, fw_1018683: 1, fw_1019630: 1, wl_1018662: 1, wl_1018680: 1, wl_1018683: 1, wl_1019630: 1

ymdhm            0
inf              9
tototf           9
fw_1018662    3683
fw_1018683       1
fw_1019630       1
wl_1018662       1
wl_1018680       1
wl_1018683       1
wl_1019630       1
dtype: int64

In [374]:
# mean 할당 (단, 월별 mean값 부여)

In [375]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2020)

months[:5]

array([5, 5, 5, 5, 5])

In [376]:
# month column 추가

months = months.reshape(-1)

data_2020['month'] = months

data_2020.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2020-05-01 00:00,34.0,135.0,269.4,231.3,301.39,278.7,279.2,264.0,252.3,5
1,2020-05-01 00:10,0.0,135.0,269.4,219.12,301.39,278.7,279.2,264.0,252.3,5
2,2020-05-01 00:20,0.0,135.0,269.4,215.07,301.39,278.7,279.2,264.0,252.3,5
3,2020-05-01 00:30,0.0,135.0,269.4,219.12,292.61,278.7,279.2,264.0,251.3,5
4,2020-05-01 00:40,28.0,129.0,269.4,227.24,283.96,278.7,279.2,264.0,250.3,5


In [377]:
# data_2014 월별 split

data_2020_m5 = data_2020.groupby('month').get_group(5)
data_2020_m6 = data_2020.groupby('month').get_group(6)
data_2020_m7 = data_2020.groupby('month').get_group(7)
data_2020_m8 = data_2020.groupby('month').get_group(8)
data_2020_m9 = data_2020.groupby('month').get_group(9)
data_2020_m10 = data_2020.groupby('month').get_group(10)

In [378]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10]) # 5, 6, 7, 8, 10월 nan 탐색

5월
ymdhm           0
inf             8
tototf          8
fw_1018662    563
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

6월
ymdhm           0
inf             0
tototf          0
fw_1018662    569
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
inf             1
tototf          1
fw_1018662    144
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm            0
inf              0
tototf           0
fw_1018662    1213
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    

In [379]:
# nan 값 mean으로 변경 (5월)

inf_mean = data_2020_m5.inf.mean()
tototf_mean = data_2020_m5.tototf.mean()

data_2020_m5.inf[data_2020_m5.inf.isna()] = inf_mean
data_2020_m5.tototf[data_2020_m5.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m5.inf[data_2020_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m5.inf[data_2020_m5.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m5.tototf[data_2020_m5.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [380]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2020_m5.fw_1018662.mean()

data_2020_m5.fw_1018662[data_2020_m5.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m5.fw_1018662[data_2020_m5.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m5.fw_1018662[data_2020_m5.fw_1018662.isna()] = fw_mean


In [381]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
inf             0
tototf          0
fw_1018662    569
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm           0
inf             1
tototf          1
fw_1018662    144
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm            0
inf              0
tototf           0
fw_1018662    1213
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1

In [382]:
# nan 값 mean으로 변경 (6월)

fw_mean = data_2020_m6.fw_1018662.mean()

data_2020_m6.fw_1018662[data_2020_m6.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m6.fw_1018662[data_2020_m6.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m6.fw_1018662[data_2020_m6.fw_1018662.isna()] = fw_mean


In [383]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm           0
inf             1
tototf          1
fw_1018662    144
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

8월
ymdhm            0
inf              0
tototf           0
fw_1018662    1213
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month     

In [384]:
# nan 값 mean으로 변경 (7월)

inf_mean = data_2020_m7.inf.mean()
tototf_mean = data_2020_m7.tototf.mean()

data_2020_m7.inf[data_2020_m7.inf.isna()] = inf_mean
data_2020_m7.tototf[data_2020_m7.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m7.inf[data_2020_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m7.inf[data_2020_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m7.tototf[data_2020_m7.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [385]:
# nan 값 mean으로 변경 (7월)

fw_mean = data_2020_m7.fw_1018662.mean()

data_2020_m7.fw_1018662[data_2020_m7.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m7.fw_1018662[data_2020_m7.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m7.fw_1018662[data_2020_m7.fw_1018662.isna()] = fw_mean


In [386]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm            0
inf              0
tototf           0
fw_1018662    1213
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10

In [387]:
# nan 값 mean으로 변경 (8월)

fw_mean = data_2020_m8.fw_1018662.mean()

data_2020_m8.fw_1018662[data_2020_m8.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m8.fw_1018662[data_2020_m8.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m8.fw_1018662[data_2020_m8.fw_1018662.isna()] = fw_mean


In [388]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm            0
inf         

In [389]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2020_m10.fw_1018662.mean()
fw83_mean = data_2020_m10.fw_1018683.mean()
fw30_mean = data_2020_m10.fw_1019630.mean()

wl62_mean = data_2020_m10.wl_1018662.mean()
wl80_mean = data_2020_m10.wl_1018680.mean()
wl83_mean = data_2020_m10.wl_1018683.mean()
wl30_mean = data_2020_m10.wl_1019630.mean()


data_2020_m10.fw_1018662[data_2020_m10.fw_1018662.isna()] = fw62_mean
data_2020_m10.fw_1018683[data_2020_m10.fw_1018683.isna()] = fw83_mean
data_2020_m10.fw_1019630[data_2020_m10.fw_1019630.isna()] = fw30_mean

data_2020_m10.wl_1018662[data_2020_m10.wl_1018662.isna()] = wl62_mean
data_2020_m10.wl_1018680[data_2020_m10.wl_1018680.isna()] = wl80_mean
data_2020_m10.wl_1018683[data_2020_m10.wl_1018683.isna()] = wl83_mean
data_2020_m10.wl_1019630[data_2020_m10.wl_1019630.isna()] = wl30_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m10.fw_1018662[data_2020_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m10.fw_1018662[data_2020_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2020_m10.fw_1018683[data_2020_m10.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [390]:
find_nan([data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [391]:
# dataset 생성

dataset = pd.concat([dataset, data_2020_m5, data_2020_m6, data_2020_m7, data_2020_m8, data_2020_m9, data_2020_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2020-10-31 23:10,136.0,136.0,574.28,295.70,1142.22,324.7,340.2,324.0,316.3,10
26492,2020-10-31 23:20,34.0,136.0,582.21,189.27,1107.57,325.7,339.2,322.0,314.3,10
26493,2020-10-31 23:30,0.0,137.0,590.20,215.39,1039.90,326.7,338.2,321.0,310.3,10
26494,2020-10-31 23:40,0.0,137.0,598.25,228.26,990.57,327.7,338.2,320.0,307.3,10


#### 10. 2021 data

In [392]:
data_2021 = datas[9].copy()

data_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       26496 non-null  object 
 1   swl         26458 non-null  float64
 2   inf         26458 non-null  float64
 3   sfw         26458 non-null  float64
 4   ecpc        26458 non-null  float64
 5   tototf      26458 non-null  float64
 6   tide_level  23895 non-null  float64
 7   wl_1018662  26496 non-null  float64
 8   fw_1018662  24966 non-null  float64
 9   wl_1018680  26496 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  26496 non-null  float64
 12  fw_1018683  26169 non-null  float64
 13  wl_1019630  26496 non-null  float64
 14  fw_1019630  26496 non-null  float64
dtypes: float64(14), object(1)
memory usage: 3.0+ MB


In [393]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2021 = data_2021[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2021.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2021-05-01 00:00,0.0,138.0,905.32,983.18,895.58,361.7,346.2,340.0,301.3
1,2021-05-01 00:10,0.0,138.0,856.07,1285.8,1006.88,356.7,338.2,335.0,308.3
2,2021-05-01 00:20,0.0,138.0,770.98,1381.43,1090.45,347.7,331.2,331.0,313.3
3,2021-05-01 00:30,0.0,151.0,681.8,1359.95,1124.83,337.7,327.2,329.0,315.3
4,2021-05-01 00:40,0.0,151.0,606.35,1267.65,1142.22,328.7,323.2,327.0,316.3


In [394]:
data_2021.isna().sum() # inf: 38, tototf: 38, fw_1018662: 1530, fw_1018683: 327

ymdhm            0
inf             38
tototf          38
fw_1018662    1530
fw_1018683     327
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [395]:
# mean 할당 (단, 월별 mean값 부여)

In [396]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2021)

months[:5]

array([5, 5, 5, 5, 5])

In [397]:
# month column 추가

months = months.reshape(-1)

data_2021['month'] = months

data_2021.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2021-05-01 00:00,0.0,138.0,905.32,983.18,895.58,361.7,346.2,340.0,301.3,5
1,2021-05-01 00:10,0.0,138.0,856.07,1285.8,1006.88,356.7,338.2,335.0,308.3,5
2,2021-05-01 00:20,0.0,138.0,770.98,1381.43,1090.45,347.7,331.2,331.0,313.3,5
3,2021-05-01 00:30,0.0,151.0,681.8,1359.95,1124.83,337.7,327.2,329.0,315.3,5
4,2021-05-01 00:40,0.0,151.0,606.35,1267.65,1142.22,328.7,323.2,327.0,316.3,5


In [398]:
# data_2020 월별 split

data_2021_m5 = data_2021.groupby('month').get_group(5)
data_2021_m6 = data_2021.groupby('month').get_group(6)
data_2021_m7 = data_2021.groupby('month').get_group(7)
data_2021_m8 = data_2021.groupby('month').get_group(8)
data_2021_m9 = data_2021.groupby('month').get_group(9)
data_2021_m10 = data_2021.groupby('month').get_group(10)

In [399]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 5, 6, 7, 8, 9, 10월 nan 탐색

5월
ymdhm          0
inf            0
tototf         0
fw_1018662    94
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

6월
ymdhm           0
inf             1
tototf          1
fw_1018662      0
fw_1018683    102
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
inf           23
tototf        23
fw_1018662     0
fw_1018683    26
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
inf             5
tototf          5
fw_1018662    438
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            1
tototf         1
fw_1018662    92
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_101

In [400]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2021_m5.fw_1018662.mean()

data_2021_m5.fw_1018662[data_2021_m5.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m5.fw_1018662[data_2021_m5.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m5.fw_1018662[data_2021_m5.fw_1018662.isna()] = fw_mean


In [401]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm           0
inf             1
tototf          1
fw_1018662      0
fw_1018683    102
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

7월
ymdhm          0
inf           23
tototf        23
fw_1018662     0
fw_1018683    26
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
inf             5
tototf          5
fw_1018662    438
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            1
tototf         1
fw_1018662    92
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0


In [402]:
# nan 값 mean으로 변경 (6월)

inf_mean = data_2021_m6.inf.mean()
tototf_mean = data_2021_m6.tototf.mean()

data_2021_m6.inf[data_2021_m6.inf.isna()] = inf_mean
data_2021_m6.tototf[data_2021_m6.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m6.inf[data_2021_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m6.inf[data_2021_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m6.tototf[data_2021_m6.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [403]:
# nan 값 mean으로 변경 (6월)

fw_mean = data_2021_m6.fw_1018683.mean()

data_2021_m6.fw_1018683[data_2021_m6.fw_1018683.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m6.fw_1018683[data_2021_m6.fw_1018683.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m6.fw_1018683[data_2021_m6.fw_1018683.isna()] = fw_mean


In [404]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm          0
inf           23
tototf        23
fw_1018662     0
fw_1018683    26
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

8월
ymdhm           0
inf             5
tototf          5
fw_1018662    438
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            1
tototf         1
fw_1018662    92
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype

In [405]:
# nan 값 mean으로 변경 (7월)

inf_mean = data_2021_m7.inf.mean()
tototf_mean = data_2021_m7.tototf.mean()

data_2021_m7.inf[data_2021_m7.inf.isna()] = inf_mean
data_2021_m7.tototf[data_2021_m7.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m7.inf[data_2021_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m7.inf[data_2021_m7.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m7.tototf[data_2021_m7.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [406]:
# nan 값 mean으로 변경 (7월)

fw_mean = data_2021_m7.fw_1018683.mean()

data_2021_m7.fw_1018683[data_2021_m7.fw_1018683.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m7.fw_1018683[data_2021_m7.fw_1018683.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m7.fw_1018683[data_2021_m7.fw_1018683.isna()] = fw_mean


In [407]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm           0
inf             5
tototf          5
fw_1018662    438
fw_1018683      0
fw_1019630      0
wl_1018662      0
wl_1018680      0
wl_1018683      0
wl_1019630      0
month           0
dtype: int64

9월
ymdhm          0
inf            1
tototf         1
fw_1018662    92
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

10

In [408]:
# nan 값 mean으로 변경 (8월)

inf_mean = data_2021_m8.inf.mean()
tototf_mean = data_2021_m8.tototf.mean()

data_2021_m8.inf[data_2021_m8.inf.isna()] = inf_mean
data_2021_m8.tototf[data_2021_m8.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m8.inf[data_2021_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m8.inf[data_2021_m8.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m8.tototf[data_2021_m8.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [409]:
# nan 값 mean으로 변경 (8월)

fw_mean = data_2021_m8.fw_1018662.mean()

data_2021_m8.fw_1018662[data_2021_m8.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m8.fw_1018662[data_2021_m8.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m8.fw_1018662[data_2021_m8.fw_1018662.isna()] = fw_mean


In [410]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm          0
inf            1
tototf         1
fw_1018662    92
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64

10월
ymdhm           0
in

In [411]:
# nan 값 mean으로 변경 (9월)

inf_mean = data_2021_m9.inf.mean()
tototf_mean = data_2021_m9.tototf.mean()

data_2021_m9.inf[data_2021_m9.inf.isna()] = inf_mean
data_2021_m9.tototf[data_2021_m9.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m9.inf[data_2021_m9.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m9.inf[data_2021_m9.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m9.tototf[data_2021_m9.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [412]:
# nan 값 mean으로 변경 (9월)

fw_mean = data_2021_m9.fw_1018662.mean()

data_2021_m9.fw_1018662[data_2021_m9.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m9.fw_1018662[data_2021_m9.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m9.fw_1018662[data_2021_m9.fw_1018662.isna()] = fw_mean


In [413]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm           0
inf          

In [414]:
# nan 값 mean으로 변경 (10월)

inf_mean = data_2021_m10.inf.mean()
tototf_mean = data_2021_m10.tototf.mean()

data_2021_m10.inf[data_2021_m10.inf.isna()] = inf_mean
data_2021_m10.tototf[data_2021_m10.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m10.inf[data_2021_m10.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m10.inf[data_2021_m10.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m10.tototf[data_2021_m10.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [415]:
# nan 값 mean으로 변경 (10월)

fw62_mean = data_2021_m10.fw_1018662.mean()
fw83_mean = data_2021_m10.fw_1018683.mean()

data_2021_m10.fw_1018662[data_2021_m10.fw_1018662.isna()] = fw62_mean
data_2021_m10.fw_1018683[data_2021_m10.fw_1018683.isna()] = fw83_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m10.fw_1018662[data_2021_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m10.fw_1018662[data_2021_m10.fw_1018662.isna()] = fw62_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2021_m10.fw_1018683[data_2021_m10.fw_1018683.isna()] = fw83_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

In [416]:
find_nan([data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10]) # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

8월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

9월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

10월
ymdhm         0
inf           0

In [417]:
# dataset 생성

dataset = pd.concat([dataset, data_2021_m5, data_2021_m6, data_2021_m7, data_2021_m8, data_2021_m9, data_2021_m10], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.0,555.0,469.050000,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.6,562.9,498.000000,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.1,576.4,490.680000,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.8,563.1,476.210000,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.1,576.4,476.210000,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
26491,2021-10-31 23:10,56.0,157.0,438.147794,207.27,310.31,269.7,258.2,265.0,253.3,10
26492,2021-10-31 23:20,56.0,157.0,438.147794,211.33,319.36,270.7,258.2,265.0,254.3,10
26493,2021-10-31 23:30,56.0,157.0,438.147794,219.46,319.36,270.7,258.2,265.0,254.3,10
26494,2021-10-31 23:40,56.0,157.0,438.147794,227.59,319.36,271.7,258.2,265.0,254.3,10


#### 11. 2022 data

In [418]:
data_2022 = datas[-1].copy()

data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11376 entries, 0 to 11375
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ymdhm       11376 non-null  object 
 1   swl         11340 non-null  float64
 2   inf         11340 non-null  float64
 3   sfw         11340 non-null  float64
 4   ecpc        11340 non-null  float64
 5   tototf      11340 non-null  float64
 6   tide_level  11363 non-null  float64
 7   wl_1018662  11376 non-null  float64
 8   fw_1018662  8191 non-null   float64
 9   wl_1018680  11376 non-null  float64
 10  fw_1018680  0 non-null      float64
 11  wl_1018683  11376 non-null  float64
 12  fw_1018683  11376 non-null  float64
 13  wl_1019630  11376 non-null  float64
 14  fw_1019630  11376 non-null  float64
dtypes: float64(14), object(1)
memory usage: 1.3+ MB


In [419]:
# corr >= 0.5인 feature(inf, totof, fw) selection

data_2022 = data_2022[['ymdhm', 'inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630',
                       'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']]

data_2022.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-05-01 00:00,739.36,124.36,337.64,0.0,471.08,290.7,284.2,279.0,269.3
1,2022-05-01 00:10,124.48,124.48,343.69,145.14,438.33,291.7,284.2,279.0,266.3
2,2022-05-01 00:20,124.2,124.2,337.64,232.22,417.17,290.7,284.2,279.0,264.3
3,2022-05-01 00:30,124.35,124.35,331.65,302.71,406.79,289.7,284.2,279.0,263.3
4,2022-05-01 00:40,0.0,124.42,319.84,340.03,417.17,287.7,284.2,279.0,264.3


In [420]:
data_2022.isna().sum() # inf: 36, tototf: 36, fw_1018662: 3185

ymdhm            0
inf             36
tototf          36
fw_1018662    3185
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
dtype: int64

In [421]:
# mean 할당 (단, 월별 mean값 부여)

In [422]:
# 월별로 분류히기 위해 row 별 month 추출

months = get_month(data_2022)

months[:5]

array([5, 5, 5, 5, 5])

In [423]:
# month column 추가

months = months.reshape(-1)

data_2022['month'] = months

data_2022.head()

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2022-05-01 00:00,739.36,124.36,337.64,0.0,471.08,290.7,284.2,279.0,269.3,5
1,2022-05-01 00:10,124.48,124.48,343.69,145.14,438.33,291.7,284.2,279.0,266.3,5
2,2022-05-01 00:20,124.2,124.2,337.64,232.22,417.17,290.7,284.2,279.0,264.3,5
3,2022-05-01 00:30,124.35,124.35,331.65,302.71,406.79,289.7,284.2,279.0,263.3,5
4,2022-05-01 00:40,0.0,124.42,319.84,340.03,417.17,287.7,284.2,279.0,264.3,5


In [424]:
# data_2022 월별 split

data_2022_m5 = data_2022.groupby('month').get_group(5)
data_2022_m6 = data_2022.groupby('month').get_group(6)
data_2022_m7 = data_2022.groupby('month').get_group(7)

In [425]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7]) # 5, 6, 7월 nan 탐색

5월
ymdhm            0
inf              0
tototf           0
fw_1018662    1969
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

6월
ymdhm            0
inf             36
tototf          36
fw_1018662    1180
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    36
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64



In [426]:
# nan 값 mean으로 변경 (5월)

fw_mean = data_2022_m5.fw_1018662.mean()

data_2022_m5.fw_1018662[data_2022_m5.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m5.fw_1018662[data_2022_m5.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m5.fw_1018662[data_2022_m5.fw_1018662.isna()] = fw_mean


In [427]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])  # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm            0
inf             36
tototf          36
fw_1018662    1180
fw_1018683       0
fw_1019630       0
wl_1018662       0
wl_1018680       0
wl_1018683       0
wl_1019630       0
month            0
dtype: int64

7월
ymdhm          0
inf            0
tototf         0
fw_1018662    36
fw_1018683     0
fw_1019630     0
wl_1018662     0
wl_1018680     0
wl_1018683     0
wl_1019630     0
month          0
dtype: int64



In [428]:
# dataset 생성 (단, 5월까지만 dataset이고, 6,7월은 submission이기 때문에 5월 data만 datset에 추가)

dataset = pd.concat([dataset, data_2022_m5], axis=0)

dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.00,555.00,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.60,562.90,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.10,576.40,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.80,563.10,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.10,576.40,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
4459,2022-05-31 23:10,140.94,140.94,285.72,575.57,493.58,281.7,281.2,278.0,271.3,5
4460,2022-05-31 23:20,141.07,141.07,274.78,501.04,505.03,279.7,279.2,278.0,272.3,5
4461,2022-05-31 23:30,141.01,141.01,269.40,425.89,505.03,278.7,277.2,277.0,272.3,5
4462,2022-05-31 23:40,755.75,140.75,264.07,198.19,493.58,277.7,276.2,276.0,271.3,5


In [429]:
# 2022-06, 07은 Data Leakage를 피하기 위해 해당 달의 mean이 아닌 dataset의 mean(6, 7)으로 변경

In [430]:
dataset_m6 = dataset.groupby('month').get_group(6)
dataset_m7 = dataset.groupby('month').get_group(7)

In [431]:
# nan 값 mean으로 변경 (6월)

inf_mean = dataset_m6.inf.mean()
tototf_mean = dataset_m6.tototf.mean()

data_2022_m6.inf[data_2022_m6.inf.isna()] = inf_mean
data_2022_m6.tototf[data_2022_m6.tototf.isna()] = tototf_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m6.inf[data_2022_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m6.inf[data_2022_m6.inf.isna()] = inf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m6.tototf[data_2022_m6.tototf.isna()] = tototf_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [433]:
# nan 값 mean으로 변경 (6월)

fw_mean = dataset_m6.fw_1018662.mean()

data_2022_m6.fw_1018662[data_2022_m6.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m6.fw_1018662[data_2022_m6.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m6.fw_1018662[data_2022_m6.fw_1018662.isna()] = fw_mean


In [441]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])  # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64



In [435]:
# nan 값 mean으로 변경 (7월)

fw_mean = dataset_m7.fw_1018662.mean()

data_2022_m7.fw_1018662[data_2022_m7.fw_1018662.isna()] = fw_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m7.fw_1018662[data_2022_m7.fw_1018662.isna()] = fw_mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2022_m7.fw_1018662[data_2022_m7.fw_1018662.isna()] = fw_mean


In [440]:
find_nan([data_2022_m5, data_2022_m6, data_2022_m7])  # 변경 확인

5월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

6월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64

7월
ymdhm         0
inf           0
tototf        0
fw_1018662    0
fw_1018683    0
fw_1019630    0
wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
month         0
dtype: int64



### Dataset save

In [437]:
dataset.to_csv('./data/dataset.csv', index=False)

### Dataset load

In [438]:
dataset = pd.read_csv('./data/dataset.csv')

dataset.shape

(269424, 11)

### Sub_mission

In [577]:
sub_mission = pd.concat([data_2022_m6, data_2022_m7], axis=0)

sub_mission

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
4464,2022-06-01 00:00,140.60,140.60,269.40,8.23,471.08,0.0,0.0,0.0,0.0,6
4465,2022-06-01 00:10,140.78,140.78,280.22,28.82,449.12,0.0,0.0,0.0,0.0,6
4466,2022-06-01 00:20,755.90,140.90,296.87,12.35,417.17,0.0,0.0,0.0,0.0,6
4467,2022-06-01 00:30,0.00,140.94,302.53,53.52,386.43,0.0,0.0,0.0,0.0,6
4468,2022-06-01 00:40,140.63,140.63,296.87,107.04,366.60,0.0,0.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...
11371,2022-07-18 23:10,259.23,259.23,319.84,-456.41,974.40,0.0,0.0,0.0,0.0,7
11372,2022-07-18 23:20,260.46,260.46,314.01,-717.30,1006.88,0.0,0.0,0.0,0.0,7
11373,2022-07-18 23:30,259.37,259.37,387.55,-843.37,1039.90,0.0,0.0,0.0,0.0,7
11374,2022-07-18 23:40,259.13,259.13,454.91,-1023.37,1073.46,0.0,0.0,0.0,0.0,7


In [578]:
X_sub = sub_mission.drop(['ymdhm', 'wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630', 'month'], axis=1)

X_sub

Unnamed: 0,inf,tototf,fw_1018662,fw_1018683,fw_1019630
4464,140.60,140.60,269.40,8.23,471.08
4465,140.78,140.78,280.22,28.82,449.12
4466,755.90,140.90,296.87,12.35,417.17
4467,0.00,140.94,302.53,53.52,386.43
4468,140.63,140.63,296.87,107.04,366.60
...,...,...,...,...,...
11371,259.23,259.23,319.84,-456.41,974.40
11372,260.46,260.46,314.01,-717.30,1006.88
11373,259.37,259.37,387.55,-843.37,1039.90
11374,259.13,259.13,454.91,-1023.37,1073.46


### Train, Val, Test split

#### Train, Test split

In [444]:
from sklearn.model_selection import train_test_split

In [524]:
dataset

Unnamed: 0,ymdhm,inf,tototf,fw_1018662,fw_1018683,fw_1019630,wl_1018662,wl_1018680,wl_1018683,wl_1019630,month
0,2012-05-01 00:00,555.00,555.00,469.05,729.80,540.18,310.7,300.2,290.0,275.3,5
1,2012-05-01 00:10,464.60,562.90,498.00,731.48,540.18,314.7,300.2,290.0,275.3,5
2,2012-05-01 00:20,478.10,576.40,490.68,726.42,540.18,313.7,301.2,290.0,275.3,5
3,2012-05-01 00:30,464.80,563.10,476.21,726.42,552.17,311.7,301.2,290.0,276.3,5
4,2012-05-01 00:40,478.10,576.40,476.21,707.17,564.29,311.7,301.2,291.0,277.3,5
...,...,...,...,...,...,...,...,...,...,...,...
269419,2022-05-31 23:10,140.94,140.94,285.72,575.57,493.58,281.7,281.2,278.0,271.3,5
269420,2022-05-31 23:20,141.07,141.07,274.78,501.04,505.03,279.7,279.2,278.0,272.3,5
269421,2022-05-31 23:30,141.01,141.01,269.40,425.89,505.03,278.7,277.2,277.0,272.3,5
269422,2022-05-31 23:40,755.75,140.75,264.07,198.19,493.58,277.7,276.2,276.0,271.3,5


In [554]:
# X, y 분류

X = dataset[['inf', 'tototf', 'fw_1018662', 'fw_1018683', 'fw_1019630', 'month']]
y = dataset.iloc[:, -5:]

X.shape, y.shape

((269424, 6), (269424, 5))

In [555]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((215539, 6), (53885, 6), (215539, 5), (53885, 5))

In [556]:
X_train.month.value_counts(), y_train.month.value_counts()

(5     39283
 7     35712
 8     35712
 10    35712
 6     34560
 9     34560
 Name: month, dtype: int64,
 5     39283
 7     35712
 8     35712
 10    35712
 6     34560
 9     34560
 Name: month, dtype: int64)

In [557]:
X_test.month.value_counts(), y_test.month.value_counts()

(5     9821
 7     8928
 8     8928
 10    8928
 6     8640
 9     8640
 Name: month, dtype: int64,
 5     9821
 7     8928
 8     8928
 10    8928
 6     8640
 9     8640
 Name: month, dtype: int64)

#### Train, Val split

In [471]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=X_train.month, shuffle=True, random_state=2022)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((172431, 6), (43108, 6), (172431, 5), (43108, 5))

In [231]:
X_train.month.value_counts(), y_train.month.value_counts()

(5     31426
 8     28570
 10    28570
 7     28569
 9     27648
 6     27648
 Name: month, dtype: int64,
 5     31426
 8     28570
 10    28570
 7     28569
 9     27648
 6     27648
 Name: month, dtype: int64)

In [232]:
X_val.month.value_counts(), y_val.month.value_counts()

(5     7857
 7     7143
 10    7142
 8     7142
 9     6912
 6     6912
 Name: month, dtype: int64,
 5     7857
 7     7143
 10    7142
 8     7142
 9     6912
 6     6912
 Name: month, dtype: int64)

### Scaling

In [450]:
from sklearn.preprocessing import StandardScaler

In [558]:
# months col 제거

X_train = X_train.drop(['month'], axis=1)
y_train = y_train.drop(['month'], axis=1)

# X_val = X_val.drop(['month'], axis=1)
# y_val = y_val.drop(['month'], axis=1)

X_test = X_test.drop(['month'], axis=1)
y_test = y_test.drop(['month'], axis=1)

In [559]:
X_train.shape, X_test.shape

((215539, 5), (53885, 5))

In [492]:
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
# X_val = ss.transform(X_val)
X_test = ss.transform(X_test)
X_sub = ss.transform(X_sub)

### Learning

In [455]:
from sklearn.ensemble import RandomForestRegressor

In [560]:
rfr = RandomForestRegressor(n_jobs=4, random_state=0)

rfr.fit(X_train, y_train)

In [561]:
y_pred = rfr.predict(X_test)

In [562]:
from sklearn.metrics import mean_squared_error, r2_score

In [563]:
np.sqrt(mean_squared_error(y_test, y_pred)) / r2_score(y_test, y_pred)

16.14414411578317

In [564]:
rfr.n_estimators

100

In [541]:
# Grid Search

from sklearn.model_selection import GridSearchCV, KFold

In [535]:
param = {
    'n_estimators' : range(100, 200, 5),
    'verbose' : [True]
}

In [565]:
kfold = KFold(shuffle=False)

In [566]:
rfr2 = RandomForestRegressor(n_jobs=4, random_state=0)

In [567]:
grid = GridSearchCV(rfr2, param, n_jobs=4, cv=kfold, refit=True)

grid.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   31.7s finished


In [568]:
grid.best_params_

{'n_estimators': 100, 'verbose': True}

In [569]:
y_pred = grid.best_estimator_.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished


In [570]:
np.sqrt(mean_squared_error(y_test, y_pred)) / r2_score(y_test, y_pred)

16.14414411578317

In [None]:
'''
n_estimators: 100 = 3.317265
n_estimators: 190 = 3.310137

'''

### Model save

In [571]:
import pickle

In [572]:
saved_model = pickle.dump(grid.best_estimator_, open('./model/RFR_n_100.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

### Model load

In [573]:
loaded_model = pickle.load(open('./model/RFR_n_100.pkl', 'rb'))

In [574]:
y_pred = loaded_model.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


In [575]:
np.sqrt(mean_squared_error(y_test, y_pred)) / r2_score(y_test, y_pred)

16.14414411578317

### Make Sub_mission

In [580]:
X_sub = X_sub.to_numpy()

In [589]:
y_pred = loaded_model.predict(X_sub)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [590]:
y_pred

array([[279.1 , 269.65, 266.11, 269.76],
       [280.61, 265.88, 269.69, 267.51],
       [283.73, 271.31, 270.41, 264.27],
       ...,
       [298.17, 298.73, 302.58, 310.09],
       [308.79, 305.6 , 308.24, 311.76],
       [325.73, 305.47, 311.3 , 313.1 ]])

In [591]:
sub_mission = pd.read_csv('./data/sample_submission.csv')

In [592]:
sub_mission

Unnamed: 0,ymdhm,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-06-01 00:00,0,0,0,0
1,2022-06-01 00:10,0,0,0,0
2,2022-06-01 00:20,0,0,0,0
3,2022-06-01 00:30,0,0,0,0
4,2022-06-01 00:40,0,0,0,0
...,...,...,...,...,...
6907,2022-07-18 23:10,0,0,0,0
6908,2022-07-18 23:20,0,0,0,0
6909,2022-07-18 23:30,0,0,0,0
6910,2022-07-18 23:40,0,0,0,0


In [593]:
sub_mission[['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']] = y_pred

In [594]:
sub_mission

Unnamed: 0,ymdhm,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-06-01 00:00,279.10,269.65,266.11,269.76
1,2022-06-01 00:10,280.61,265.88,269.69,267.51
2,2022-06-01 00:20,283.73,271.31,270.41,264.27
3,2022-06-01 00:30,284.78,271.69,270.16,261.24
4,2022-06-01 00:40,283.55,274.88,268.30,259.21
...,...,...,...,...,...
6907,2022-07-18 23:10,287.31,279.30,286.17,305.95
6908,2022-07-18 23:20,286.90,286.51,291.63,307.68
6909,2022-07-18 23:30,298.17,298.73,302.58,310.09
6910,2022-07-18 23:40,308.79,305.60,308.24,311.76


In [588]:
sub_mission.to_csv('./result/sample_submission.csv', index=False)