# 이온 채널 전환 예측
* **데이터 세트**
 * Ion Switching (이온 채널 전환 예측 데이터)
* **<u>주어진 시간(time)과 신호 값(signal)에 따라 열린 채널의 수(open_channels) 예측</u>**


* 도전과제 : Lasso, ElasticNet 등으로 모델을 만들어본 후, 기존 모델과 성능 비교하기
 * RMSLE, RMSE, MAE 지표를 활용하여 성능측정 및 비교

## 데이터 가공
### 데이터 구조 및 칼럼 확인

In [598]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [599]:
from sklearn.linear_model import Ridge

In [600]:
train = pd.read_csv('/content/drive/MyDrive/EURON/University-of-Liverpool/train.csv')
submission = pd.read_csv('/content/drive/MyDrive/EURON/University-of-Liverpool/sample_submission.csv')

* train DataFrame의 처음 5개 행, 끝 5개 행을 출력하여 대략적인 데이터 구조와 주요 칼럼 확인

* 주요 칼럼
 * time, signal, open_channels

### train 데이터셋

In [601]:
train.head()

Unnamed: 0,time,signal,open_channels
0,0.0001,-2.76,0
1,0.0002,-2.8557,0
2,0.0003,-2.4074,0
3,0.0004,-3.1404,0
4,0.0005,-3.1525,0


In [602]:
train.tail()

Unnamed: 0,time,signal,open_channels
4999995,499.9996,2.9194,7
4999996,499.9997,2.698,7
4999997,499.9998,4.5164,8
4999998,499.9999,5.6397,9
4999999,500.0,5.3792,9


In [603]:
# Select the rows of the train DataFrame with index labels 49000 through 50010,
# keeping all columns. Label-based .loc slicing includes both endpoints.
train.loc[49000:50010,:]

Unnamed: 0,time,signal,open_channels
49000,4.9001,-2.9378,0
49001,4.9002,-2.7240,0
49002,4.9003,-2.5879,0
49003,4.9004,-2.3940,0
49004,4.9005,-2.3708,0
...,...,...,...
50006,5.0007,-2.3878,0
50007,5.0008,-2.8767,0
50008,5.0009,-2.5252,0
50009,5.0010,-2.5781,0


In [604]:
# train 데이터 세트의 shape 확인
print('train DateFrame의 크기', train.shape)

train DateFrame의 크기 (5000000, 3)


In [605]:
# open_channels 칼럼에서 가장 작은 값을 반환
train['open_channels'].min()

0

### time 칼럼의 데이터를 가공해준다.
* 원래 DataFrame에서 time 칼럼은 0.0001~500.0000 이었는데 이를 0.0001에서 5.0000까지의 숫자로 변환해준다.
* 0.0001에서 5.0000까지는 그대로지만 그 다음부터 다시 0.0001이 시작되도록 바꿔준다.

In [606]:
# time 칼럼의 값을 train_time 변수에 저장
# .values를 사용하여 해당 칼럼의 데이터를 numpy 배열로 가져옴
train_time = train['time'].values

In [607]:
train_time[0]

0.0001

In [608]:
# train_time_0에 train_time 배열의 0부터 49999까지의 값을 저장
train_time_0 = train_time[:50000]

In [609]:
# Repeat the first time block (0.0001 ... 5.0) as many times as needed to cover
# every row of train (5,000,000 / 50,000 = 100 repetitions for this data set).
# np.tile keeps the result as a NumPy array instead of building a
# multi-million-element Python list, and the repeat count is derived from the
# data rather than hard-coded.
train_time_0 = np.tile(train_time_0, len(train_time) // len(train_time_0))

In [610]:
# In train_time_0 the value after 5.0 wraps around to 0.0001, while the
# original train_time keeps increasing. Print both at the block boundaries.
print('train_time_0과 train_time의 차이')
for boundary in (49999, 50000, 100000):
    print('* {}번째 인덱스'.format(boundary))
    print('train_time_0 : ', train_time_0[boundary])
    print('train_time   : ', train_time[boundary])

train_time_0과 train_time의 차이
* 49999번째 인덱스
train_time_0 :  5.0
train_time   :  5.0
* 50000번째 인덱스
train_time_0 :  0.0001
train_time   :  5.0001
* 100000번째 인덱스
train_time_0 :  0.0001
train_time   :  10.0001


In [611]:
len(train_time_0)

5000000

In [612]:
# train의 time 칼럼을 train_time_0으로 바꿔준다.
train['time'] = train_time_0

In [613]:
# 바뀐 time 칼럼을 확인해준다.
train.head()

Unnamed: 0,time,signal,open_channels
0,0.0001,-2.76,0
1,0.0002,-2.8557,0
2,0.0003,-2.4074,0
3,0.0004,-3.1404,0
4,0.0005,-3.1525,0


In [614]:
train.tail()

Unnamed: 0,time,signal,open_channels
4999995,4.9996,2.9194,7
4999996,4.9997,2.698,7
4999997,4.9998,4.5164,8
4999998,4.9999,5.6397,9
4999999,5.0,5.3792,9


### test 데이터셋

In [615]:
# 테스트 데이터셋을 불러온다.
test = pd.read_csv('/content/drive/MyDrive/EURON/University-of-Liverpool/test.csv')

In [616]:
test.head()

Unnamed: 0,time,signal
0,500.0001,-2.6498
1,500.0002,-2.8494
2,500.0003,-2.86
3,500.0004,-2.435
4,500.0005,-2.6155


In [617]:
test.tail()

Unnamed: 0,time,signal
1999995,699.9996,-2.9092
1999996,699.9997,-2.7422
1999997,699.9998,-2.8285
1999998,699.9999,-2.9092
1999999,700.0,-2.7422


In [618]:
test.shape

(2000000, 2)

### time 칼럼의 데이터를 가공해준다.
* train 데이터셋에서 사용한 train_time_0을 test DataFrame의 길이인 2000000에 맞춰서 바꿔주고 time 칼럼을 바꿔준다.
* train 데이터프레임의 time 칼럼의 첫 50,000개의 값을 40번 반복하여 그 결과를 test 데이터프레임의 time 칼럼에 할당

In [619]:
# Reuse the first 50,000 time values of train (0.0001 ... 5.0) and repeat them
# once per 50,000-row block of test (40 repetitions for 2,000,000 rows), so the
# test time column restarts at 0.0001 in every block as well.
# np.tile avoids building a large Python list, and the repeat count is derived
# from the length of test instead of being hard-coded.
train_time_0 = train_time[:50000]
train_time_0 = np.tile(train_time_0, len(test) // len(train_time_0))
test['time'] = train_time_0

In [620]:
# 바뀐 time 칼럼을 확인해준다.
test.head()

Unnamed: 0,time,signal
0,0.0001,-2.6498
1,0.0002,-2.8494
2,0.0003,-2.86
3,0.0004,-2.435
4,0.0005,-2.6155


In [621]:
test.tail()

Unnamed: 0,time,signal
1999995,4.9996,-2.9092
1999996,4.9997,-2.7422
1999997,4.9998,-2.8285
1999998,4.9999,-2.9092
1999999,5.0,-2.7422


## 데이터 프레임 그룹으로 나누기
* 각 그룹 signal 칼럼 값 조정

In [622]:
# Number of consecutive rows that make up one group.
GROUP_SIZE = 50000

# Label every row with the number of the 50,000-row block it belongs to.
# Integer division of the row position yields groups 0..99 for the 5,000,000
# train rows and 0..39 for the 2,000,000 test rows, the same labels the
# per-group .loc loops produced, without hard-coding the group counts.
train["group"] = np.arange(len(train)) // GROUP_SIZE
test["group"] = np.arange(len(test)) // GROUP_SIZE

# Kept so the name holds the test group count, as it did after the loops.
n_groups = len(test) // GROUP_SIZE

# Add the 'signal_2' column to both frames. It is initialised with a float
# zero because float values are written into it later; starting from an
# integer column would rely on pandas silently upcasting on assignment.
train['signal_2'] = 0.0
test['signal_2'] = 0.0

In [623]:
train.tail()

Unnamed: 0,time,signal,open_channels,group,signal_2
4999995,4.9996,2.9194,7,99,0
4999996,4.9997,2.698,7,99,0
4999997,4.9998,4.5164,8,99,0
4999998,4.9999,5.6397,9,99,0
4999999,5.0,5.3792,9,99,0


In [624]:
test.tail()

Unnamed: 0,time,signal,group,signal_2
1999995,4.9996,-2.9092,39,0
1999996,4.9997,-2.7422,39,0
1999997,4.9998,-2.8285,39,0
1999998,4.9999,-2.9092,39,0
1999999,5.0,-2.7422,39,0


In [625]:
def _rescale_and_lag(signals):
    """Rescale one group's signal values and shift them by one sample.

    The values are min-max normalised to [0, 1] and then multiplied by
    ``floor(max) - ceil(min)``, the integer-rounded width of the group's
    original range. The result is shifted one position later: element ``k``
    holds the rescaled value of element ``k - 1``, the first element is 0
    and the last rescaled value is dropped.

    Args:
        signals: 1-D NumPy array with the raw signal values of one group.

    Returns:
        A float NumPy array of the same length as ``signals``.
    """
    lowest, highest = np.min(signals), np.max(signals)
    span = math.floor(highest) - math.ceil(lowest)
    scaled = (signals - lowest) / (highest - lowest) * span
    # Prepend 0 and drop the last value so each row sees the previous sample.
    return np.concatenate(([0.0], scaled[:-1]))


def _fill_signal_2(frame):
    """Write the rescaled, one-sample-lagged signal of every group into 'signal_2'.

    Groups are taken from the values present in ``frame['group']``, so the
    number of groups does not need to be given. ``frame`` is modified in place.
    """
    for group_id in frame["group"].unique():
        in_group = (frame["group"] == group_id).values
        group_signals = frame.loc[in_group, "signal"].values
        frame.loc[in_group, "signal_2"] = _rescale_and_lag(group_signals)


# Apply the same per-group transformation to the train and test data.
_fill_signal_2(train)
_fill_signal_2(test)

In [626]:
print('train DF에서 signal_2 값의 범위:', train['signal_2'].min(), '~', train['signal_2'].max())
print('test DF에서 signal_2 값의 범위:', test['signal_2'].min(), '~', test['signal_2'].max())

train DF에서 signal_2 값의 범위: 0.0 ~ 13.0
test DF에서 signal_2 값의 범위: 0.0 ~ 13.0


In [627]:
train.tail()

Unnamed: 0,time,signal,open_channels,group,signal_2
4999995,4.9996,2.9194,7,99,6.997803
4999996,4.9997,2.698,7,99,5.704543
4999997,4.9998,4.5164,8,99,5.507334
4999998,4.9999,5.6397,9,99,7.127049
4999999,5.0,5.3792,9,99,8.127613


## 예제 노트북 - Ridge 모델로 예측 수행
* **원래 노트북과 다르게 예측 성능을 측정하기 위해 test 데이터셋을 사용하여 예측하지 않고 train데이터셋을 train_test_split()으로 쪼개서 예측을 수행한다.**

In [628]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [629]:
# Target: the number of open channels.
# Features: what is left after dropping the target, the raw signal and the
# group label (the time and signal_2 columns).
y_target = train['open_channels'].values
X_features = train.drop(columns=['open_channels', 'signal', 'group'])
X_features

Unnamed: 0,time,signal_2
0,0.0001,0.000000
1,0.0002,0.622782
2,0.0003,0.556714
3,0.0004,0.866206
4,0.0005,0.360166
...,...,...
4999995,4.9996,6.997803
4999996,4.9997,5.704543
4999997,4.9998,5.507334
4999998,4.9999,7.127049


In [630]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)

In [631]:
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)

In [632]:
# Predict on the held-out split with the fitted Ridge model.
pred = ridge_reg.predict(X_test)

# Clamp the predictions into the range [0, 10].
pred = np.clip(pred, 0, 10)

In [633]:
pred.mean()

2.7375996153934214

In [634]:
# Convert the predictions to integers. Note that astype(int) truncates the
# fractional part (toward zero); it does not round to the nearest integer.
pred = pred.astype(int)

### 성능 측정
* RMSLE, MAE, RMSE

In [665]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmsle(y, pred):
    """Return the root mean squared logarithmic error between y and pred.

    Both inputs are transformed with log1p (log(1 + x)) before the error is
    computed, so values of -1 or below produce -inf/nan results.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(np.square(log_gap)))

def rmse(y, pred):
    """Return the root mean squared error between y and pred."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

def evaluate_regr(y, pred):
    """Print RMSLE, RMSE and MAE of ``pred`` against ``y`` on one line.

    RMSLE and RMSE come from the local rmsle()/rmse() helpers; MAE is computed
    with scikit-learn's mean_absolute_error(). Nothing is returned.
    """
    scores = (
        rmsle(y, pred),
        rmse(y, pred),
        mean_absolute_error(y, pred),
    )
    print('RMSLE: {0:.5f}, RMSE: {1:.5F}, MAE: {2:.5F}'.format(*scores))

In [666]:
evaluate_regr(y_test, pred)

RMSLE: 0.33695, RMSE: 1.06090, MAE: 0.68109


## 도전과제 - Lasso, ElasticNet 모델과 기존 모델 성능 비교

In [667]:
from sklearn.linear_model import Lasso, ElasticNet

In [668]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)

In [669]:
def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):
    """Fit ``model`` on the training split and print its metrics on the test split.

    The predictions are scored exactly as ``model.predict`` returns them; no
    clipping or integer conversion is applied here. The metrics are printed by
    evaluate_regr() under a header with the model's class name, and the
    function itself returns None.

    Args:
        model: estimator exposing fit(X, y) and predict(X).
        X_train, y_train: data the model is fitted on.
        X_test, y_test: data the model is evaluated on.
        is_expm1: when True, np.expm1 is applied to both the targets and the
            predictions before scoring (presumably for targets that were
            log1p-transformed beforehand; confirm against the caller).
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    actual = y_test
    if is_expm1:
        actual, predicted = np.expm1(actual), np.expm1(predicted)
    print('###', model.__class__.__name__,'###')
    evaluate_regr(actual, predicted)

# Evaluate Ridge, Lasso and ElasticNet, each built with its default
# constructor arguments, on the same train/test split.
model_r, model_l, model_e = Ridge(), Lasso(), ElasticNet()

for model in (model_r, model_l, model_e):
    get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False)

### Ridge ###
RMSLE: 0.26055, RMSE: 0.92641, MAE: 0.63155
### Lasso ###
RMSLE: 0.30173, RMSE: 1.00129, MAE: 0.72155
### ElasticNet ###
RMSLE: 0.29444, RMSE: 0.98872, MAE: 0.70731


## 결과
RMSLE, RMSE, MAE 모두 Ridge 회귀 모델이 가장 낮다.

<u>**따라서 최적 선형 회귀 모델은 `Ridge`이다.**</u>

## 원본 노트북 뒷부분

In [640]:
X = train[['time', 'signal_2']].values
y = train['open_channels'].values

In [641]:
model = Ridge()
model.fit(X, y)

In [642]:
train_preds = model.predict(X)

In [643]:
# 예측값을 0과 10 사이의 범위로 조정
train_preds = np.clip(train_preds, 0, 10)

In [644]:
# 평균 반환
train_preds.mean()

2.736848540884588

In [645]:
# 예측값을 정수형으로 변환
train_preds = train_preds.astype(int)

In [646]:
# 테스트 데이터셋으로 예측하기 위한 변수 저장
X_test = test[['time', 'signal_2']].values

In [647]:
submission.head()

Unnamed: 0,time,open_channels
0,500.0001,0
1,500.0002,0
2,500.0003,0
3,500.0004,0
4,500.0005,0


In [648]:
submission.shape

(2000000, 2)

In [649]:
X_test.shape

(2000000, 2)

In [650]:
# Predict the open-channel counts for the test data.
test_preds = model.predict(X_test)
test_preds = np.clip(test_preds, 0, 10) # clamp the predictions into the range [0, 10]
test_preds = test_preds.astype(int) # cast to int; this truncates the fractional part rather than rounding

# Store the predictions in the open_channels column of the submission DataFrame.
submission['open_channels'] = test_preds

In [651]:
test_preds.mean()

1.2352345

In [652]:
submission.head(20)

Unnamed: 0,time,open_channels
0,500.0001,0
1,500.0002,0
2,500.0003,0
3,500.0004,0
4,500.0005,0
5,500.0006,0
6,500.0007,0
7,500.0008,0
8,500.0009,0
9,500.001,0


In [653]:
np.set_printoptions(precision=4)

In [654]:
submission.time.values[:20]

array([500.0001, 500.0002, 500.0003, 500.0004, 500.0005, 500.0006,
       500.0007, 500.0008, 500.0009, 500.001 , 500.0011, 500.0012,
       500.0013, 500.0014, 500.0015, 500.0016, 500.0017, 500.0018,
       500.0019, 500.002 ])

In [655]:
# Re-format every time value as a string with exactly four decimal places
# (e.g. 500.001 becomes '500.0010'). Iterating over the column values directly
# avoids hard-coding the row count and avoids re-reading .values for each row.
submission['time'] = [format(t, '.4f') for t in submission.time.values]

In [656]:
submission.time.values[:20]

array(['500.0001', '500.0002', '500.0003', '500.0004', '500.0005',
       '500.0006', '500.0007', '500.0008', '500.0009', '500.0010',
       '500.0011', '500.0012', '500.0013', '500.0014', '500.0015',
       '500.0016', '500.0017', '500.0018', '500.0019', '500.0020'],
      dtype=object)

In [657]:
submission['open_channels'].mean()

1.2352345

In [658]:
submission.head()

Unnamed: 0,time,open_channels
0,500.0001,0
1,500.0002,0
2,500.0003,0
3,500.0004,0
4,500.0005,0


In [659]:
submission.to_csv('submission.csv', index=False)