## 6.4 자전거 대여 수요 예측 Baseline Model

In [1]:
import pandas as pd

# Kaggle input directory that holds the competition CSV files
data_path = '/kaggle/input/bike-sharing-demand/'

# Read the training set, the test set and the sample submission, in that order
train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

In [2]:
# Keep only the training rows whose weather code is not 4
weather_is_not_4 = train['weather'].ne(4)
train = train[weather_is_not_4]

In [3]:
# Stack train on top of test; without ignore_index each frame keeps its own row labels
all_data_temp = pd.concat([train, test], axis=0)
all_data_temp

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


In [4]:
# Stack train on top of test again, this time renumbering the rows from 0
all_data = pd.concat([train, test], axis=0, ignore_index=True)
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


In [5]:
from datetime import datetime

# Parse the timestamp strings once instead of splitting them row by row.
# The original 'datetime' column is left untouched.
parsed_datetime = pd.to_datetime(all_data['datetime'], format='%Y-%m-%d %H:%M:%S')

# Calendar date, kept as a 'YYYY-MM-DD' string
all_data['date'] = parsed_datetime.dt.strftime('%Y-%m-%d')
# Year, month and hour as integers (not strings), so they are numeric model inputs
all_data['year'] = parsed_datetime.dt.year
all_data['month'] = parsed_datetime.dt.month
all_data['hour'] = parsed_datetime.dt.hour
# Day of the week as an integer, Monday=0 ... Sunday=6
all_data['weekday'] = parsed_datetime.dt.weekday

In [6]:
# Columns that are not used as model inputs
drop_features = ['casual', 'registered', 'datetime', 'date', 'month', 'windspeed']
all_data = all_data.drop(columns=drop_features)

- 탐색적 데이터 분석에서 얻은 인사이트를 활용해 의미 있는 피처와 불필요한 피처를 구분: '피처 선택(Feature Selection)' 과정
- 타깃값 예측과 관련 없는 피처가 많으면 오히려 예측 성능이 떨어지기 때문에, 데이터의 특징을 잘 나타내는 주요 피처를 선택하는 과정이 중요

In [7]:
# All feature engineering is done, so split the combined frame back apart.
# Training rows are the ones that have a target value in 'count'.
is_train = all_data['count'].notna()

# Feature matrices: drop the target column from both splits
X_train = all_data.loc[is_train].drop(columns='count')
X_test = all_data.loc[~is_train].drop(columns='count')

# Take the target from the same rows as X_train so the two share one index.
# 'count' became float when it was concatenated with the NaN-filled test rows,
# so cast it back to integer.
y = all_data.loc[is_train, 'count'].astype('int64')
X_train.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,2011,0,5
1,1,0,0,1,9.02,13.635,80,2011,1,5
2,1,0,0,1,9.02,13.635,80,2011,2,5
3,1,0,0,1,9.84,14.395,75,2011,3,5
4,1,0,0,1,9.84,14.395,75,2011,4,5


### 평가지표 계산 함수 작성

In [8]:
import numpy as np

def rmsle(y_true, y_pred, convertExp=True):
    """Compute the root mean squared logarithmic error (RMSLE).

    Args:
        y_true: Array-like of target values (list, tuple, ndarray or Series).
        y_pred: Array-like of predicted values, compared element by element
            with ``y_true``.
        convertExp: When True, both inputs are passed through ``np.exp``
            before the error is computed, i.e. they are treated as
            log-scale values.

    Returns:
        The RMSLE as a NumPy float scalar.
    """
    # Coerce to float arrays so plain Python sequences work in both modes
    # (``list + 1`` raises TypeError) and the subtraction below is positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Undo the log transform when the inputs are on the log scale
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log(1 + x); any NaN produced by the log is replaced with 0
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    # Root of the mean squared difference of the logs
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

### 모델 훈련 및 성능 검증

In [9]:
from sklearn.linear_model import LinearRegression

# Log-transform the target before fitting
log_y = np.log(y)

# Create the linear regression model and fit it on the log-scale target
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, log_y)

LinearRegression()

In [10]:
# Score the model on the same data it was trained on
preds = linear_reg_model.predict(X_train)

# Both arguments are on the log scale, so rmsle is asked to exponentiate them
train_rmsle = rmsle(log_y, preds, True)
print(f'선형 회귀의 RMSLE 값: {train_rmsle:.4f}')

선형 회귀의 RMSLE 값: 1.0205


In [11]:
# Predict on the test features; the model was fitted on the log-scale target
linearreg_preds = linear_reg_model.predict(X_test)

# Undo the log transform to get counts on the original scale
count_preds = np.exp(linearreg_preds)
submission['count'] = count_preds

# Write the submission file without the row index
submission.to_csv('submission.csv', index=False)