# 3.4 자전거 대여 수요 예측 경진대회 베이스라인

- [자전거 대여 수요 예측 경진대회 링크](https://www.kaggle.com/c/bike-sharing-demand)

- [모델링 노트북 참고 링크](https://www.kaggle.com/viveksrinivasan/eda-ensemble-model-top-10-percentile)

In [1]:
import pandas as pd

# 데이터 경로
data_path = '/kaggle/input/bike-sharing-demand/'

train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sampleSubmission.csv')

## 3.4.1 피처 엔지니어링

### 데이터 합치기

In [2]:
all_data_temp = pd.concat([train, test])
all_data_temp

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


In [3]:
all_data = pd.concat([train, test], ignore_index=True)
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17374,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17376,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17377,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


### 파생 변수(피처) 추가

In [4]:
from datetime import datetime

all_data['date'] = all_data['datetime'].apply(lambda x: x.split()[0]) # 날짜 피처 생성
all_data['year'] = all_data['datetime'].apply(lambda x: x.split()[0].split('-')[0]) # 연도 피처 생성
all_data['month'] = all_data['datetime'].apply(lambda x: x.split()[0].split('-')[1]) # 월 피처 생성
all_data['hour'] = all_data['datetime'].apply(lambda x: x.split()[1].split(':')[0]) # 시 피처 생성
all_data["weekday"] = all_data['date'].apply(lambda dateString : datetime.strptime(dateString,"%Y-%m-%d").weekday()) # 요일 피처 생성

### 필요 없는 피처 제거

In [5]:
drop_features = ['casual', 'registered', 'datetime', 'date', 'datetime', 'windspeed', 'month']

all_data = all_data.drop(drop_features, axis=1)

### 데이터 나누기

In [6]:
X_train = all_data[pd.notnull(all_data['count'])]
X_test = all_data[~pd.notnull(all_data['count'])]

# 타깃 값 count 제거
X_train = X_train.drop(['count'], axis=1)
X_test = X_test.drop(['count'], axis=1)

y = train['count']

In [7]:
X_train.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,2011,0,5
1,1,0,0,1,9.02,13.635,80,2011,1,5
2,1,0,0,1,9.02,13.635,80,2011,2,5
3,1,0,0,1,9.84,14.395,75,2011,3,5
4,1,0,0,1,9.84,14.395,75,2011,4,5


## 3.4.2 평가지표 계산 함수 작성

In [8]:
import numpy as np

def rmsle(y_true, y_pred, convertExp=True):
    # 지수변
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)
        
    # 로그변환 후 결측값을 0으로 변환
    log_true = np.nan_to_num(np.array([np.log(y+1) for y in y_true]))
    log_pred = np.nan_to_num(np.array([np.log(y+1) for y in y_pred]))
    
    # RMSLE 계산
    output = np.sqrt(np.mean((log_true - log_pred)**2))
    return output

## 3.4.3 모델 훈련

In [9]:
from sklearn.linear_model import LinearRegression

linear_reg_model = LinearRegression()

In [10]:
log_y = np.log1p(y)  # 타깃 값 로그변환
linear_reg_model.fit(X_train, log_y) # 훈련

LinearRegression()

## 3.4.4 모델 성능 검증

In [11]:
preds = linear_reg_model.predict(X_train)

In [12]:
print ('선형회귀의 RMSLE 값:', rmsle(log_y, preds, True))

선형회귀의 RMSLE 값: 0.9803697923313486


## 3.4.5 예측 및 결과 제출

In [13]:
linearreg_preds = linear_reg_model.predict(X_test) # 테스트 데이터로 예측

submission['count'] = np.exp(linearreg_preds) # 지수변환
submission.to_csv('submission.csv', index=False) # 파일로 저장