In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV

## 기본 데이터

In [62]:
# Load the baseline training table; the path is relative to the notebook's working directory.
train = pd.read_csv('data/train.csv')

In [64]:
# Target is the 18~20 ride count; the features are every remaining column
# except the date and the two other columns dropped by name below.
y = train['18~20_ride']
X = train.drop(columns=['18~20_ride', 'date', 'in_out', 'station_name'])

# Fixed-seed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [65]:
# Score a 100-tree random forest with 3-fold cross-validation on the full
# X, y (the hold-out split is not used by this cell).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
neg_mse_scores = cross_val_score(
    rf,
    X,
    y,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('Negative MSE scores: ', np.round(neg_mse_scores, 2))
print('개별 RMSE scores : ', np.round(rmse_scores, 2))
print('평균 RMSE : {0:.3f}'.format(avg_rmse))

Negative MSE scores:  [-12.06 -10.43 -11.  ]
개별 RMSE scores :  [3.47 3.23 3.32]
평균 RMSE : 3.340


- 평균 RMSE (3-fold CV, 기본 데이터): 3.340

## 시간 데이터 정보 추가 (ID 컬럼도 제거)

In [38]:
# Load the training table for the time-feature stage (per the section heading);
# index_col=0 uses the first CSV column as the DataFrame index.
train = pd.read_csv('data/train_time.csv', index_col=0) 

In [39]:
# Target is the 18~20 ride count; this stage also drops the 'id' column
# along with the date and the two other columns listed below.
y = train['18~20_ride']
X = train.drop(columns=['18~20_ride', 'id', 'date', 'in_out', 'station_name'])

# Fixed-seed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [40]:
# Score a 100-tree random forest with 3-fold cross-validation on the full
# X, y (the hold-out split is not used by this cell).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
neg_mse_scores = cross_val_score(
    rf,
    X,
    y,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('Negative MSE scores: ', np.round(neg_mse_scores, 2))
print('개별 RMSE scores : ', np.round(rmse_scores, 2))
print('평균 RMSE : {0:.3f}'.format(avg_rmse))

Negative MSE scores:  [-7.86 -9.36 -7.73]
개별 RMSE scores :  [2.8  3.06 2.78]
평균 RMSE : 2.881


- 평균 RMSE (3-fold CV): 2.881 (이전 단계 3.340)

## 날짜, 승하차인원(시간별), 노선ID, 정류장ID 전처리 추가

In [6]:
# Load the table for the stage that adds date / hourly ride-takeoff / route-ID /
# station-ID preprocessing (per the section heading); index_col=0 uses the first
# CSV column as the DataFrame index.
train = pd.read_csv('data/base_model.csv',  index_col=0)

In [53]:
# Target is the 18~20 ride count. The features exclude the identifier, the
# date, the target, and the raw hourly ride/takeoff count columns.
y = train['18~20_ride']
X = train.drop(
    columns=[
        'id', 'date', '18~20_ride',
        '6~7_ride', '7~8_ride', '8~9_ride',
        '9~10_ride', '10~11_ride', '11~12_ride',
        '6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
        '9~10_takeoff', '10~11_takeoff', '11~12_takeoff',
    ]
)

In [54]:
# Fixed-seed 80/20 hold-out split of the current X, y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [55]:
# Score a 100-tree random forest with 3-fold cross-validation on the full
# X, y (the hold-out split is not used by this cell).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
neg_mse_scores = cross_val_score(
    rf,
    X,
    y,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('Negative MSE scores: ', np.round(neg_mse_scores, 2))
print('개별 RMSE scores : ', np.round(rmse_scores, 2))
print('평균 RMSE : {0:.3f}'.format(avg_rmse))

Negative MSE scores:  [-7.32 -8.99 -7.32]
개별 RMSE scores :  [2.71 3.   2.71]
평균 RMSE : 2.803


- 평균 RMSE (3-fold CV): 2.803 (이전 단계 2.881)

## 위,경도 거리차이 정보 추가

In [54]:
# Load the table for the stage that adds latitude/longitude distance-difference
# features (per the section heading); index_col=0 uses the first CSV column as
# the DataFrame index.
train = pd.read_csv('data/train_lalo.csv',  index_col=0)

In [48]:
# Only the target is removed here, so every other column in the file is
# used as a feature.
y = train['18~20_ride']
X = train.drop(columns=['18~20_ride'])

In [49]:
# Fixed-seed 80/20 hold-out split of the current X, y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [50]:
# Score a 100-tree random forest with 3-fold cross-validation on the full
# X, y (the hold-out split is not used by this cell).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
neg_mse_scores = cross_val_score(
    rf,
    X,
    y,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('Negative MSE scores: ', np.round(neg_mse_scores, 2))
print('개별 RMSE scores : ', np.round(rmse_scores, 2))
print('평균 RMSE : {0:.3f}'.format(avg_rmse))

Negative MSE scores:  [-7.1 -8.8 -7.2]
개별 RMSE scores :  [2.66 2.97 2.68]
평균 RMSE : 2.772


- 평균 RMSE (3-fold CV): 2.772 (이전 단계 2.803)

## 요일별 18~20시 탑승 평균, 합 정보 추가

In [66]:
# Load the table for the stage that adds per-weekday mean/sum of 18~20 rides
# (per the section heading); index_col=0 uses the first CSV column as the
# DataFrame index.
# NOTE(review): if those aggregates were computed from the target over the full
# training set, they leak target information into every CV fold and the score
# below may be optimistic — confirm how this file was built.
train = pd.read_csv('data/train_1820.csv',  index_col=0) 

In [59]:
# Only the target is removed here, so every other column in the file is
# used as a feature.
y = train['18~20_ride']
X = train.drop(columns=['18~20_ride'])

In [60]:
# Fixed-seed 80/20 hold-out split of the current X, y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [61]:
# Score a 100-tree random forest with 3-fold cross-validation on the full
# X, y (the hold-out split is not used by this cell).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
neg_mse_scores = cross_val_score(
    rf,
    X,
    y,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)

# The scorer returns negated MSE, so flip the sign before taking the root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('Negative MSE scores: ', np.round(neg_mse_scores, 2))
print('개별 RMSE scores : ', np.round(rmse_scores, 2))
print('평균 RMSE : {0:.3f}'.format(avg_rmse))

Negative MSE scores:  [-6.81 -8.7  -6.85]
개별 RMSE scores :  [2.61 2.95 2.62]
평균 RMSE : 2.726


- 평균 RMSE (3-fold CV): 2.726 (이전 단계 2.772)