In [24]:
import pandas as pd

In [25]:
raw = pd.read_csv('delivery_raw.csv', sep='\t')

In [26]:
raw_dropna_label = raw.dropna(subset=["actual_delivery_time"])

# 데이터 전처리와 속성 생성


### 시간 형식 변환
- 주문처리시간 (`estimated_order_place_duration`), 배달소요시간 (`estimated_store_to_consumer_driving_duration`)을 통해
- 배달에 소요된 총 시간을 구할 수 있음
- 더 이상 필요없는 주문처리시간, 배달소요시간 속성 제거

In [27]:
raw_dropna_label['delivery_time'] = (raw_dropna_label['estimated_order_place_duration'] + raw_dropna_label['estimated_store_to_consumer_driving_duration'])
new_data = raw_dropna_label.drop(['estimated_order_place_duration', 'estimated_store_to_consumer_driving_duration'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_dropna_label['delivery_time'] = (raw_dropna_label['estimated_order_place_duration'] + raw_dropna_label['estimated_store_to_consumer_driving_duration'])


- 주문이 들어온 시간 (`created_at`)은 당시 매장의 상황, 교통상황 등 배달소요시간에 영향을 미칠 수 있으므로 학습에 사용
    - 단, 실제로는 날짜 또한 영향을 미칠 수 있지만 (공휴일 등)
    - 간단하게 하기위해 요일, 시, 분 만을 사용
- 더 이상 필요없는 주문 생성 시간, 실제 도착시간 제거

In [28]:
created_at = pd.to_datetime(new_data['created_at'])
new_data['created_hour'] = created_at.dt.hour
new_data['created_minute'] = created_at.dt.minute
new_data['created_weekday'] = created_at.dt.weekday
new_data = new_data.drop(['created_at', 'actual_delivery_time'], axis=1)

new_data

Unnamed: 0,market_id,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift,total_busy,total_outstanding_orders,delivery_time,created_hour,created_minute,created_weekday
0,1.0,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,1307.0,22,24,4
1,2.0,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,1136.0,21,49,1
2,3.0,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,1136.0,20,39,3
3,3.0,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,735.0,21,21,1
4,3.0,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,1096.0,2,40,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197423,1.0,2956,fast,4.0,3,1389,3,345,649,17.0,17.0,23.0,582.0,0,19,1
197424,1.0,2956,fast,4.0,6,3010,4,405,825,12.0,11.0,14.0,1166.0,0,1,4
197425,1.0,2956,fast,4.0,5,1836,3,300,399,39.0,41.0,40.0,1046.0,4,46,5
197426,1.0,3630,sandwich,1.0,1,1175,1,535,535,7.0,7.0,12.0,830.0,18,18,6


- 기타 의미가 없어보이는 속성들 제거
    - `store_id`, `store_primary_category`, `order_protocol`

In [29]:
new_data = new_data.drop(['store_id', 'store_primary_category', 'order_protocol'], axis=1)
new_data = new_data.dropna()
new_data[new_data.isna().any(axis=1)]

Unnamed: 0,market_id,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift,total_busy,total_outstanding_orders,delivery_time,created_hour,created_minute,created_weekday


- X, y로 나누기
    - `delivery_time`만 y, 나머지 X

In [30]:
X_data = new_data.drop('delivery_time', axis=1)
y_data = pd.to_numeric(new_data['delivery_time'])

print("X_data\n", X_data.shape)
print("y_data\n", y_data.shape)

X_data
 (179759, 12)
y_data
 (179759,)


- 스케일링
    -  X에는 `StandardScaler` 적용
    -  y에는 `MinMaxScaler` 적용

In [31]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
X = scaler.fit_transform(X_data)
print(X.shape)

mmscaler = MinMaxScaler()
y = mmscaler.fit_transform(y_data.values.reshape(-1, 1))
print(y.shape)

(179759, 12)
(179759, 1)


- train / test 로 나누기

In [32]:
n = int(X.shape[0] * 0.9)
X_train, y_train = X[:n], y[:n]
X_test, y_test = X[n:], y[n:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(161783, 12) (161783, 1)
(17976, 12) (17976, 1)


# 학습
- `delivery_time`을 맞춰야하는 회귀모델이므로, scikit-learn의 `LinearRegression` 사용

In [33]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

# 모델 평가
- MSE, MAE, RMSE, R2로 평가

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

MSE: 0.005927712767656015
MAE: 0.06281755383901394
RMSE: 0.07699164089468424
R^2: 0.01513482731412974


- Under-prediction 비율

In [41]:
import numpy as np

under_predictions = y_pred > y_test
num_under_predictions = np.sum(under_predictions)

under_predictions_rate = 100 * num_under_predictions / len(y_test)
print(under_predictions_rate, "%")

50.85669781931464 %
