In [7]:
from cainiao_utils import *  # 引用寫好的 function
import datetime as dt
import gc
import numpy as np
import pandas as pd
import pathlib
import random
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Configuration: data locations, raw-file column schemas and the tag used in output file names.
rawdata_folderpath = 'Cainiao_dataset/'
output_folderpath = 'Cainiao_preprocessed/'
order_filepath = 'msom_order_data_1.csv'
logistic_filepath = 'msom_logistic_detail_1.csv'

# Column names for the logistics-detail CSV.
logistic_cols = [
    'order_id',
    'order_date',
    'logistic_order_id',
    'action',
    'facility_id',
    'facility_type',
    'city_id',
    'logistic_company_id',
    'timestamp',
]

# Column names for the order CSV.
order_cols = [
    'day',
    'order_id',
    'item_det_info',
    'pay_timestamp',
    'buyer_id',
    'promise_speed',
    'if_cainiao',
    'merchant_id',
    'Logistics_review_score',
]

# Run date formatted as YYYYMMDD, used to tag the files produced by this notebook.
produce_date = dt.date.today().strftime('%Y%m%d')
# Value embedded in output file names as "sentday_<n>".
sent_day = 10
file_name = '{}-sentday_{}'.format(produce_date, sent_day)



#### 創建資料夾及載入資料集

In [8]:
# Create the output folder if it does not exist locally yet.
path = pathlib.Path(output_folderpath)
path.mkdir(exist_ok=True, parents=True)

# Load the order table. The file is read without a header row, so the column
# names are assigned afterwards. Columns that are not used downstream are
# dropped, and only orders that carry a logistics review score are kept.
order = pd.read_csv(f'{rawdata_folderpath}{order_filepath}', header=None)
order.columns = order_cols
order = (
    order
    .drop(columns=['day', 'item_det_info', 'buyer_id', 'merchant_id'])
    .dropna(subset=['Logistics_review_score'])
)

# Load the logistics progress table (also read without a header row).
logistic = pd.read_csv(f'{rawdata_folderpath}{logistic_filepath}', header=None)
logistic.columns = logistic_cols



FileNotFoundError: [Errno 2] No such file or directory: 'Cainiao_dataset/msom_order_data_1.csv'

In [None]:
# Fixed seed so that both sampling stages, and therefore every CSV written by
# this cell, are reproducible across runs. The value itself is arbitrary.
RANDOM_SEED = 42
_sampling_rng = np.random.RandomState(RANDOM_SEED)


def _sample_order_ids(df, scores, n_per_score, rng):
    """Randomly pick up to ``n_per_score`` distinct order ids for each review score.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns ``order_id`` and ``Logistics_review_score``.
    scores : iterable
        Review-score values to sample from, processed in the given order.
    n_per_score : int
        Number of distinct order ids wanted for each score.
    rng : np.random.RandomState
        Random source used for the draws (sampling is without replacement).

    Returns
    -------
    np.ndarray
        The sampled order ids of all scores, concatenated in the order of ``scores``.

    Notes
    -----
    When a score has fewer than ``n_per_score`` distinct orders, all of them are
    taken and a message is printed, instead of failing with ``ValueError``
    ("Cannot take a larger sample than population when 'replace=False'").
    """
    picked = []
    for score in scores:
        pool = df.loc[df['Logistics_review_score'] == score, 'order_id'].unique()
        n_take = min(n_per_score, len(pool))
        if n_take < n_per_score:
            print(f'Warning: only {len(pool)} orders with review score {score}; '
                  f'requested {n_per_score}, sampling {n_take}.')
        # An empty pool is appended as-is so the draw is never called on an empty array.
        picked.append(rng.choice(a=pool, size=n_take, replace=False) if n_take else pool)
    return np.concatenate(picked)


# January holds more than 14 million orders; sample 30,000 orders from each
# review score (1-5) to reduce computation time.
sample_oids = _sample_order_ids(df=order, scores=range(1, 6), n_per_score=30000, rng=_sampling_rng)
order = order[order['order_id'].isin(sample_oids)]
print('Shape of order: ', order.shape)
print(order.head())

# Merge the logistics progress records with their orders. Rows whose order was
# not sampled get a null review score from the left merge and are removed.
# NOTE(review): an inner merge (or filtering `logistic` by `sample_oids` first)
# would avoid the large intermediate frame, but yields a different row index —
# confirm that compute_sent_days does not rely on the index before changing it.
logistic = pd.merge(logistic, order, on='order_id', how='left')
logistic = logistic[logistic['Logistics_review_score'].notnull()]
logistic = logistic.drop_duplicates()
print('Shape of logistic: ', logistic.shape)
print(logistic.head())
# Save for later use.
logistic.to_csv(rawdata_folderpath + f'order_logistic_log-{produce_date}.csv', index=False)

# Free memory: the order table is no longer needed.
del order
gc.collect()


# Compute the shipping duration of every order (helper from cainiao_utils).
df_sent_times = compute_sent_days(df=logistic)
print(df_sent_times.groupby(['sent_duration']).size().reset_index().rename(columns={0: 'count'}))
df_sent_times.to_csv(output_folderpath + 'cainiao-sent_times.csv', index=False)

# Keep only the orders whose shipping duration is labelled '10+'.
# NOTE(review): the literal presumably corresponds to `sent_day` (10) from the
# config cell — confirm the label format produced by compute_sent_days before
# deriving it from `sent_day`.
sent_duration_10dayup = df_sent_times[df_sent_times['sent_duration']=='10+'].order_id.unique()
logistic = logistic[logistic['order_id'].isin(sent_duration_10dayup)]

# Randomly sample 1,000 orders for each of the review scores 1 and 5.
sample_oids = _sample_order_ids(df=logistic, scores=[1, 5], n_per_score=1000, rng=_sampling_rng)
logistic = logistic[logistic['order_id'].isin(sample_oids)]

# Save the sampled records together with their shipping duration for later use.
logistic = pd.merge(logistic, df_sent_times, on='order_id', how='left')
logistic.to_csv(output_folderpath + f'Cainiao-sampledData_reviewscore-{file_name}.csv', index=False)

# Convert every sampled order into a logistics-state sequence (helper from
# cainiao_utils; per the original note, the share of time spent in each state
# determines how many sequence positions it occupies).
actions_collect, reviews_collect = generate_logistic_state_sequence(df=logistic, order_id_list=sample_oids)

# Store the logistics-state sequences as .csv: the first column is the order id,
# the remaining ones are named action_0 .. action_{k-1}.
action_cols = ['action_'+str(x) for x in range(0, pd.DataFrame(actions_collect).shape[1]-1)]
action_cols = ['order_id'] + action_cols
order_logistic_states = pd.DataFrame(actions_collect, columns=action_cols)
order_logistic_states['review_score'] = reviews_collect
# Reuse the shared `file_name` tag ("<produce_date>-sentday_<sent_day>") so this
# output is named consistently with the sampled-data file written earlier in this cell.
order_logistic_states.to_csv(output_folderpath + f'order_logistic_states-{file_name}.csv', index=False)
print(order_logistic_states.groupby(['review_score']).size())
print(order_logistic_states.shape)
print(order_logistic_states.head())
