In [1]:
import datetime
import os
import warnings

import lightgbm
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Input/output locations used by the rest of the notebook.
# The defaults are the original hard-coded local directories; set the
# GISCUP_TRAIN_DIR / GISCUP_TEST_DIR / GISCUP_RESULT_DIR environment variables
# to run on another machine without editing the code.
# os.path.join(p, "") appends a trailing separator only when one is missing,
# which matters because file names are built below by plain string
# concatenation (e.g. trian_path + "20200801/feature.csv").
# NOTE: the misspelled name `trian_path` is kept because other cells use it.
trian_path = os.path.join(
    os.environ.get(
        "GISCUP_TRAIN_DIR",
        "D:/PythonProject/didi_giscup/data/giscup_2021/processed_train/",
    ),
    "",
)
test_path = os.path.join(
    os.environ.get(
        "GISCUP_TEST_DIR",
        "D:/PythonProject/didi_giscup/data/giscup_2021/20200901/",
    ),
    "",
)
result_path = os.path.join(
    os.environ.get(
        "GISCUP_RESULT_DIR",
        "D:/PythonProject/didi_giscup/result/",
    ),
    "",
)

In [10]:
# Column names shared by the per-day training feature files. nrows=0 parses
# only the header row, so a whole day of data is not loaded just to learn
# the schema.
fe_list = list(pd.read_csv(trian_path + "20200801/feature.csv", nrows=0).columns)

# Empty accumulator carrying the training schema; read_data() appends the
# selected days to it.
train_data = pd.DataFrame(columns=fe_list)
# Test-day features with the "ata" column (the training label) removed, so
# the frame holds only the id and the model inputs.
test_data = pd.read_csv(test_path + "feature.csv").drop(columns=["ata"])

def read_data(start: int, end: int, skip_days: tuple = (803,)):
    """Load the per-day training feature files into the global ``train_data``.

    Days are given as month/day integers (e.g. ``801`` for August 1st) and are
    expanded to directory names of the form ``2020MMDD`` under ``trian_path``.

    Args:
        start: First day to load (inclusive), as ``MDD``/``MMDD``.
        end: Last day to load (inclusive), as ``MDD``/``MMDD``.
        skip_days: Days inside ``[start, end]`` that must not be loaded.
            Defaults to ``(803,)``, the day the original code excluded.

    Side effects:
        Appends the loaded rows to the global ``train_data`` (rows already
        present are kept, so calling this twice duplicates data) and resets
        its index. Prints the day list and the train/test shapes.

    Note:
        The range is a plain integer range, so a span that crosses a month
        boundary will contain non-existent days (e.g. 832) and fail on read.
    """
    global train_data, test_data

    day_list = [d for d in range(start, end + 1) if d not in skip_days]
    print("day list: ", day_list)

    # Zero-pad to MMDD so that both 801 -> "20200801" and 1001 -> "20201001"
    # resolve to the right directory.
    frames = [
        pd.read_csv(trian_path + "2020" + str(d).zfill(4) + "/" + "feature.csv")
        for d in day_list
    ]
    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all previously loaded rows on every iteration.
    train_data = pd.concat([train_data] + frames).reset_index(drop=True)

    print("train shape: ", train_data.shape)
    print("test shape: ", test_data.shape)


def mape_score(pred: list, true: list):
    """Mean absolute percentage error: ``mean(|pred - true| / true)``.

    The two inputs are compared element by element *by position*. Any pandas
    index they carry is ignored, so a prediction array and a label Series that
    was sliced out of a larger frame are still paired correctly.

    Args:
        pred: Predicted values (list, ndarray or Series).
        true: Ground-truth values of the same length. Zeros in ``true`` yield
            infinite terms, exactly as a plain division would.

    Returns:
        The MAPE as a float. NaN inputs propagate to the result rather than
        being silently dropped.

    Raises:
        ValueError: If the inputs are empty or differ in shape.
    """
    pred_arr = np.asarray(pred, dtype=float)
    true_arr = np.asarray(true, dtype=float)
    if pred_arr.shape != true_arr.shape:
        raise ValueError(
            "pred and true must have the same shape, got "
            + str(pred_arr.shape) + " and " + str(true_arr.shape)
        )
    if pred_arr.size == 0:
        raise ValueError("mape_score requires at least one sample")
    return np.mean(np.abs(pred_arr - true_arr) / true_arr)


def trian_model_cross_lgb():
    """Train LightGBM with 5-fold cross-validation and average test predictions.

    Uses every column of the global ``train_data`` except ``order_id`` and
    ``ata`` as features and ``ata`` as the regression target. For each fold a
    model is fitted with early stopping on the held-out part; its predictions
    fill the out-of-fold vector and are accumulated for the test set.

    Side effects:
        * casts the feature columns of the global ``train_data`` to float;
        * prints the column list, LightGBM's evaluation log and the
          out-of-fold MAPE;
        * adds a ``result`` column (mean of the fold predictions) to the
          global ``test_data`` and renames its ``order_id`` column to ``id``.

    Writing the submission CSV is currently disabled in this function.
    (The misspelled function name is kept because ``main`` calls it.)
    """
    global train_data, test_data
    # Ensure a numeric target regardless of the dtype the concatenated
    # training frame ended up with.
    label = train_data["ata"].astype(float)

    print("columns len: ", len(train_data.columns))
    print("columns: ", train_data.columns)
    feature = [col for col in train_data.columns if col not in ("order_id", "ata")]
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the
    # equivalent dtype and works on every NumPy version.
    train_data = train_data.astype(dict.fromkeys(feature, float))

    # label_encode = LabelEncoder()
    # label_encode.fit(train_data["driver_id"])
    # test_data["driver_id"] = test_data["driver_id"].map(lambda x: -1 if x not in label_encode.classes_ else x)
    # label_encode.classes_ = np.append(label_encode.classes_, -1)
    # train_data["driver_id"] = label_encode.transform(train_data["driver_id"])
    # test_data["driver_id"] = label_encode.transform(test_data["driver_id"])

    train_df = train_data[feature]
    test_df = test_data[feature]

    # cross validation
    n_splits = 5
    oof_pred = np.zeros(train_df.shape[0])    # out-of-fold predictions
    test_pred = np.zeros(test_df.shape[0])    # running sum over folds
    model = lightgbm.LGBMRegressor(n_estimators=10000, metric='mape')
    kf = KFold(n_splits=n_splits, random_state=418, shuffle=True)
    for train_id, valid_id in kf.split(train_df, label):
        train_x = train_df.iloc[train_id]
        valid_x = train_df.iloc[valid_id]
        train_y = label.iloc[train_id]
        valid_y = label.iloc[valid_id]

        # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments
        # were removed in LightGBM 4.0; switch to the
        # `callbacks=[lightgbm.early_stopping(...), lightgbm.log_evaluation(...)]`
        # form when upgrading.
        model.fit(train_x, train_y, eval_set=(valid_x, valid_y), early_stopping_rounds=100, verbose=100)
        oof_pred[valid_id] = model.predict(valid_x, num_iteration=model.best_iteration_)
        test_pred += model.predict(test_df, num_iteration=model.best_iteration_)

    score = mape_score(oof_pred, label)
    print("mape score by five: ", score)
    # Divide by the actual fold count instead of a separate literal so the
    # average stays correct if n_splits is changed.
    test_data["result"] = test_pred / n_splits
    test_data = test_data.rename(columns={"order_id": "id"})

    t = datetime.datetime.now()
    t_str = str(t.month) + str(t.day) + "_" + str(t.hour) + "-" + str(t.minute) + "-" + str(t.second)
    # Submission writing is disabled here; uncomment the next line to save it.
    # test_data[["id", "result"]].to_csv(result_path + "submission" + t_str + ".csv", index=False)


def train_model_one_lgb():
    """Train one LightGBM model on a single hold-out split and write a submission.

    Uses every column of the global ``train_data`` except ``order_id`` and
    ``ata`` as features and ``ata`` as the target. The last 20% of the rows
    (in their current order, no shuffling) serve as the validation set for
    early stopping and for the reported MAPE.

    Side effects:
        * casts the feature columns of the global ``train_data`` to float;
        * prints the column list, LightGBM's evaluation log and the
          validation MAPE;
        * adds a ``result`` column to the global ``test_data`` and renames
          its ``order_id`` column to ``id``;
        * writes ``id``/``result`` to a timestamped
          ``submission<timestamp>.csv`` under ``result_path``.
    """
    global train_data, test_data
    # Ensure a numeric target regardless of the dtype the concatenated
    # training frame ended up with.
    label = train_data["ata"].astype(float)

    print("columns len: ", len(train_data.columns))
    print("columns: ", train_data.columns)
    feature = [col for col in train_data.columns if col not in ("order_id", "ata")]
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the
    # equivalent dtype and works on every NumPy version.
    train_data = train_data.astype(dict.fromkeys(feature, float))

    train_df = train_data[feature]
    test_df = test_data[feature]
    model = lightgbm.LGBMRegressor(n_estimators=10000, metric='mape')
    # shuffle=False makes the split deterministic (first 80% / last 20%), so
    # the former random_state argument had no effect and is omitted.
    train_x, valid_x, train_y, valid_y = train_test_split(
        train_df, label, test_size=0.2, shuffle=False
    )
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments were
    # removed in LightGBM 4.0; switch to `callbacks=[...]` when upgrading.
    model.fit(train_x, train_y, eval_set=(valid_x, valid_y), early_stopping_rounds=100, verbose=100)

    valid_pred = model.predict(valid_x, num_iteration=model.best_iteration_)
    score = mape_score(valid_pred, valid_y)
    print("mape score by one: ", score)

    test_data["result"] = model.predict(test_df, num_iteration=model.best_iteration_)
    test_data = test_data.rename(columns={"order_id": "id"})

    t = datetime.datetime.now()
    t_str = str(t.month) + str(t.day) + "_" + str(t.hour) + "-" + str(t.minute) + "-" + str(t.second)
    test_data[["id", "result"]].to_csv(result_path + "submission" + t_str + ".csv", index=False)


def train_model_ctb():
    """Train one LightGBM model (MAE metric, learning rate 0.5) and write a submission.

    Despite the ``ctb`` suffix, this function fits a
    ``lightgbm.LGBMRegressor``; it differs from ``train_model_one_lgb`` only
    in the evaluation metric (``'mae'``) and ``learning_rate=0.5``.

    Uses every column of the global ``train_data`` except ``order_id`` and
    ``ata`` as features and ``ata`` as the target. The last 20% of the rows
    (in their current order, no shuffling) serve as the validation set for
    early stopping and for the reported MAPE.

    Side effects:
        * casts the feature columns of the global ``train_data`` to float;
        * prints the column list, LightGBM's evaluation log and the
          validation MAPE;
        * adds a ``result`` column to the global ``test_data`` and renames
          its ``order_id`` column to ``id``;
        * writes ``id``/``result`` to a timestamped
          ``submission<timestamp>.csv`` under ``result_path``.
    """
    global train_data, test_data
    # Ensure a numeric target regardless of the dtype the concatenated
    # training frame ended up with.
    label = train_data["ata"].astype(float)

    print("columns len: ", len(train_data.columns))
    print("columns: ", train_data.columns)
    feature = [col for col in train_data.columns if col not in ("order_id", "ata")]
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the
    # equivalent dtype and works on every NumPy version.
    train_data = train_data.astype(dict.fromkeys(feature, float))

    train_df = train_data[feature]
    test_df = test_data[feature]
    model = lightgbm.LGBMRegressor(n_estimators=10000, metric='mae', learning_rate=0.5)
    # shuffle=False makes the split deterministic (first 80% / last 20%), so
    # the former random_state argument had no effect and is omitted.
    train_x, valid_x, train_y, valid_y = train_test_split(
        train_df, label, test_size=0.2, shuffle=False
    )
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments were
    # removed in LightGBM 4.0; switch to `callbacks=[...]` when upgrading.
    model.fit(train_x, train_y, eval_set=(valid_x, valid_y), early_stopping_rounds=100, verbose=100)

    valid_pred = model.predict(valid_x, num_iteration=model.best_iteration_)
    score = mape_score(valid_pred, valid_y)
    print("mape score by one: ", score)

    test_data["result"] = model.predict(test_df, num_iteration=model.best_iteration_)
    test_data = test_data.rename(columns={"order_id": "id"})

    t = datetime.datetime.now()
    t_str = str(t.month) + str(t.day) + "_" + str(t.hour) + "-" + str(t.minute) + "-" + str(t.second)
    test_data[["id", "result"]].to_csv(result_path + "submission" + t_str + ".csv", index=False)


def main():
    """Entry point: load training days 801 through 804, then run the
    cross-validated LightGBM training."""
    first_day, last_day = 801, 804
    read_data(first_day, last_day)
    trian_model_cross_lgb()

In [11]:
# Run the pipeline: load the training days and start cross-validated training.
main()

day list:  [801, 802, 804]
train shape:  (320618, 37)
test shape:  (288076, 36)
columns len:  37
columns:  Index(['order_id', 'ata', 'distance', 'simple_eta', 'driver_id', 'slice_id',
       'weather', 'hightemp', 'lowtemp', 'temp_sub', 'slice_1m', 'slice_30m',
       'slice_1h', 'link_cnt', 'mean_distance', 'speed_one', 'link_time_sum',
       'link_time_mean', 'link_time_max', 'link_time_min', 'speed_two',
       'link_cur_sta_mean', 'link_cur_sta_sum', 'conges_cnt', 'conges_sum',
       'amble_cnt', 'amble_sum', 'cross_cnt', 'cross_time_sum',
       'cross_time_mean', 'cross_time_max', 'cross_time_mode', 'cross_from',
       'cross_to', 'cross_from_last', 'cross_to_last', 'link_time_sum_ratio'],
      dtype='object')
Training until validation scores don't improve for 100 rounds
[100]	valid_0's mape: 0.151706
[200]	valid_0's mape: 0.150825
[300]	valid_0's mape: 0.150442
[400]	valid_0's mape: 0.150245
[500]	valid_0's mape: 0.150087
[600]	valid_0's mape: 0.149938
[700]	valid_0's mape: 

In [43]:
# five-fold mape score, three days: 0.14973174142905518
# five-fold mape score, all days: 0.14015414626974915 --- 0.14508
# single-split mape score, all days: 0.13986354280512076  --- 0.14572
# train_data

In [None]:
# Scratch cell: display the current local date and time.
datetime.datetime.now()