In [17]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import KFold
import lightgbm
from sklearn.metrics import mean_squared_error

In [18]:
# Directory holding one sub-folder per training day (e.g. "20200801/"), each
# containing a feature.csv. The "trian" spelling is referenced by later cells.
trian_path = "D:/PythonProject/didi_giscup/data/giscup_2021/processed_train/"
# Directory holding the feature.csv that predictions are made for.
test_path = "D:/PythonProject/didi_giscup/data/giscup_2021/20200901/"
# Directory the time-stamped submission CSV is written to.
result_path = "D:/PythonProject/didi_giscup/result/"

In [31]:
# Only the column names are needed here, so read just the header row
# (nrows=0) instead of parsing a full day of training data.
fe_list = list(pd.read_csv(trian_path + "20200801/feature.csv", nrows=0).columns)

# Empty frame carrying the training schema; read_data() appends the daily files to it.
train_data = pd.DataFrame(columns=fe_list)
# Test features are loaded without "ata", the column trian_model() uses as its label.
test_data = pd.read_csv(test_path + "feature.csv").drop(columns=["ata"])

def read_data(start: int, end: int):
    """Append the daily training feature files to the global ``train_data``.

    Parameters
    ----------
    start, end : int
        First and last day to load, inclusive. Each value is appended to
        ``"20200"`` to form the day's folder name (``801`` -> ``"20200801"``).

    Side effects
    ------------
    Rebinds the global ``train_data`` to its previous content followed by the
    rows of every loaded day (index reset to 0..n-1) and prints the day list
    and the train/test shapes.

    Raises
    ------
    FileNotFoundError
        If a day's ``feature.csv`` does not exist (raised by ``pd.read_csv``).
    """
    global train_data, test_data

    day_list = list(range(start, end + 1))
    # Day 803 is left out of any span of three or more days. Testing membership
    # instead of only ``start <= 803`` avoids a ValueError from list.remove()
    # when the requested span ends before 803.
    if 803 in day_list and (end - start) >= 2:
        day_list.remove(803)
    print("day list: ", day_list)

    # Read every day first and concatenate once: concatenating inside the loop
    # copies the accumulated frame again on every iteration.
    daily_frames = [
        pd.read_csv(trian_path + "20200" + str(d) + "/feature.csv")
        for d in day_list
    ]
    train_data = pd.concat([train_data, *daily_frames], ignore_index=True)
    print("train shape: ", train_data.shape)
    print("test shape: ", test_data.shape)


def mape_score(pred: list, true: list):
    """Return the mean absolute percentage error of ``pred`` against ``true``.

    Each element contributes ``|pred - true| / true``; the contributions are
    summed and divided by the number of predictions.
    """
    count = len(pred)
    frame = pd.DataFrame({"pred": pred, "true": true})
    # Relative error per row, measured against the true value.
    absolute_error = np.abs(frame["pred"] - frame["true"])
    relative_error = absolute_error / frame["true"]
    return relative_error.sum() / count


def trian_model():
    """Train LightGBM with K-fold cross-validation and write a submission CSV.

    Uses the global ``train_data`` / ``test_data`` frames: ``ata`` is the
    label and every other column except ``order_id`` is a feature. For each
    fold a model is fitted with early stopping on the held-out part; the
    out-of-fold predictions are scored with ``mape_score`` and the test
    predictions are averaged over the folds.

    Side effects
    ------------
    * Displays the column list and prints the out-of-fold MAPE.
    * Rebinds the global ``train_data`` with its feature columns cast to float.
    * Adds a ``result`` column to the global ``test_data`` and renames its
      ``order_id`` column to ``id``.
    * Writes the ``id`` / ``result`` columns to
      ``result_path + "submission<timestamp>.csv"``.
    """
    global train_data, test_data

    n_splits = 5
    stopping_rounds = 100
    log_period = 100

    # Cast the label as well as the features so the metric and the model both
    # work on plain floats regardless of the dtype the concatenation produced.
    label = train_data["ata"].astype(float)

    feature = list(train_data.columns)
    display(feature)
    feature.remove("order_id")
    feature.remove("ata")
    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` maps to the
    # same float64 dtype. One astype call converts all feature columns at once.
    train_data = train_data.astype({fe: float for fe in feature})

    train_df = train_data[feature]
    test_df = test_data[feature]

    # cross validation
    preds = np.zeros(train_df.shape[0])
    test_pred = np.zeros(test_df.shape[0])
    model = lightgbm.LGBMRegressor(n_estimators=10000, metric='mse')
    kf = KFold(n_splits=n_splits, random_state=418, shuffle=True)
    for train_id, valid_id in kf.split(train_df, label):

        train_x = train_df.iloc[train_id]
        valid_x = train_df.iloc[valid_id]
        train_y = label.iloc[train_id]
        valid_y = label.iloc[valid_id]

        # LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose``
        # keyword arguments of fit(); callbacks are the supported equivalent.
        # They are created per fold so no early-stopping state carries over.
        model.fit(
            train_x,
            train_y,
            eval_set=[(valid_x, valid_y)],
            callbacks=[
                lightgbm.early_stopping(stopping_rounds=stopping_rounds),
                lightgbm.log_evaluation(period=log_period),
            ],
        )
        preds[valid_id] = model.predict(valid_x, num_iteration=model.best_iteration_)
        test_pred += model.predict(test_df, num_iteration=model.best_iteration_)
    score = mape_score(preds, label)
    print("mape score: ", score)
    # Average over the actual number of folds instead of a hard-coded 5.
    test_data["result"] = test_pred / n_splits
    test_data = test_data.rename(columns={"order_id": "id"})

    # File name suffix: <month><day>_<hour>-<minute>-<second>, without zero padding.
    t = datetime.datetime.now()
    t_str = str(t.month) + str(t.day) + "_" + str(t.hour) + "-" + str(t.minute) + "-" + str(t.second)
    test_data[["id", "result"]].to_csv(result_path + "submission" + t_str + ".csv", index=False)
    

def main():
    """Load training days 801 through 804, then train and write predictions."""
    first_day, last_day = 801, 804
    read_data(first_day, last_day)
    trian_model()

In [32]:
# Run the pipeline: load the training days, then train and write the submission file.
main()

day list:  [801, 802, 804]
train shape:  (320618, 26)
test shape:  (288076, 25)


['order_id',
 'ata',
 'distance',
 'simple_eta',
 'driver_id',
 'slice_id',
 'link_cnt',
 'mean_distance',
 'speed_one',
 'link_time_sum',
 'link_time_mean',
 'link_time_max',
 'link_time_min',
 'speed_two',
 'link_cur_sta_mean',
 'link_cur_sta_sum',
 'link_cur_sta_mode',
 'cross_cnt',
 'cross_time_sum',
 'cross_time_mean',
 'cross_time_max',
 'cross_time_mode',
 'cross_from',
 'cross_to',
 'cross_from_last',
 'cross_to_last']

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 19215.7
[200]	valid_0's l2: 19086.9
[300]	valid_0's l2: 19004.9
[400]	valid_0's l2: 18949.3
[500]	valid_0's l2: 18942.9
[600]	valid_0's l2: 18924
[700]	valid_0's l2: 18890
[800]	valid_0's l2: 18876.9
[900]	valid_0's l2: 18858.5
[1000]	valid_0's l2: 18869.6
Early stopping, best iteration is:
[901]	valid_0's l2: 18858.5
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 19671.8
[200]	valid_0's l2: 19527.8
[300]	valid_0's l2: 19452.8
[400]	valid_0's l2: 19374.5
[500]	valid_0's l2: 19357.3
[600]	valid_0's l2: 19347.4
[700]	valid_0's l2: 19328.7
[800]	valid_0's l2: 19306.9
[900]	valid_0's l2: 19302.3
Early stopping, best iteration is:
[850]	valid_0's l2: 19294.7
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 20413.7
[200]	valid_0's l2: 20295.6
[300]	valid_0's l2: 20246.5
[400]	valid_0's l2: 20184.5
[500]	valid_0's l2: 20120.8
[600]	valid_0's l2:

In [None]:
# mape score:  0.15049685957592757
# NOTE(review): the value above is presumably the score printed by an earlier run — confirm.
# Display the accumulated training frame for inspection.
train_data

In [None]:
# Evaluate the current timestamp; the result is only displayed, not stored.
datetime.datetime.now()