In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score
import lightgbm as lgb
from collections import Counter
import warnings
import pickle
warnings.filterwarnings("ignore")

In [4]:
def get_base_info(x):
    return [i.split(':')[-1] for i in x.split(' ')]

def get_speed(x):
    return np.array([i.split(',')[0] for i in x], dtype='float16')

def get_eta(x):
    return np.array([i.split(',')[1] for i in x], dtype='float16')

def get_state(x):
    return [int(i.split(',')[2]) for i in x]

def get_cnt(x):
    return np.array([i.split(',')[3] for i in x], dtype='int16')

In [5]:
def gen_feats(path, mode='is_train'):
    df = pd.read_csv(path, sep=';', header=None)
    df['link'] = df[0].apply(lambda x: x.split(' ')[0])
    if mode == 'is_train':
        df['label'] = df[0].apply(lambda x: int(x.split(' ')[1]))
        df['label'] = df['label'].apply(lambda x: 3 if x > 3 else x)
        df['label'] -= 1
        df['current_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[2]))
        df['future_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[3]))
    else:
        df['label'] = -1
        df['current_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[2]))
        df['future_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[3]))

    df['time_diff'] = df['future_slice_id'] - df['current_slice_id']

    df['curr_state'] = df[1].apply(lambda x: x.split(' ')[-1].split(':')[-1])
    df['curr_speed'] = df['curr_state'].apply(lambda x: x.split(',')[0])
    df['curr_eta'] = df['curr_state'].apply(lambda x: x.split(',')[1])
    df['curr_cnt'] = df['curr_state'].apply(lambda x: x.split(',')[3])
    df['curr_state'] = df['curr_state'].apply(lambda x: x.split(',')[2])
    del df[0]

    for i in tqdm(range(1, 6)):
        df['his_info'] = df[i].apply(get_base_info)
        if i == 1:
            flg = 'current'
        else:
            flg = f'his_{(6 - i) * 7}'
        df['his_speed'] = df['his_info'].apply(get_speed)
        df[f'{flg}_speed_min'] = df['his_speed'].apply(lambda x: x.min())
        df[f'{flg}_speed_max'] = df['his_speed'].apply(lambda x: x.max())
        df[f'{flg}_speed_mean'] = df['his_speed'].apply(lambda x: x.mean())
        df[f'{flg}_speed_std'] = df['his_speed'].apply(lambda x: x.std())

        df['his_eta'] = df['his_info'].apply(get_eta)
        df[f'{flg}_eta_min'] = df['his_eta'].apply(lambda x: x.min())
        df[f'{flg}_eta_max'] = df['his_eta'].apply(lambda x: x.max())
        df[f'{flg}_eta_mean'] = df['his_eta'].apply(lambda x: x.mean())
        df[f'{flg}_eta_std'] = df['his_eta'].apply(lambda x: x.std())

        df['his_cnt'] = df['his_info'].apply(get_cnt)
        df[f'{flg}_cnt_min'] = df['his_cnt'].apply(lambda x: x.min())
        df[f'{flg}_cnt_max'] = df['his_cnt'].apply(lambda x: x.max())
        df[f'{flg}_cnt_mean'] = df['his_cnt'].apply(lambda x: x.mean())
        df[f'{flg}_cnt_std'] = df['his_cnt'].apply(lambda x: x.std())

        df['his_state'] = df['his_info'].apply(get_state)
        df[f'{flg}_state'] = df['his_state'].apply(lambda x: Counter(x).most_common()[0][0])
        df.drop([i, 'his_info', 'his_speed', 'his_eta', 'his_cnt', 'his_state'], axis=1, inplace=True)
    if mode == 'is_train':
        df.to_csv(f"{mode}_{path.split('/')[-1]}", index=False)
    else:
        df.to_csv(f"is_test.csv", index=False)

In [6]:
def f1_score_eval(preds, valid_df):
    labels = valid_df.get_label()
    preds = np.argmax(preds.reshape(3, -1), axis=0)
    scores = f1_score(y_true=labels, y_pred=preds, average=None)
    scores = scores[0]*0.2+scores[1]*0.2+scores[2]*0.8
    return 'f1_score', scores, True

In [7]:
def lgb_train(train_: pd.DataFrame, test_: pd.DataFrame, use_train_feats: list, id_col: str, label: str,
              n_splits: int, split_rs: int, is_shuffle=True, use_cart=False, cate_cols=None) -> pd.DataFrame:
    if not cate_cols:
        cate_cols = []
    print('data shape:\ntrain--{}\ntest--{}'.format(train_.shape, test_.shape))
    print('Use {} features ...'.format(len(use_train_feats)))
    print('Use lightgbm to train ...')
    n_class = train_[label].nunique()
    train_[f'{label}_pred'] = 0
    test_pred = np.zeros((test_.shape[0], n_class))
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = use_train_feats

    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=split_rs)
    train_user_id = train_[id_col].unique()

    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'None',
        'num_leaves': 31,
        'num_class': n_class,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print('the {} training start ...'.format(n_fold))
        train_x, train_y = train_.loc[train_[id_col].isin(train_user_id[train_idx]), use_train_feats], train_.loc[
            train_[id_col].isin(train_user_id[train_idx]), label]
        valid_x, valid_y = train_.loc[train_[id_col].isin(train_user_id[valid_idx]), use_train_feats], train_.loc[
            train_[id_col].isin(train_user_id[valid_idx]), label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')

        if use_cart:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
        else:
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=5000,
            valid_sets=[dvalid],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=f1_score_eval
        )
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance(importance_type='gain')
        train_.loc[train_[id_col].isin(train_user_id[valid_idx]), f'{label}_pred'] = np.argmax(
            clf.predict(valid_x, num_iteration=clf.best_iteration), axis=1)
        test_pred += clf.predict(test_[use_train_feats], num_iteration=clf.best_iteration) / folds.n_splits

    report = f1_score(train_[label], train_[f'{label}_pred'], average=None)
    print(classification_report(train_[label], train_[f'{label}_pred'], digits=4))
    print('Score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    test_[f'{label}_pred'] = np.argmax(test_pred, axis=1)
    test_[label] = np.argmax(test_pred, axis=1)+1
    five_folds = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[five_folds].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    print(fold_importance_df[['Feature', 'avg_imp']].head(20))
    return test_[[id_col, 'current_slice_id', 'future_slice_id', label]],clf

In [8]:
def save(model, filename):
    with open(filename, "wb") as f:
        pickle.dump(model, f)
def load(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [9]:
attr = pd.read_csv('attr.txt', sep='\t',
                           names=['link', 'length', 'direction', 'path_class', 'speed_class', 'LaneNum', 'speed_limit',
                                  'level', 'width'], header=None)

In [15]:
if __name__ == "__main__":
    for i in range(10,20):
        train_path = './traffic/201907' + str(i) + '.txt'
        test_path = 'test.txt'
        gen_feats(train_path, mode='is_train')
        gen_feats(test_path, mode='is_test')
    
        train = pd.read_csv('is_train_201907' + str(i) + '.txt')
        test = pd.read_csv('is_test.csv')
        train = train.merge(attr, on='link', how='left')
        test = test.merge(attr, on='link', how='left')
    
        use_cols = [i for i in train.columns if i not in ['link', 'label', 'current_slice_id', 'future_slice_id', 'label_pred']]
    
        sub,clf = lgb_train(train, test, use_cols, 'link', 'label', 5, 2020)
        save(clf,'model/'+ '201907' + str(i) + '.txt')
        #sub.to_csv('public_baseline.csv', index=False, encoding='utf8')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:34<00:00, 54.85s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 94.73it/s]


data shape:
train--(516024, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10624
for valid user:2656
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.678842
[200]	valid_0's f1_score: 0.68872
[300]	valid_0's f1_score: 0.690819
Early stopping, best iteration is:
[235]	valid_0's f1_score: 0.691603
the 2 training start ...
for train user:10624
for valid user:2656
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.673832
[200]	valid_0's f1_score: 0.682784
[300]	valid_0's f1_score: 0.685471
[400]	valid_0's f1_score: 0.687258
Early stopping, best iteration is:
[394]	valid_0's f1_score: 0.687611
the 3 training start ...
for train user:10624
for valid user:2656
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.643849
[200]	valid_0's f1_score: 0.648207
[300]	valid_0's f1_score: 0.649223
[400]	valid_0's f1_score: 0.651158


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:12<00:00, 50.50s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 109.59it/s]


data shape:
train--(521820, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10712
for valid user:2678
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.695048
[200]	valid_0's f1_score: 0.702629
[300]	valid_0's f1_score: 0.702845
Early stopping, best iteration is:
[255]	valid_0's f1_score: 0.704051
the 2 training start ...
for train user:10712
for valid user:2678
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.699019
[200]	valid_0's f1_score: 0.704663
[300]	valid_0's f1_score: 0.704811
[400]	valid_0's f1_score: 0.706089
[500]	valid_0's f1_score: 0.706306
Early stopping, best iteration is:
[483]	valid_0's f1_score: 0.707735
the 3 training start ...
for train user:10712
for valid user:2678
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.675163
[200]	valid_0's f1_score: 0.682577
[300]	valid_0's f1_score: 0.686234

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:11<00:00, 50.34s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 111.06it/s]


data shape:
train--(522081, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10655
for valid user:2664
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.664703
[200]	valid_0's f1_score: 0.674487
Early stopping, best iteration is:
[193]	valid_0's f1_score: 0.674895
the 2 training start ...
for train user:10655
for valid user:2664
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.679267
[200]	valid_0's f1_score: 0.685229
[300]	valid_0's f1_score: 0.68871
[400]	valid_0's f1_score: 0.688506
Early stopping, best iteration is:
[300]	valid_0's f1_score: 0.68871
the 3 training start ...
for train user:10655
for valid user:2664
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.694795
[200]	valid_0's f1_score: 0.705054
[300]	valid_0's f1_score: 0.706088
Early stopping, best iteration is:
[227]	valid_0's f1_score: 0.706429
t

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:50<00:00, 46.19s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 109.74it/s]


data shape:
train--(491574, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10182
for valid user:2546
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.720995
[200]	valid_0's f1_score: 0.728896
[300]	valid_0's f1_score: 0.730541
[400]	valid_0's f1_score: 0.731308
[500]	valid_0's f1_score: 0.733549
[600]	valid_0's f1_score: 0.733242
Early stopping, best iteration is:
[586]	valid_0's f1_score: 0.73388
the 2 training start ...
for train user:10182
for valid user:2546
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.737818
[200]	valid_0's f1_score: 0.745805
[300]	valid_0's f1_score: 0.74658
[400]	valid_0's f1_score: 0.747687
Early stopping, best iteration is:
[343]	valid_0's f1_score: 0.748374
the 3 training start ...
for train user:10182
for valid user:2546
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.709383
[

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:41<00:00, 44.24s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 117.66it/s]


data shape:
train--(469377, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:9926
for valid user:2482
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.692352
[200]	valid_0's f1_score: 0.702674
[300]	valid_0's f1_score: 0.701357
Early stopping, best iteration is:
[261]	valid_0's f1_score: 0.704677
the 2 training start ...
for train user:9926
for valid user:2482
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.667507
[200]	valid_0's f1_score: 0.673502
[300]	valid_0's f1_score: 0.675382
Early stopping, best iteration is:
[259]	valid_0's f1_score: 0.676978
the 3 training start ...
for train user:9926
for valid user:2482
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.677918
[200]	valid_0's f1_score: 0.685718
[300]	valid_0's f1_score: 0.683981
Early stopping, best iteration is:
[209]	valid_0's f1_score: 0.686399
th

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:04<00:00, 48.94s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 100.40it/s]


data shape:
train--(519537, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10723
for valid user:2681
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.668959
[200]	valid_0's f1_score: 0.678021
[300]	valid_0's f1_score: 0.67863
[400]	valid_0's f1_score: 0.683314
[500]	valid_0's f1_score: 0.682827
Early stopping, best iteration is:
[450]	valid_0's f1_score: 0.684825
the 2 training start ...
for train user:10723
for valid user:2681
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.661361
[200]	valid_0's f1_score: 0.673238
[300]	valid_0's f1_score: 0.675114
[400]	valid_0's f1_score: 0.675265
Early stopping, best iteration is:
[348]	valid_0's f1_score: 0.676551
the 3 training start ...
for train user:10723
for valid user:2681
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.669528
[200]	valid_0's f1_score: 0.67903
[

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:07<00:00, 49.53s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 115.61it/s]


data shape:
train--(524658, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10776
for valid user:2694
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.690316
[200]	valid_0's f1_score: 0.699782
[300]	valid_0's f1_score: 0.702887
[400]	valid_0's f1_score: 0.702822
Early stopping, best iteration is:
[329]	valid_0's f1_score: 0.703516
the 2 training start ...
for train user:10776
for valid user:2694
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.696018
[200]	valid_0's f1_score: 0.702971
[300]	valid_0's f1_score: 0.706226
[400]	valid_0's f1_score: 0.707177
[500]	valid_0's f1_score: 0.709412
[600]	valid_0's f1_score: 0.709959
[700]	valid_0's f1_score: 0.710118
Early stopping, best iteration is:
[651]	valid_0's f1_score: 0.710903
the 3 training start ...
for train user:10776
for valid user:2694
Training until validation scores don't improve for 100 rounds

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:05<00:00, 49.18s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 98.10it/s]


data shape:
train--(523518, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10779
for valid user:2695
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.668625
[200]	valid_0's f1_score: 0.677641
Early stopping, best iteration is:
[189]	valid_0's f1_score: 0.679404
the 2 training start ...
for train user:10779
for valid user:2695
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.677362
[200]	valid_0's f1_score: 0.68299
[300]	valid_0's f1_score: 0.684165
[400]	valid_0's f1_score: 0.686775
[500]	valid_0's f1_score: 0.686948
Early stopping, best iteration is:
[425]	valid_0's f1_score: 0.687871
the 3 training start ...
for train user:10779
for valid user:2695
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.648535
[200]	valid_0's f1_score: 0.657148
Early stopping, best iteration is:
[160]	valid_0's f1_score: 0.657771


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:07<00:00, 49.59s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 122.41it/s]


data shape:
train--(528582, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10830
for valid user:2708
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.707753
[200]	valid_0's f1_score: 0.713785
[300]	valid_0's f1_score: 0.713476
[400]	valid_0's f1_score: 0.716552
[500]	valid_0's f1_score: 0.71741
Early stopping, best iteration is:
[487]	valid_0's f1_score: 0.718217
the 2 training start ...
for train user:10830
for valid user:2708
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.710233
[200]	valid_0's f1_score: 0.71895
[300]	valid_0's f1_score: 0.7217
[400]	valid_0's f1_score: 0.721981
[500]	valid_0's f1_score: 0.723497
[600]	valid_0's f1_score: 0.723667
Early stopping, best iteration is:
[518]	valid_0's f1_score: 0.7244
the 3 training start ...
for train user:10830
for valid user:2708
Training until validation scores don't improve for 100 rounds
[100]

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:06<00:00, 49.30s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 122.79it/s]


data shape:
train--(525174, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10778
for valid user:2695
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.700922
[200]	valid_0's f1_score: 0.706743
[300]	valid_0's f1_score: 0.707134
[400]	valid_0's f1_score: 0.709238
[500]	valid_0's f1_score: 0.709646
[600]	valid_0's f1_score: 0.708971
Early stopping, best iteration is:
[523]	valid_0's f1_score: 0.710593
the 2 training start ...
for train user:10778
for valid user:2695
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.706951
[200]	valid_0's f1_score: 0.714053
[300]	valid_0's f1_score: 0.712408
Early stopping, best iteration is:
[209]	valid_0's f1_score: 0.714509
the 3 training start ...
for train user:10778
for valid user:2695
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.686823
[200]	valid_0's f1_score: 0.695424

In [16]:
if __name__ == "__main__":
    for i in range(20,30):
        train_path = './traffic/201907' + str(i) + '.txt'
        test_path = 'test.txt'
        gen_feats(train_path, mode='is_train')
        gen_feats(test_path, mode='is_test')
    
        train = pd.read_csv('is_train_201907' + str(i) + '.txt')
        test = pd.read_csv('is_test.csv')
        train = train.merge(attr, on='link', how='left')
        test = test.merge(attr, on='link', how='left')
    
        use_cols = [i for i in train.columns if i not in ['link', 'label', 'current_slice_id', 'future_slice_id', 'label_pred']]
    
        sub,clf = lgb_train(train, test, use_cols, 'link', 'label', 5, 2020)
        save(clf,'model/'+ '201907' + str(i) + '.txt')
        #sub.to_csv('public_baseline.csv', index=False, encoding='utf8')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:53<00:00, 46.64s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 96.31it/s]


data shape:
train--(495456, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10299
for valid user:2575
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.731537
[200]	valid_0's f1_score: 0.73615
[300]	valid_0's f1_score: 0.737341
Early stopping, best iteration is:
[284]	valid_0's f1_score: 0.738281
the 2 training start ...
for train user:10299
for valid user:2575
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.724967
[200]	valid_0's f1_score: 0.734256
[300]	valid_0's f1_score: 0.734866
[400]	valid_0's f1_score: 0.734893
Early stopping, best iteration is:
[355]	valid_0's f1_score: 0.736457
the 3 training start ...
for train user:10299
for valid user:2575
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.71748
[200]	valid_0's f1_score: 0.726901
[300]	valid_0's f1_score: 0.728446
[400]	valid_0's f1_score: 0.729644
E

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:39<00:00, 43.90s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 123.55it/s]


data shape:
train--(465870, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:9929
for valid user:2483
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.691068
[200]	valid_0's f1_score: 0.69658
[300]	valid_0's f1_score: 0.698974
[400]	valid_0's f1_score: 0.697876
[500]	valid_0's f1_score: 0.696096
Early stopping, best iteration is:
[422]	valid_0's f1_score: 0.699872
the 2 training start ...
for train user:9929
for valid user:2483
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.702426
[200]	valid_0's f1_score: 0.712434
[300]	valid_0's f1_score: 0.714935
[400]	valid_0's f1_score: 0.716255
[500]	valid_0's f1_score: 0.715896
Early stopping, best iteration is:
[425]	valid_0's f1_score: 0.717708
the 3 training start ...
for train user:9930
for valid user:2482
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.656958
[20

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:03<00:00, 48.61s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 105.67it/s]


data shape:
train--(517290, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10686
for valid user:2672
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.705691
[200]	valid_0's f1_score: 0.713774
[300]	valid_0's f1_score: 0.717738
[400]	valid_0's f1_score: 0.719138
[500]	valid_0's f1_score: 0.719964
[600]	valid_0's f1_score: 0.720022
Early stopping, best iteration is:
[581]	valid_0's f1_score: 0.721313
the 2 training start ...
for train user:10686
for valid user:2672
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.727163
[200]	valid_0's f1_score: 0.736053
Early stopping, best iteration is:
[178]	valid_0's f1_score: 0.736349
the 3 training start ...
for train user:10686
for valid user:2672
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.727745
[200]	valid_0's f1_score: 0.737903
[300]	valid_0's f1_score: 0.741784

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:02<00:00, 48.57s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 103.25it/s]


data shape:
train--(516813, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10628
for valid user:2658
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.685405
[200]	valid_0's f1_score: 0.695182
[300]	valid_0's f1_score: 0.696713
[400]	valid_0's f1_score: 0.697947
[500]	valid_0's f1_score: 0.697678
[600]	valid_0's f1_score: 0.699244
[700]	valid_0's f1_score: 0.695481
Early stopping, best iteration is:
[611]	valid_0's f1_score: 0.699761
the 2 training start ...
for train user:10629
for valid user:2657
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.658388
[200]	valid_0's f1_score: 0.6684
[300]	valid_0's f1_score: 0.671169
[400]	valid_0's f1_score: 0.671894
[500]	valid_0's f1_score: 0.671262
Early stopping, best iteration is:
[426]	valid_0's f1_score: 0.673697
the 3 training start ...
for train user:10629
for valid user:2657
Training until validation sc

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:00<00:00, 48.11s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 122.51it/s]


data shape:
train--(512130, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10560
for valid user:2641
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.640926
[200]	valid_0's f1_score: 0.647248
[300]	valid_0's f1_score: 0.646243
Early stopping, best iteration is:
[208]	valid_0's f1_score: 0.648977
the 2 training start ...
for train user:10561
for valid user:2640
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.676762
[200]	valid_0's f1_score: 0.686742
[300]	valid_0's f1_score: 0.687364
Early stopping, best iteration is:
[269]	valid_0's f1_score: 0.689572
the 3 training start ...
for train user:10561
for valid user:2640
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.639784
[200]	valid_0's f1_score: 0.652164
[300]	valid_0's f1_score: 0.654293
[400]	valid_0's f1_score: 0.652216
Early stopping, best iteration is:

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:59<00:00, 47.81s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 112.63it/s]


data shape:
train--(509850, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10669
for valid user:2668
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.705226
[200]	valid_0's f1_score: 0.709742
[300]	valid_0's f1_score: 0.712588
[400]	valid_0's f1_score: 0.71362
Early stopping, best iteration is:
[385]	valid_0's f1_score: 0.714303
the 2 training start ...
for train user:10669
for valid user:2668
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.674057
[200]	valid_0's f1_score: 0.681105
[300]	valid_0's f1_score: 0.680764
Early stopping, best iteration is:
[262]	valid_0's f1_score: 0.682854
the 3 training start ...
for train user:10670
for valid user:2667
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.693392
[200]	valid_0's f1_score: 0.698348
[300]	valid_0's f1_score: 0.697291
Early stopping, best iteration is:


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:04<00:00, 48.91s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 121.93it/s]


data shape:
train--(521751, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10681
for valid user:2671
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.734394
[200]	valid_0's f1_score: 0.741317
[300]	valid_0's f1_score: 0.74229
[400]	valid_0's f1_score: 0.744169
Early stopping, best iteration is:
[376]	valid_0's f1_score: 0.746105
the 2 training start ...
for train user:10681
for valid user:2671
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.710355
[200]	valid_0's f1_score: 0.71359
[300]	valid_0's f1_score: 0.71381
[400]	valid_0's f1_score: 0.71752
[500]	valid_0's f1_score: 0.717205
Early stopping, best iteration is:
[404]	valid_0's f1_score: 0.718335
the 3 training start ...
for train user:10682
for valid user:2670
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.697878
[200]	valid_0's f1_score: 0.705201
[30

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:44<00:00, 44.98s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 102.96it/s]


data shape:
train--(478371, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10023
for valid user:2506
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.702846
[200]	valid_0's f1_score: 0.70587
[300]	valid_0's f1_score: 0.705616
[400]	valid_0's f1_score: 0.709193
[500]	valid_0's f1_score: 0.709127
Early stopping, best iteration is:
[472]	valid_0's f1_score: 0.710287
the 2 training start ...
for train user:10023
for valid user:2506
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.699006
[200]	valid_0's f1_score: 0.707703
Early stopping, best iteration is:
[169]	valid_0's f1_score: 0.710306
the 3 training start ...
for train user:10023
for valid user:2506
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.698188
[200]	valid_0's f1_score: 0.702348
[300]	valid_0's f1_score: 0.700859
Early stopping, best iteration is:


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:33<00:00, 42.68s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 121.86it/s]


data shape:
train--(454839, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:9760
for valid user:2441
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.676713
[200]	valid_0's f1_score: 0.68814
Early stopping, best iteration is:
[197]	valid_0's f1_score: 0.688444
the 2 training start ...
for train user:9761
for valid user:2440
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.70335
[200]	valid_0's f1_score: 0.708522
[300]	valid_0's f1_score: 0.713131
[400]	valid_0's f1_score: 0.711335
Early stopping, best iteration is:
[338]	valid_0's f1_score: 0.714814
the 3 training start ...
for train user:9761
for valid user:2440
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.685995
[200]	valid_0's f1_score: 0.696125
[300]	valid_0's f1_score: 0.696457
Early stopping, best iteration is:
[256]	valid_0's f1_score: 0.698845
the 

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:00<00:00, 48.08s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 123.68it/s]


data shape:
train--(513840, 82)
test--(1, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10621
for valid user:2656
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.691168
[200]	valid_0's f1_score: 0.69958
[300]	valid_0's f1_score: 0.703795
[400]	valid_0's f1_score: 0.705536
[500]	valid_0's f1_score: 0.705527
Early stopping, best iteration is:
[452]	valid_0's f1_score: 0.707036
the 2 training start ...
for train user:10621
for valid user:2656
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.682042
[200]	valid_0's f1_score: 0.693115
Early stopping, best iteration is:
[195]	valid_0's f1_score: 0.693757
the 3 training start ...
for train user:10622
for valid user:2655
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.688381
[200]	valid_0's f1_score: 0.691462
[300]	valid_0's f1_score: 0.693675
[400]	valid_0's f1_score: 0.694423


In [15]:
test = pd.read_csv('is_test.csv')
test = test.merge(attr, on='link', how='left')

In [10]:
clfs = []
for i in range(1,10):
    path = 'model/2019070'+ str(i) +'.txt'
    #print(path)
    clf = load(path)
    clfs.append(clf)

In [12]:
for i in range(10,31):
    path = 'model/201907'+ str(i) +'.txt'
    #print(path)
    clf = load(path)
    clfs.append(clf)

In [160]:
train = pd.read_csv('is_train_201907' + str(11) + '.txt')
train = train.merge(attr, on='link', how='left')

In [161]:
train__ = train
id_col = 'link'

In [162]:
train_user_id = train__[id_col].unique()

In [163]:
train__

Unnamed: 0,link,label,current_slice_id,future_slice_id,time_diff,curr_state,curr_speed,curr_eta,curr_cnt,current_speed_min,...,his_7_cnt_std,his_7_state,length,direction,path_class,speed_class,LaneNum,speed_limit,level,width
0,43094,1,322,327,5,1,26.6,28.8,42,26.000000,...,1.166190,0,24,2,5,5,3,19.444444,5,130
1,193284,1,622,628,6,1,30.3,13.6,3,5.699219,...,0.489898,2,60,1,5,6,1,11.111111,5,55
2,278876,0,466,489,23,2,19.1,22.3,6,17.796875,...,1.600000,1,35,2,5,7,1,4.166667,5,30
3,193344,1,583,610,27,2,17.8,16.9,20,6.500000,...,1.095445,2,10,2,5,5,2,19.444444,5,55
4,655729,0,354,363,9,1,31.6,31.6,11,31.203125,...,4.604346,1,7,1,5,7,1,4.166667,5,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521815,501248,0,356,382,26,1,49.1,25.2,8,29.203125,...,1.019804,1,5,3,5,5,2,19.444444,5,55
521816,310869,0,507,516,9,1,52.9,56.3,9,51.687500,...,1.673320,1,75,2,3,4,2,22.222222,3,55
521817,310426,0,251,279,28,1,28.9,14.4,11,25.593750,...,4.261455,2,70,2,5,5,2,16.666667,5,55
521818,72098,0,643,668,25,1,23.0,27.7,2,23.000000,...,0.000000,1,36,2,5,5,1,16.666667,5,30


In [164]:
train_y = train__.loc[:,'label']

In [165]:
train_x = train__.loc[:, use_train_feats]

In [166]:
test_pred = clfs[0].predict(train_x, num_iteration=clf.best_iteration)

In [167]:
flag = 0
for clf in clfs:
    test_pred = clf.predict(train_x, num_iteration=clf.best_iteration)
    #print(test_pred.shape)
    if(flag==0):
        sum_result = test_pred[np.newaxis,:]
    else:
        sum_result = np.vstack((sum_result,test_pred[np.newaxis,:]))
    flag = flag + 1
    #print(sum_result.shape)
    print(flag)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [214]:
ans_30 = np.average(sum_result,axis=0)

In [207]:
temp1 = np.average(sum_result,axis=0)
temp1

array([[0.86960012, 0.12247236, 0.00792751],
       [0.24333951, 0.56653227, 0.19012821],
       [0.73951064, 0.24519226, 0.0152971 ],
       ...,
       [0.53814861, 0.43474972, 0.02710167],
       [0.93042625, 0.06421522, 0.00535853],
       [0.97233802, 0.02328847, 0.00437351]])

In [208]:
temp1[:,0] *= 0.8
temp1[:,1] *= 0.8
temp1[:,2] *= 1.6

In [209]:
label1 = np.argmax(temp1,axis=1)+1

In [210]:
label1 = label1-1

In [211]:
print(classification_report(np.array(train_y).reshape(-1), label1, digits=3))

              precision    recall  f1-score   support

           0      0.899     0.949     0.924    414270
           1      0.642     0.444     0.525     82578
           2      0.501     0.547     0.523     24972

    accuracy                          0.850    521820
   macro avg      0.681     0.647     0.657    521820
weighted avg      0.839     0.850     0.841    521820



In [212]:
report = f1_score(np.array(train_y).reshape(-1), label1, average=None)
print('Score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)

Score:  0.6035740480351992


In [19]:
train = pd.read_csv('is_train_201907' + str(20) + '.txt')
train = train.merge(attr, on='link', how='left')
use_train_feats = [i for i in train.columns if i not in ['link', 'label', 'current_slice_id', 'future_slice_id', 'label_pred']]

In [216]:
flag = 0
for clf in clfs:
    test_pred = clf.predict(test[use_train_feats], num_iteration=clf.best_iteration)/len(clfs)
    #print(test_pred.shape)
    if(flag==0):
        sum_result = test_pred[np.newaxis,:]
    else:
        sum_result = np.vstack((sum_result,test_pred[np.newaxis,:]))
    flag = flag + 1
    #print(sum_result.shape)
    print(flag)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [217]:
sum_result.shape

(30, 504891, 3)

In [256]:
ahhhhh = np.average(sum_result,axis=0)

In [257]:
ahhhhh*=30

In [258]:
ahhhhh

array([[0.96949326, 0.02289998, 0.00760676],
       [0.89833867, 0.09460017, 0.00706116],
       [0.99431566, 0.00388015, 0.00180418],
       ...,
       [0.38981282, 0.50587981, 0.10430737],
       [0.98531589, 0.00965437, 0.00502974],
       [0.98366032, 0.01403462, 0.00230506]])

In [259]:
ahhhhh[:,0] *= 0.8
ahhhhh[:,1] *= 0.8
ahhhhh[:,2] *= 1.2

In [260]:
ahhhhh

array([[0.77559461, 0.01831998, 0.00912811],
       [0.71867093, 0.07568014, 0.0084734 ],
       [0.79545253, 0.00310412, 0.00216502],
       ...,
       [0.31185025, 0.40470385, 0.12516885],
       [0.78825271, 0.00772349, 0.00603569],
       [0.78692826, 0.01122769, 0.00276607]])

In [261]:
label = np.argmax(ahhhhh,axis=1)+1

In [262]:
label

array([1, 1, 1, ..., 2, 1, 1], dtype=int64)

In [264]:
test[['label']]=label

In [265]:
sub = test[['link', 'current_slice_id', 'future_slice_id','label']]

In [266]:
sub.to_csv('Own1026.csv', index=False, encoding='utf8')