In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score
import lightgbm as lgb
from collections import Counter
import warnings
import pickle
warnings.filterwarnings("ignore")

In [2]:
def get_base_info(x):
    return [i.split(':')[-1] for i in x.split(' ')]

def get_speed(x):
    return np.array([i.split(',')[0] for i in x], dtype='float16')

def get_eta(x):
    return np.array([i.split(',')[1] for i in x], dtype='float16')

def get_state(x):
    return [int(i.split(',')[2]) for i in x]

def get_cnt(x):
    return np.array([i.split(',')[3] for i in x], dtype='int16')

In [9]:
def gen_feats(path, mode='is_train'):
    df = pd.read_csv(path, sep=';', header=None)
    df['link'] = df[0].apply(lambda x: x.split(' ')[0])
    if mode == 'is_train':
        df['label'] = df[0].apply(lambda x: int(x.split(' ')[1]))
        df['label'] = df['label'].apply(lambda x: 3 if x > 3 else x)
        df['label'] -= 1
        df['current_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[2]))
        df['future_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[3]))
    else:
        df['label'] = -1
        df['current_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[2]))
        df['future_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[3]))

    df['time_diff'] = df['future_slice_id'] - df['current_slice_id']

    df['curr_state'] = df[1].apply(lambda x: x.split(' ')[-1].split(':')[-1])
    df['curr_speed'] = df['curr_state'].apply(lambda x: x.split(',')[0])
    df['curr_eta'] = df['curr_state'].apply(lambda x: x.split(',')[1])
    df['curr_cnt'] = df['curr_state'].apply(lambda x: x.split(',')[3])
    df['curr_state'] = df['curr_state'].apply(lambda x: x.split(',')[2])
    del df[0]

    for i in tqdm(range(1, 6)):
        df['his_info'] = df[i].apply(get_base_info)
        if i == 1:
            flg = 'current'
        else:
            flg = f'his_{(6 - i) * 7}'
        df['his_speed'] = df['his_info'].apply(get_speed)
        df[f'{flg}_speed_min'] = df['his_speed'].apply(lambda x: x.min())
        df[f'{flg}_speed_max'] = df['his_speed'].apply(lambda x: x.max())
        df[f'{flg}_speed_mean'] = df['his_speed'].apply(lambda x: x.mean())
        df[f'{flg}_speed_std'] = df['his_speed'].apply(lambda x: x.std())

        df['his_eta'] = df['his_info'].apply(get_eta)
        df[f'{flg}_eta_min'] = df['his_eta'].apply(lambda x: x.min())
        df[f'{flg}_eta_max'] = df['his_eta'].apply(lambda x: x.max())
        df[f'{flg}_eta_mean'] = df['his_eta'].apply(lambda x: x.mean())
        df[f'{flg}_eta_std'] = df['his_eta'].apply(lambda x: x.std())

        df['his_cnt'] = df['his_info'].apply(get_cnt)
        df[f'{flg}_cnt_min'] = df['his_cnt'].apply(lambda x: x.min())
        df[f'{flg}_cnt_max'] = df['his_cnt'].apply(lambda x: x.max())
        df[f'{flg}_cnt_mean'] = df['his_cnt'].apply(lambda x: x.mean())
        df[f'{flg}_cnt_std'] = df['his_cnt'].apply(lambda x: x.std())

        df['his_state'] = df['his_info'].apply(get_state)
        df[f'{flg}_state'] = df['his_state'].apply(lambda x: Counter(x).most_common()[0][0])
        df.drop([i, 'his_info', 'his_speed', 'his_eta', 'his_cnt', 'his_state'], axis=1, inplace=True)
    if mode == 'is_train':
        df.to_csv(f"{mode}_{path.split('/')[-1]}", index=False)
    else:
        df.to_csv(f"is_test.csv", index=False)

In [4]:
def f1_score_eval(preds, valid_df):
    labels = valid_df.get_label()
    preds = np.argmax(preds.reshape(3, -1), axis=0)
    scores = f1_score(y_true=labels, y_pred=preds, average=None)
    scores = scores[0]*0.2+scores[1]*0.2+scores[2]*0.8
    return 'f1_score', scores, True

In [5]:
def save(model, filename):
    with open(filename, "wb") as f:
        pickle.dump(model, f)

In [6]:
def load(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)

In [7]:
def lgb_train(train_: pd.DataFrame, test_: pd.DataFrame, use_train_feats: list, id_col: str, label: str,
              n_splits: int, split_rs: int, is_shuffle=True, use_cart=False, cate_cols=None) -> pd.DataFrame:
    if not cate_cols:
        cate_cols = []
    print('data shape:\ntrain--{}\ntest--{}'.format(train_.shape, test_.shape))
    print('Use {} features ...'.format(len(use_train_feats)))
    print('Use lightgbm to train ...')
    n_class = train_[label].nunique()
    train_[f'{label}_pred'] = 0
    test_pred = np.zeros((test_.shape[0], n_class))
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = use_train_feats

    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=split_rs)
    train_user_id = train_[id_col].unique()

    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'None',
        'num_leaves': 31,
        'num_class': n_class,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print('the {} training start ...'.format(n_fold))
        train_x, train_y = train_.loc[train_[id_col].isin(train_user_id[train_idx]), use_train_feats], train_.loc[
            train_[id_col].isin(train_user_id[train_idx]), label]
        valid_x, valid_y = train_.loc[train_[id_col].isin(train_user_id[valid_idx]), use_train_feats], train_.loc[
            train_[id_col].isin(train_user_id[valid_idx]), label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')

        if use_cart:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
        else:
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

        #clf = lgb.train(
        #    params=params,
        #    train_set=dtrain,
        #    num_boost_round=5000,
        #    valid_sets=[dvalid],
        #    early_stopping_rounds=100,
        #    verbose_eval=100,
        #    feval=f1_score_eval
        #)
        clf = load('model/20190730.txt')
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance(importance_type='gain')
        train_.loc[train_[id_col].isin(train_user_id[valid_idx]), f'{label}_pred'] = np.argmax(
            clf.predict(valid_x, num_iteration=clf.best_iteration), axis=1)
        test_pred += clf.predict(test_[use_train_feats], num_iteration=clf.best_iteration) / folds.n_splits

    report = f1_score(train_[label], train_[f'{label}_pred'], average=None)
    print(classification_report(train_[label], train_[f'{label}_pred'], digits=4))
    print('Score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    test_[f'{label}_pred'] = np.argmax(test_pred, axis=1)
    test_[label] = np.argmax(test_pred, axis=1)+1
    five_folds = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[five_folds].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    print(fold_importance_df[['Feature', 'avg_imp']].head(20))
    return test_[[id_col, 'current_slice_id', 'future_slice_id', label]],clf

In [9]:
sub.to_csv('public_baseline1.csv', index=False, encoding='utf8')

NameError: name 'sub' is not defined

In [10]:
sub,clf = lgb_train(train, test, use_cols, 'link', 'label', 5, 2020)
#save(clf,'model/20190730.txt')

NameError: name 'train' is not defined

In [19]:
if __name__ == "__main__":
    # train_path = './traffic/20190730.txt'
    test_path = 'test.txt'
    #gen_feats(train_path, mode='is_train')
    #gen_feats(test_path, mode='is_test')
    #attr = pd.read_csv('attr.txt', sep='\t',
                       names=['link', 'length', 'direction', 'path_class', 'speed_class', 'LaneNum', 'speed_limit',
                              'level', 'width'], header=None)

    #train = pd.read_csv('is_train_20190730.txt')
    test = pd.read_csv('is_test.csv')
    train = train.merge(attr, on='link', how='left')
    test = test.merge(attr, on='link', how='left')

    use_cols = [i for i in train.columns if i not in ['link', 'label', 'current_slice_id', 'future_slice_id', 'label_pred']]

    sub,clf = lgb_train(train, test, use_cols, 'link', 'label', 5, 2020)
    save(clf,'model/20190730.txt')

    sub.to_csv('public_baseline.csv', index=False, encoding='utf8')

100%|██████████| 5/5 [04:53<00:00, 58.71s/it]
100%|██████████| 5/5 [04:59<00:00, 59.96s/it]


data shape:
train--(510414, 82)
test--(504891, 82)
Use 78 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:10543
for valid user:2636
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.640209
[200]	valid_0's f1_score: 0.651391
Early stopping, best iteration is:
[192]	valid_0's f1_score: 0.65304
the 2 training start ...
for train user:10543
for valid user:2636
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.647961
[200]	valid_0's f1_score: 0.663607
[300]	valid_0's f1_score: 0.6673
[400]	valid_0's f1_score: 0.66846
Early stopping, best iteration is:
[331]	valid_0's f1_score: 0.669531
the 3 training start ...
for train user:10543
for valid user:2636
Training until validation scores don't improve for 100 rounds
[100]	valid_0's f1_score: 0.643075
[200]	valid_0's f1_score: 0.655626
[300]	valid_0's f1_score: 0.656286
Early stopping, best iteration is:
[257]	valid_0's f1_score: 0.65835

In [4]:
train = pd.read_csv('is_train/is_train_20190717.txt')

In [9]:
train

Unnamed: 0,link,label,current_slice_id,future_slice_id,time_diff,now_speed,now_eta,now_cnt,now_state,current_speed_min,...,his_7_eta_max,his_7_eta_mean,his_7_eta_std,his_7_cnt_min,his_7_cnt_max,his_7_cnt_mean,his_7_cnt_std,his_7_state_zhong,his_7_state_max,his_7_state_min
0,383172,0,267,284,17,38.1,41.8,7,1,31.406250,...,41.000000,39.281250,1.390625,3,7,5.2,1.469694,1,1,1
1,209892,0,204,222,18,35.3,40.1,2,1,30.906250,...,33.593750,31.750000,2.636719,1,3,2.0,0.632456,1,1,1
2,641047,0,229,236,7,26.0,30.5,1,1,26.000000,...,28.500000,27.343750,0.706543,3,6,4.8,1.166190,1,1,1
3,682831,0,307,314,7,38.2,21.1,18,1,37.312500,...,20.296875,17.421875,1.552734,10,13,11.6,1.019804,1,2,1
4,376640,0,235,248,13,41.3,26.0,1,1,39.500000,...,25.203125,21.156250,3.425781,2,5,3.4,1.019804,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523513,674887,0,294,318,24,25.3,11.1,33,1,20.203125,...,24.000000,21.234375,1.689453,16,22,18.2,2.400000,1,1,1
523514,569697,0,333,335,2,29.2,24.6,5,1,28.796875,...,33.906250,33.593750,0.637207,2,5,4.4,1.200000,1,1,1
523515,514590,0,613,643,30,13.3,12.9,6,2,13.296875,...,13.898438,11.000000,2.054688,7,10,8.4,1.200000,2,2,2
523516,68894,2,389,396,7,7.0,10.7,13,3,7.000000,...,20.296875,16.859375,2.886719,8,14,11.6,2.244994,2,2,1


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 523518 entries, 0 to 523517
Data columns (total 84 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   link                 523518 non-null  int64  
 1   label                523518 non-null  int64  
 2   current_slice_id     523518 non-null  int64  
 3   future_slice_id      523518 non-null  int64  
 4   time_diff            523518 non-null  int64  
 5   now_speed            523518 non-null  float64
 6   now_eta              523518 non-null  float64
 7   now_cnt              523518 non-null  int64  
 8   now_state            523518 non-null  int64  
 9   current_speed_min    523518 non-null  float64
 10  current_speed_max    523518 non-null  float64
 11  current_speed_mean   523518 non-null  float64
 12  current_speed_std    523518 non-null  float64
 13  current_eta_min      523518 non-null  float64
 14  current_eta_max      523518 non-null  float64
 15  current_eta_mean 

In [18]:
attr = pd.read_table('attr.txt')

In [38]:
attr[0:1]

Unnamed: 0,linkid,length,direction,pathclass,speedclass,LaneNum,speedlimit,level,width
0,0,19,1,5,7,1,4.166667,5,30


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train.loc[:,'current_slice_id':'his_7_state'],train.loc[:,'label':'label'],test_size=0.3 , random_state=1234)

In [7]:
from pandasgui import show

In [8]:
show(train)

<pandasgui.gui.PandasGui at 0x22d0254b700>

In [6]:
train

Unnamed: 0,link,label,current_slice_id,future_slice_id,time_diff,curr_state,curr_speed,curr_eta,curr_cnt,current_speed_min,...,his_7_speed_std,his_7_eta_min,his_7_eta_max,his_7_eta_mean,his_7_eta_std,his_7_cnt_min,his_7_cnt_max,his_7_cnt_mean,his_7_cnt_std,his_7_state
0,499320,0,378,380,2,1,38.8,30.5,4,35.31250,...,6.070312,11.500000,53.40625,36.656250,20.531250,1,1,1.0,0.000000,1
1,34589,0,599,609,10,2,17.5,26.2,18,17.50000,...,2.263672,26.000000,27.00000,26.453125,0.373779,16,20,18.6,1.496663,1
2,335549,0,205,211,6,1,53.2,43.2,24,43.81250,...,7.222656,25.203125,31.50000,28.875000,2.318359,20,25,22.4,1.854724,1
3,257097,1,689,716,27,2,17.6,18.7,4,17.59375,...,4.304688,10.203125,26.00000,16.359375,5.269531,1,5,3.4,1.356466,2
4,229237,0,355,385,30,1,30.2,26.9,10,26.09375,...,2.341797,28.093750,32.40625,30.187500,1.445312,5,14,10.4,3.136877,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522076,246099,0,394,403,9,1,32.1,29.1,6,29.40625,...,4.015625,12.703125,18.40625,15.820312,1.857422,6,8,7.0,0.632456,2
522077,469010,0,586,595,9,2,16.6,17.9,7,16.59375,...,4.792969,9.796875,18.90625,14.296875,3.361328,3,7,5.6,1.496663,2
522078,243544,0,665,678,13,0,30.0,12.7,3,30.00000,...,2.074219,3.800781,57.68750,14.578125,21.562500,1,1,1.0,0.000000,1
522079,172289,0,196,217,21,0,0.0,0.0,0,0.00000,...,1.698242,23.203125,25.40625,24.703125,0.906250,1,2,1.6,0.489898,1


In [7]:
n = 0
op = 0
error_his = 0
error_all = 0
all_same = 0
error_3 = 0
error_2 = 0
for i in tqdm(range(522081)):
    if (train['label'][i]+1 == 3) & (train['his_7_state'][i] != train['label'][i]+1):
        error_3 += 1
    if (train['label'][i]+1 == 2) & (train['his_7_state'][i] != train['label'][i]+1):
        error_2 += 1
    if train['his_7_state'][i] == train['label'][i]+1:
        op += 1
    if  (train['his_7_state'][i] == train['his_14_state'][i]) & (train['his_21_state'][i] == train['his_7_state'][i]): 
        n += 1
        if (train['his_7_state'][i] != train['label'][i]+1):
            error_his += 1
        if train['his_7_state'][i] == train['curr_state'][i]:
            all_same += 1
            if (train['his_7_state'][i] != train['label'][i]+1): 
                error_all += 1
print(op, n, error_his, all_same, error_all, error_3, error_2)

100%|████████████████████████████████████████████████████████████████████████| 522081/522081 [01:58<00:00, 4416.52it/s]

411291 353550 31554 311555 17801 17973 45321





In [4]:
type(test)

pandas.core.frame.DataFrame

In [14]:
ans = pd.read_csv('2020_10_26_23.csv')

In [15]:
ans

Unnamed: 0,link,current_slice_id,future_slice_id,label
0,106984,616,640,1
1,33252,327,343,1
2,528890,579,598,1
3,321454,140,169,1
4,513463,204,206,1
...,...,...,...,...
504886,323321,74,95,1
504887,310389,97,110,1
504888,12536,256,282,2
504889,359102,451,459,1


In [15]:
test['his_7_state'][504888]

2

In [30]:
n = 0
error_his = 0
error_all = 0
for i in tqdm(range(504891)):
    if  (test['his_7_state'][i] == test['his_14_state'][i]) & (test['his_21_state'][i] == test['his_7_state'][i]): 
        n += 1
        if (test['his_7_state'][i] != 0) & (test['his_7_state'][i] != ans['label'][i]):
            error_his += 1
        if test['his_7_state'][i] == test['curr_state'][i]:
            if (test['his_7_state'][i] != 0) & (test['his_7_state'][i] != ans['label'][i]): 
                error_all += 1
print(n, error_his, error_all)

100%|████████████████████████████████████████████████████████████████████████| 504891/504891 [00:57<00:00, 8745.49it/s]

356124 5896 622





In [34]:
n = 0
num_1 = 0
num_2 = 0
num_3 = 0
for i in tqdm(range(504891)):
    if  (test['his_7_state'][i] == test['his_14_state'][i]) & (test['his_21_state'][i] == test['his_7_state'][i]): 
        n += 1
        if (test['his_7_state'][i] == 1) :
            num_1+=1
        if (test['his_7_state'][i] == 2) :
            num_2+=1
        if (test['his_7_state'][i] == 3) :
            num_3+=1
print(n, num_1, num_2, num_3)

100%|███████████████████████████████████████████████████████████████████████| 504891/504891 [00:40<00:00, 12607.13it/s]

356124 327915 22317 2223





In [32]:
n = 0
error_his = 0
error_all = 0
for i in tqdm(range(504891)):
    if  (test['his_7_state'][i] == test['his_14_state'][i]) & (test['his_21_state'][i] == test['his_7_state'][i]): 
        n += 1
        if (test['his_7_state'][i] != 0) & (test['his_7_state'][i] != ans['label'][i]):
            ans['label'][i] = test['his_7_state'][i]
        if test['his_7_state'][i] == test['curr_state'][i]:
            if (test['his_7_state'][i] != 0) & (test['his_7_state'][i] != ans['label'][i]): 
                error_all += 1
print(n, error_his, error_all)

100%|████████████████████████████████████████████████████████████████████████| 504891/504891 [00:58<00:00, 8592.59it/s]

356124 0 0





In [10]:
train = test[['link','current_slice_id','future_slice_id','his_7_state']]

In [11]:
train['label'] = train['his_7_state']


In [12]:
del train['his_7_state']

In [13]:
train

Unnamed: 0,link,current_slice_id,future_slice_id,label
0,106984,616,640,1
1,33252,327,343,1
2,528890,579,598,1
3,321454,140,169,1
4,513463,204,206,1
...,...,...,...,...
504886,323321,74,95,1
504887,310389,97,110,1
504888,12536,256,282,2
504889,359102,451,459,1


In [14]:
n = 0
for i in tqdm(range(504891)):
    if  (train['label'][i] == 0): 
        n += 1
n

100%|███████████████████████████████████████████████████████████████████████| 504891/504891 [00:06<00:00, 75060.98it/s]


23148

In [33]:
ans.to_csv('2020_10_28.csv', index=False, encoding='utf8')

In [None]:
df = pd.read_csv(path, sep=';', header=None)