In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score,accuracy_score
import lightgbm as lgb
from collections import Counter
import warnings
import gc

warnings.filterwarnings("ignore")

In [14]:
def get_base_info(x):
    return [i.split(':')[-1] for i in x.split(' ')]

def get_speed(x):
    return np.array([i.split(',')[0] for i in x], dtype='float16')

def get_eta(x):
    return np.array([i.split(',')[1] for i in x], dtype='float16')

def get_state(x):
    return np.array([(i.split(',')[2]) for i in x], dtype='int16')

def get_cnt(x):
    return np.array([i.split(',')[3] for i in x], dtype='int16')

In [15]:
def gen_feat(path):
    df = pd.read_csv(path, sep=';', header=None)
    df['link'] = df[0].apply(lambda x: x.split(' ')[0])
    df['label'] = df[0].apply(lambda x: int(x.split(' ')[1]))
    df['label'] = df['label'].apply(lambda x: 3 if x > 3 else x)
    df['label'] -= 1
    df['current_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[2]))
    df['future_slice_id'] = df[0].apply(lambda x: int(x.split(' ')[3]))
    df['time_diff'] = df['future_slice_id'] - df['current_slice_id']
    del df[0]
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 1 if x <= 200 else x)
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 2 if (200< x <= 230) else x)
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 3 if (230< x <= 340) else x)
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 4 if (340< x <= 470) else x)
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 5 if (470< x <= 520) else x)
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 6 if (520< x <= 588) else x)
    #df['current_slice_id'] = df['current_slice_id'].apply(lambda x: 7 if (588< x <= 720) else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 1 if x <= 200 else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 2 if (200< x <= 230) else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 3 if (230< x <= 340) else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 4 if (340< x <= 470) else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 5 if (470< x <= 520) else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 6 if (520< x <= 588) else x)
    #df['future_slice_id'] = df['future_slice_id'].apply(lambda x: 7 if (588< x <= 720) else x)
    for i in tqdm(range(1, 6)):
        df['his_info'] = df[i].apply(get_base_info)
        if i == 1:
            flg = 'current'
        else:
            flg = f'his_{(6 - i) * 7}'
        df['his_speed'] = df['his_info'].apply(get_speed)
        df[f'{flg}_speed_min'] = df['his_speed'].apply(lambda x: x.min())
        df[f'{flg}_speed_max'] = df['his_speed'].apply(lambda x: x.max())
        df[f'{flg}_speed_mean'] = df['his_speed'].apply(lambda x: x.mean())
        df[f'{flg}_speed_std'] = df['his_speed'].apply(lambda x: x.std())

        df['his_eta'] = df['his_info'].apply(get_eta)
        df[f'{flg}_eta_min'] = df['his_eta'].apply(lambda x: x.min())
        df[f'{flg}_eta_max'] = df['his_eta'].apply(lambda x: x.max())
        df[f'{flg}_eta_mean'] = df['his_eta'].apply(lambda x: x.mean())
        df[f'{flg}_eta_std'] = df['his_eta'].apply(lambda x: x.std())

        df['his_cnt'] = df['his_info'].apply(get_cnt)
        df[f'{flg}_cnt_min'] = df['his_cnt'].apply(lambda x: x.min())
        df[f'{flg}_cnt_max'] = df['his_cnt'].apply(lambda x: x.max())
        df[f'{flg}_cnt_mean'] = df['his_cnt'].apply(lambda x: x.mean())
        df[f'{flg}_cnt_std'] = df['his_cnt'].apply(lambda x: x.std())
        
        df['his_state'] = df['his_info'].apply(get_state).apply(lambda x:np.where(x<=0,1,x))
        df['his_state'] = df['his_state'].apply(lambda x:np.where(x>=4,3,x))
        df[f'{flg}__state_min'] = df['his_state'].apply(lambda x: x.min())
        df[f'{flg}__state_max'] = df['his_state'].apply(lambda x: x.max())
        df[f'{flg}_comstate'] = df['his_state'].apply(lambda x: Counter(x).most_common()[0][0])
        df.drop([i, 'his_info', 'his_speed', 'his_eta', 'his_cnt', 'his_state'], axis=1, inplace=True)
        #df.drop([i, 'his_info', 'his_speed', 'his_eta', 'his_cnt'], axis=1, inplace=True)
        attr = pd.read_csv('attr.txt', sep='\t',
                       names=['link', 'length', 'direction', 'path_class', 'speed_class', 'LaneNum', 'speed_limit',
                              'level', 'width'], header=None)
        df['link'] = df['link'].apply(int)
    return(df.merge(attr, on='link', how='left'))

In [16]:
ahhh = gen_feat('traffic/20190702.txt')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [05:05<00:00, 61.07s/it]


In [17]:
ahhh.to_csv('is_tran_data/is_tran_2019070'+str(2)+'.txt', index=False)

In [9]:
for i in tqdm(range(1,10)):
    path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
    extract = pd.read_csv(path)
    extract.drop(['0','1','2','3','4','5'], axis=1, inplace=True)
    save_path = is_tran_data/is_tran_2019070'+str(i)+'.txt
    extract.to_csv(save_path, index=False)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [04:38<00:00, 30.96s/it]


In [4]:
    path = 'is_tran_data/is_tran_2019070'+str(1)+'.txt'
    extract = pd.read_csv(path)
    #extract.drop(['0','1','2','3','4','5'], axis=1, inplace=True)

In [6]:
 extract = extract[extract.link.isin(['353495'])]


Unnamed: 0,link,label,current_slice_id,future_slice_id,time_diff,current_speed_min,current_speed_max,current_speed_mean,current_speed_std,current_eta_min,...,his_7__state_max,his_7_comstate,length,direction,path_class,speed_class,LaneNum,speed_limit,level,width
0,353495,0,3,3,9,20.0,31.59375,24.984375,4.753906,21.90625,...,1,1,49,2,4,5,2,16.666667,4,55
10250,353495,0,3,3,5,32.8125,35.8125,33.90625,1.165039,33.40625,...,1,1,49,2,4,5,2,16.666667,4,55
10854,353495,0,7,7,13,31.90625,38.59375,33.71875,2.519531,35.40625,...,1,1,49,2,4,5,2,16.666667,4,55
14070,353495,0,4,4,30,27.0,34.8125,30.84375,2.974609,31.296875,...,1,1,49,2,4,5,2,16.666667,4,55
23896,353495,0,4,4,14,30.296875,32.59375,31.359375,0.924805,36.3125,...,1,1,49,2,4,5,2,16.666667,4,55
45160,353495,0,1,1,10,34.3125,34.3125,34.3125,0.0,35.59375,...,1,1,49,2,4,5,2,16.666667,4,55
45699,353495,0,7,7,6,25.59375,31.0,28.3125,2.070312,29.59375,...,1,1,49,2,4,5,2,16.666667,4,55
48923,353495,0,2,3,24,27.296875,32.8125,29.640625,1.847656,29.90625,...,1,1,49,2,4,5,2,16.666667,4,55
57571,353495,0,4,4,20,27.0,34.59375,31.609375,2.876953,31.296875,...,1,1,49,2,4,5,2,16.666667,4,55
89746,353495,0,3,3,18,30.0,33.90625,31.859375,1.640625,36.90625,...,1,1,49,2,4,5,2,16.666667,4,55


In [8]:
extract = extract.append(extract)
extract

Unnamed: 0,link,label,current_slice_id,future_slice_id,time_diff,current_speed_min,current_speed_max,current_speed_mean,current_speed_std,current_eta_min,...,his_7__state_max,his_7_comstate,length,direction,path_class,speed_class,LaneNum,speed_limit,level,width
0,353495,0,3,3,9,20.000000,31.59375,24.984375,4.753906,21.906250,...,1,1,49,2,4,5,2,16.666667,4,55
10250,353495,0,3,3,5,32.812500,35.81250,33.906250,1.165039,33.406250,...,1,1,49,2,4,5,2,16.666667,4,55
10854,353495,0,7,7,13,31.906250,38.59375,33.718750,2.519531,35.406250,...,1,1,49,2,4,5,2,16.666667,4,55
14070,353495,0,4,4,30,27.000000,34.81250,30.843750,2.974609,31.296875,...,1,1,49,2,4,5,2,16.666667,4,55
23896,353495,0,4,4,14,30.296875,32.59375,31.359375,0.924805,36.312500,...,1,1,49,2,4,5,2,16.666667,4,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453953,353495,0,3,3,5,31.296875,35.31250,34.187500,1.475586,39.187500,...,1,1,49,2,4,5,2,16.666667,4,55
458446,353495,0,6,6,15,32.687500,38.68750,36.062500,2.462891,35.812500,...,1,1,49,2,4,5,2,16.666667,4,55
478890,353495,0,7,7,5,29.093750,35.40625,32.218750,2.580078,35.406250,...,1,1,49,2,4,5,2,16.666667,4,55
492028,353495,0,5,5,30,33.093750,34.90625,33.968750,0.814453,40.687500,...,1,1,49,2,4,5,2,16.666667,4,55


In [7]:
for i in tqdm(range(10,31)):
    path = 'is_tran_data/is_tran_201907'+str(i)+'.txt'
    extract = pd.read_csv(path)
    extract.drop(['0','1','2','3','4','5'], axis=1, inplace=True)
    save_path = 'is_tran_data/is_tran_201907'+str(i)+'.txt'
    extract.to_csv(save_path, index=False)

  0%|                                                                                           | 0/21 [00:03<?, ?it/s]


KeyError: "['0' '1' '2' '3' '4' '5'] not found in axis"

In [None]:
path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
extract = pd.read_csv(path)
extract.drop(['0','1','2','3','4','5'], axis=1, inplace=True)
save_path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
extract.to_csv(save_path, index=False)

In [8]:
path = 'is_tran_data/is_tran_2019070'+str(1)+'.txt'
done = pd.read_csv(path)
done = done[done.link.isin(['348288'])]
for i in tqdm(range(2,10)):
    path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
    extract = pd.read_csv(path)
    extract = extract[extract.link.isin(['348288'])]
    done = done.append(extract)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:23<00:00,  2.89s/it]


In [9]:
for i in tqdm(range(10,31)):
    path = 'is_tran_data/is_tran_201907'+str(i)+'.txt'
    extract = pd.read_csv(path)
    extract = extract[extract.link.isin(['348288'])]
    done = done.append(extract)

100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [01:01<00:00,  2.91s/it]


In [12]:
done

Unnamed: 0,link,label,current_slice_id,future_slice_id,time_diff,current_speed_min,current_speed_max,current_speed_mean,current_speed_std,current_eta_min,...,his_7__state_max,his_7_comstate,length,direction,path_class,speed_class,LaneNum,speed_limit,level,width
0,353495,0,3,3,9,20.000000,31.59375,24.984375,4.753906,21.906250,...,1,1,49,2,4,5,2,16.666667,4,55
10250,353495,0,3,3,5,32.812500,35.81250,33.906250,1.165039,33.406250,...,1,1,49,2,4,5,2,16.666667,4,55
10854,353495,0,7,7,13,31.906250,38.59375,33.718750,2.519531,35.406250,...,1,1,49,2,4,5,2,16.666667,4,55
14070,353495,0,4,4,30,27.000000,34.81250,30.843750,2.974609,31.296875,...,1,1,49,2,4,5,2,16.666667,4,55
23896,353495,0,4,4,14,30.296875,32.59375,31.359375,0.924805,36.312500,...,1,1,49,2,4,5,2,16.666667,4,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469305,353495,0,3,3,13,26.203125,32.90625,29.828125,2.794922,33.687500,...,1,1,49,2,4,5,2,16.666667,4,55
471348,353495,0,5,5,19,28.406250,35.31250,33.250000,2.519531,33.406250,...,1,1,49,2,4,5,2,16.666667,4,55
472975,353495,0,3,3,6,27.296875,32.59375,29.578125,1.958984,33.687500,...,1,1,49,2,4,5,2,16.666667,4,55
478516,353495,0,1,1,7,35.687500,38.68750,38.093750,1.200195,34.000000,...,1,1,49,2,4,5,2,16.666667,4,55


In [6]:
save_path = '348288r.txt'
done.to_csv(save_path, index=False)