In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score,accuracy_score
import lightgbm as lgb
from collections import Counter
import warnings
import gc

# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings emitted by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
def get_base_info(x):
    """Split ``x`` on single spaces and keep the text after the last ':' of each token.

    A token that contains no ':' is returned unchanged.
    """
    payloads = []
    for token in x.split(' '):
        # rpartition yields (head, sep, tail); tail is everything after the last ':'.
        payloads.append(token.rpartition(':')[2])
    return payloads

def get_speed(x):
    """Return field 0 of every comma-separated record in ``x`` as a float16 array."""
    first_fields = []
    for record in x:
        first_fields.append(record.split(',')[0])
    return np.array(first_fields, dtype='float16')

def get_eta(x):
    """Return field 1 of every comma-separated record in ``x`` as a float16 array."""
    second_fields = list(map(lambda record: record.split(',')[1], x))
    return np.array(second_fields, dtype='float16')

def get_state(x):
    """Return field 2 of every comma-separated record in ``x`` as an int16 array."""
    third_fields = []
    for record in x:
        fields = record.split(',')
        third_fields.append(fields[2])
    return np.array(third_fields, dtype='int16')

def get_cnt(x):
    """Return field 3 of every comma-separated record in ``x`` as an int16 array."""
    fourth_fields = list(map(lambda record: record.split(',')[3], x))
    return np.array(fourth_fields, dtype='int16')

In [None]:
def _add_array_stats(df, arrays, prefix):
    """Add ``{prefix}_min``, ``_max``, ``_mean`` and ``_std`` columns to ``df`` in place.

    ``arrays`` is a Series (indexed like ``df``) whose values are NumPy arrays.
    """
    for stat in ('min', 'max', 'mean', 'std'):
        # Bind ``stat`` as a default so every lambda keeps its own reducer name.
        df[f'{prefix}_{stat}'] = arrays.apply(lambda arr, stat=stat: getattr(arr, stat)())


def gen_feat(path, attr_path='attr.txt'):
    """Build the per-row feature table for one raw file and join the link attributes.

    The file at ``path`` is read as ';'-separated with no header:

    * column 0 is a space-separated string ``link label current_slice_id
      future_slice_id``;
    * columns 1-5 each hold space-separated tokens; the text after the last ':'
      of a token is a comma-separated record whose fields 0-3 are parsed by
      ``get_speed``, ``get_eta``, ``get_state`` and ``get_cnt``.

    Labels above 3 are capped at 3 and then shifted down by one. For each of
    columns 1-5 the min/max/mean/std of speed, eta and cnt are added, plus the
    min, max and most common state after states are clamped to the range 1..3.
    Column 1 is prefixed ``current``; columns 2-5 are prefixed ``his_28``,
    ``his_21``, ``his_14`` and ``his_7``.

    Parameters
    ----------
    path : str
        Path of the raw ';'-separated file.
    attr_path : str, optional
        Path of the tab-separated link attribute file (default ``'attr.txt'``).

    Returns
    -------
    pandas.DataFrame
        The feature columns left-joined with the attribute table on ``link``.
    """
    df = pd.read_csv(path, sep=';', header=None)

    # Split column 0 once instead of re-splitting it for every derived field.
    head = df[0].apply(lambda x: x.split(' '))
    df['link'] = head.apply(lambda parts: int(parts[0]))
    raw_label = head.apply(lambda parts: int(parts[1]))
    df['label'] = raw_label.apply(lambda x: 3 if x > 3 else x) - 1
    df['current_slice_id'] = head.apply(lambda parts: int(parts[2]))
    df['future_slice_id'] = head.apply(lambda parts: int(parts[3]))
    df['time_diff'] = df['future_slice_id'] - df['current_slice_id']
    del df[0]

    for i in tqdm(range(1, 6)):
        flg = 'current' if i == 1 else f'his_{(6 - i) * 7}'
        info = df[i].apply(get_base_info)

        _add_array_stats(df, info.apply(get_speed), f'{flg}_speed')
        _add_array_stats(df, info.apply(get_eta), f'{flg}_eta')
        _add_array_stats(df, info.apply(get_cnt), f'{flg}_cnt')

        # States <= 0 become 1 and states >= 4 become 3.
        state = info.apply(get_state).apply(lambda x: np.clip(x, 1, 3))
        # The double underscore in the two names below is kept: it is part of
        # the existing output schema.
        df[f'{flg}__state_min'] = state.apply(lambda x: x.min())
        df[f'{flg}__state_max'] = state.apply(lambda x: x.max())
        df[f'{flg}_comstate'] = state.apply(lambda x: Counter(x).most_common()[0][0])

        # The raw column is no longer needed once its features are extracted.
        df.drop(columns=[i], inplace=True)

    # Read the attribute table once, after the loop (it does not depend on ``i``).
    attr = pd.read_csv(
        attr_path,
        sep='\t',
        names=['link', 'length', 'direction', 'path_class', 'speed_class',
               'LaneNum', 'speed_limit', 'level', 'width'],
        header=None,
    )
    return df.merge(attr, on='link', how='left')

In [None]:
# Build the feature table for the raw file traffic/20190702.txt.
ahhh = gen_feat('traffic/20190702.txt')

In [None]:
# Write the feature table to is_tran_data/is_tran_20190702.txt as CSV, omitting the index.
ahhh.to_csv('is_tran_data/is_tran_2019070'+str(2)+'.txt', index=False)

In [None]:
# Remove the leftover raw columns '0'..'5' from the files for days 1-9 and
# write each file back to the same path.
for i in tqdm(range(1, 10)):
    path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
    extract = pd.read_csv(path)
    # errors='ignore' keeps the cell re-runnable: on a second pass (or for a
    # file that never had these columns) there is nothing left to drop.
    extract = extract.drop(columns=['0', '1', '2', '3', '4', '5'], errors='ignore')
    # The original line was missing the quotes around this path (SyntaxError).
    save_path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
    extract.to_csv(save_path, index=False)

In [None]:
    # Load is_tran_data/is_tran_20190701.txt into `extract` for inspection.
    path = 'is_tran_data/is_tran_2019070'+str(1)+'.txt'
    extract = pd.read_csv(path)
    # Disabled: dropping the raw columns '0'..'5' from the loaded frame.
    #extract.drop(['0','1','2','3','4','5'], axis=1, inplace=True)

In [None]:
 # Keep only the rows for link 353495. The column is compared as text because
 # read_csv parses `link` as integers and isin() never matches the string
 # '353495' against an integer value.
 extract = extract[extract['link'].astype(str).isin(['353495'])]


In [None]:
# Stack `extract` on top of itself (every row appears twice, original index
# labels kept). DataFrame.append was removed in pandas 2.0, so use pd.concat.
# Re-running this cell doubles the frame again.
extract = pd.concat([extract, extract])
extract

In [None]:
# Remove the leftover raw columns '0'..'5' from the files for days 10-30 and
# write each file back to the same path.
for i in tqdm(range(10, 31)):
    path = 'is_tran_data/is_tran_201907'+str(i)+'.txt'
    extract = pd.read_csv(path)
    # errors='ignore' keeps the cell re-runnable: the files are overwritten in
    # place, so on a second pass the columns are already gone.
    extract = extract.drop(columns=['0', '1', '2', '3', '4', '5'], errors='ignore')
    save_path = 'is_tran_data/is_tran_201907'+str(i)+'.txt'
    extract.to_csv(save_path, index=False)

In [None]:
# Single-file version of the column-stripping loops: read one file, drop the
# raw columns '0'..'5', and write it back to the same path.
# NOTE(review): `i` is not defined in this cell; it is whatever value an
# earlier cell left behind. If that is the last value of a two-digit day loop,
# the '2019070' prefix produces a name such as is_tran_201907030.txt.
# Confirm the intended day, or delete this scratch cell.
path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
extract = pd.read_csv(path)
# Raises KeyError if the columns were already removed from the file.
extract.drop(['0','1','2','3','4','5'], axis=1, inplace=True)
save_path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
extract.to_csv(save_path, index=False)

In [None]:
# Collect the rows for link 348288 from the files for days 1-9 into `done`.
# Frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it each time.
link_frames = []
for i in tqdm(range(1, 10)):
    path = 'is_tran_data/is_tran_2019070'+str(i)+'.txt'
    extract = pd.read_csv(path)
    # Compare as text: read_csv parses `link` as integers, and isin() never
    # matches the string '348288' against an integer value.
    extract = extract[extract['link'].astype(str).isin(['348288'])]
    link_frames.append(extract)
done = pd.concat(link_frames)

In [None]:
# Extend `done` with the rows for link 348288 from the files for days 10-30.
# Frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0. Re-running this cell adds the same rows again.
link_frames = [done]
for i in tqdm(range(10, 31)):
    path = 'is_tran_data/is_tran_201907'+str(i)+'.txt'
    extract = pd.read_csv(path)
    # Compare as text: read_csv parses `link` as integers, and isin() never
    # matches the string '348288' against an integer value.
    extract = extract[extract['link'].astype(str).isin(['348288'])]
    link_frames.append(extract)
done = pd.concat(link_frames)

In [None]:
# Display the rows accumulated in `done`.
done

In [None]:
# Write the accumulated rows in `done` to 348288r.txt as CSV, omitting the index.
save_path = '348288r.txt'
done.to_csv(save_path, index=False)