In [1]:
import numpy as np
import pandas as pd
import datetime

from sklearn import preprocessing

In [2]:
def weekdayOneHotEncoder(series):
    """One-hot encode a series of weekday numbers.

    A fresh ``OneHotEncoder`` is fitted on the seven values 0..6 on every
    call, so the set of categories does not depend on which weekdays happen
    to be present in ``series``.

    Parameters
    ----------
    series : object exposing ``.values`` (e.g. a pandas Series)
        Weekday numbers, one per row.

    Returns
    -------
    Dense array produced by ``enc.transform(...).toarray()``, one row per
    element of ``series``.
    """
    encoder = preprocessing.OneHotEncoder()
    encoder.fit([[day] for day in range(7)])
    # The encoder expects a 2-D sample matrix: wrap every value in its own row.
    samples = [[value] for value in series.values]
    return encoder.transform(samples).toarray()

In [3]:
def directionOneHotEncoder(series):
    """One-hot encode a series of direction codes.

    A fresh ``OneHotEncoder`` is fitted on the nine values 0..8 on every
    call, so the set of categories does not depend on which codes happen to
    be present in ``series``.

    Parameters
    ----------
    series : object exposing ``.values`` (e.g. a pandas Series)
        Direction codes, one per row.

    Returns
    -------
    Dense array produced by ``enc.transform(...).toarray()``, one row per
    element of ``series``.
    """
    encoder = preprocessing.OneHotEncoder()
    encoder.fit([[code] for code in range(9)])
    # The encoder expects a 2-D sample matrix: wrap every value in its own row.
    samples = [[value] for value in series.values]
    return encoder.transform(samples).toarray()

In [4]:
# Train / test date windows and the directory that receives the exported
# feature files.  The values below are the active ("online") split.
#
# Earlier offline-validation splits, kept for reference (substitute these
# values to reproduce them):
#
#   train '2016-7-19' .. '2016-9-05', test '2016-9-06' .. '2016-9-12'
#       export_path = '../../data/offline/validation_9_5/'
#   train '2016-7-19' .. '2016-9-19', test '2016-9-20' .. '2016-9-26'
#       export_path = '../../data/offline/validation_9_20/'
export_path = '../../data/online/'

train_start_date, train_end_date = (
    pd.to_datetime('2016-7-19'),
    pd.to_datetime('2016-10-17'),
)
test_start_date, test_end_date = (
    pd.to_datetime('2016-10-18'),
    pd.to_datetime('2016-10-24'),
)

# Day *differences* between the window endpoints (end minus start), i.e. one
# less than the number of calendar days each window covers.
train_day_count = (train_end_date - train_start_date).days
test_day_count = (test_end_date - test_start_date).days
all_day_count = (test_end_date - train_start_date).days

In [5]:
# One weekday number (as returned by ``Timestamp.weekday()``) for every
# calendar day from train_start_date through test_end_date, in date order.
ranges = pd.date_range(start=train_start_date, end=test_end_date)
weekdays = [day.weekday() for day in ranges]

In [6]:
# Load the four preprocessed weather tables, using the first CSV column as
# the index.
# NOTE(review): the index is not parsed as dates here, while the route time
# frames are read with parse_dates=True -- confirm the two align as intended
# when they are merged on the index.
weather1_df, weather2_df, weather3_df, weather4_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../../data/preprocess/preprocess_weather1.csv',
        '../../data/preprocess/preprocess_weather2.csv',
        '../../data/preprocess/preprocess_weather3.csv',
        '../../data/preprocess/preprocess_weather4.csv',
    )
)

In [7]:
# Route table: one row per route. Its 'link_seq' column is read further down
# and split on ',' to obtain the ids of the links that make up each route.
routes = pd.read_csv('../../data/original/routes (table 4).csv')

In [8]:
def _sum_link_times(link_dir, link_ids):
    """Read ``<link_dir><link_id>.csv`` for every link and add them together.

    The first link's frame is the accumulator; each following frame is added
    to it in place with ``+=``.  Returns ``None`` when ``link_ids`` is empty.
    """
    total_df = None
    for link_id in link_ids:
        link_df = pd.read_csv(link_dir + str(link_id) + '.csv', index_col=0, parse_dates=True)
        if total_df is None:
            total_df = link_df
        else:
            # Keep the in-place add: it is what the accumulation relies on.
            total_df += link_df
    return total_df


# Per-route totals, stored in the same order as the rows of `routes`.
morninig_routes_times = []
afternoon_routes_times = []

for link_seq in routes['link_seq']:
    links = link_seq.split(',')

    morning_time_df = _sum_link_times('../../data/process/link_residual_time/morning/', links)
    afternoon_time_df = _sum_link_times('../../data/process/link_residual_time/afternoon/', links)

    morninig_routes_times.append(morning_time_df)
    afternoon_routes_times.append(afternoon_time_df)

In [9]:
# Add two summary columns to every route frame (morning frames first, then
# afternoon): the row-wise standard deviation and mean of columns '0'..'5'.
history_columns = ['0', '1', '2', '3', '4', '5']

for route_times in morninig_routes_times + afternoon_routes_times:
    history = route_times[history_columns]
    route_times['std'] = history.std(axis=1)
    route_times['mean'] = history.mean(axis=1)

In [10]:
# Build and export the morning feature files, one set of CSVs per route.
for i in routes.index:
    # Summed link times for this route, limited to the train + test window.
    time_df = morninig_routes_times[i]
    time_df = time_df.loc[train_start_date:test_end_date]

    # Left-join two weather tables on the index.  The second merge is what
    # yields the `_x` / `_y` suffixed weather columns selected below.
    temp_df = pd.merge(time_df, weather1_df, how='left', left_index=True, right_index=True)
    time_df = pd.merge(temp_df, weather2_df, how='left', left_index=True, right_index=True)
    # `weekdays` holds one entry per calendar day from train_start_date to
    # test_end_date, so this assignment needs exactly one row per day.
    time_df['weekday'] = weekdays

    train_time_df = time_df.loc[train_start_date:train_end_date]

    # Numeric inputs: columns '0'..'5', their std/mean, and both weather sets.
    X_time_df = time_df[['0', '1', '2', '3', '4', '5', 'std', 'mean',
                         'pressure_x', 'sea_pressure_x', 'wind_speed_x', 'temperature_x',
                         'rel_humidity_x', 'precipitation_x', 'pressure_y', 'sea_pressure_y',
                         'wind_speed_y', 'temperature_y', 'rel_humidity_y', 'precipitation_y',
                         ]]

    # Targets: columns '6'..'11' on the training days only.
    y_train_time_df = train_time_df[['6', '7', '8', '9', '10', '11']]

    # Standardise the numeric inputs over the whole window, then append the
    # one-hot blocks for weekday and the two wind-direction columns.
    tempa = preprocessing.scale(X_time_df)
    tempb = weekdayOneHotEncoder(time_df['weekday'])
    tempc = directionOneHotEncoder(time_df['wind_direction_x'])
    tempd = directionOneHotEncoder(time_df['wind_direction_y'])
    # The resulting frame has a default 0..N-1 index, i.e. row k is the k-th
    # row of the window starting at train_start_date.
    temp_df = pd.DataFrame(np.concatenate((tempa, tempb, tempc, tempd), axis=1))

    # Training rows: offsets 0 .. train_day_count (both ends included).
    X_train_df = temp_df.loc[np.arange(train_day_count + 1)]
    X_train_df.to_csv(export_path + 'feature/feature3/morning/X_train_route' + str(i) + '.csv')
    y_train_df = pd.DataFrame(y_train_time_df.values)
    y_train_df.to_csv(export_path + 'feature/feature3/morning/y_train_route' + str(i) + '.csv')

    # Test rows start at test_start_date.  Starting at `train_day_count`
    # instead would re-use the last training day and drop the final test day.
    test_first_row = (test_start_date - train_start_date).days
    X_test_df = temp_df.loc[np.arange(test_first_row, test_first_row + test_day_count + 1)]
    X_test_df.reset_index(drop=True).to_csv(export_path + 'feature/feature3/morning/X_test_route' + str(i) + '.csv')


In [11]:
# Build and export the afternoon feature files, one set of CSVs per route.
for i in routes.index:
    # Summed link times for this route, limited to the train + test window.
    time_df = afternoon_routes_times[i]
    time_df = time_df.loc[train_start_date:test_end_date]

    # Left-join two weather tables on the index.  The second merge is what
    # yields the `_x` / `_y` suffixed weather columns selected below.
    temp_df = pd.merge(time_df, weather1_df, how='left', left_index=True, right_index=True)
    time_df = pd.merge(temp_df, weather2_df, how='left', left_index=True, right_index=True)
    # `weekdays` holds one entry per calendar day from train_start_date to
    # test_end_date, so this assignment needs exactly one row per day.
    time_df['weekday'] = weekdays

    train_time_df = time_df.loc[train_start_date:train_end_date]

    # Numeric inputs: columns '0'..'5', their std/mean, and both weather sets.
    X_time_df = time_df[['0', '1', '2', '3', '4', '5', 'std', 'mean',
                         'pressure_x', 'sea_pressure_x', 'wind_speed_x', 'temperature_x',
                         'rel_humidity_x', 'precipitation_x', 'pressure_y', 'sea_pressure_y',
                         'wind_speed_y', 'temperature_y', 'rel_humidity_y', 'precipitation_y',
                         ]]

    # Targets: columns '6'..'11' on the training days only.
    y_train_time_df = train_time_df[['6', '7', '8', '9', '10', '11']]

    # Standardise the numeric inputs over the whole window, then append the
    # one-hot blocks for weekday and the two wind-direction columns.
    tempa = preprocessing.scale(X_time_df)
    tempb = weekdayOneHotEncoder(time_df['weekday'])
    tempc = directionOneHotEncoder(time_df['wind_direction_x'])
    tempd = directionOneHotEncoder(time_df['wind_direction_y'])
    # The resulting frame has a default 0..N-1 index, i.e. row k is the k-th
    # row of the window starting at train_start_date.
    temp_df = pd.DataFrame(np.concatenate((tempa, tempb, tempc, tempd), axis=1))

    # Training rows: offsets 0 .. train_day_count (both ends included).
    X_train_df = temp_df.loc[np.arange(train_day_count + 1)]
    X_train_df.to_csv(export_path + 'feature/feature3/afternoon/X_train_route' + str(i) + '.csv')
    y_train_df = pd.DataFrame(y_train_time_df.values)
    y_train_df.to_csv(export_path + 'feature/feature3/afternoon/y_train_route' + str(i) + '.csv')

    # Test rows start at test_start_date.  Starting at `train_day_count`
    # instead would re-use the last training day and drop the final test day.
    test_first_row = (test_start_date - train_start_date).days
    X_test_df = temp_df.loc[np.arange(test_first_row, test_first_row + test_day_count + 1)]
    X_test_df.reset_index(drop=True).to_csv(export_path + 'feature/feature3/afternoon/X_test_route' + str(i) + '.csv')