# Libs

In [1]:
import os
import re
import sys
import glob
import time
import datetime
import warnings
warnings.filterwarnings("ignore")
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor

# Load Data

In [2]:
# Collect every CSV in the data directory. glob gives no ordering guarantee,
# and later cells pick files by position (e.g. filedir[0]), so sort the paths
# to make those indices deterministic across machines and file systems.
filedir = sorted(glob.glob(pathname='../Data/*.csv'))
filedir

['../Data\\area_passenger_index.csv',
 '../Data\\area_passenger_info.csv',
 '../Data\\grid_strength.csv',
 '../Data\\migration_index.csv',
 '../Data\\shortstay_20200117_20200131.csv',
 '../Data\\shortstay_20200201_20200215.csv']

In [3]:
# Read the first CSV in the listing; it has no header row, so the three
# column names are supplied here at load time.
area_passenger_ind = pd.read_csv(
    filedir[0],
    header=None,
    names=['areaIdx', 'datetime', 'Density'],
)

# Parse the timestamp column using the year-month-day-hour layout.
area_passenger_ind['datetime'] = pd.to_datetime(
    area_passenger_ind['datetime'],
    format="%Y%m%d%H",
)
area_passenger_ind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717840 entries, 0 to 717839
Data columns (total 3 columns):
areaIdx     717840 non-null int64
datetime    717840 non-null datetime64[ns]
Density     717840 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 16.4 MB


# Autoregressive prediction

In [None]:
# Per-area autoregressive forecast.
#
# For every area a random forest is trained to map the previous `window_size`
# density values to the next one. The model is then rolled forward one step at
# a time, feeding each prediction back in as the newest observation, until
# `num_test_days * 24` values have been produced. All forecasts are appended
# to `predValues` in area order.

window_size = 8      # number of lagged values used as features
num_test_days = 9    # forecast horizon in days (24 steps per day)


# evaluation metric
def score(y_pred, y_test):
    """Return 1 / (1 + RMSE) between predictions and targets (higher is better)."""
    rmse = np.sqrt(np.mean(np.square(y_pred - y_test)))
    return 1 / (1 + rmse)


predValues = []
sp = time.time()
for areaId in range(1, 998):
    # NOTE(review): assumes each area's rows are already in chronological
    # order in the source frame — TODO confirm, otherwise sort by 'datetime'.
    data = area_passenger_ind[area_passenger_ind.areaIdx == areaId]['Density'].values

    # build dataset: row i holds data[i : i + window_size] and its target is
    # the value immediately after that window.
    num_samples = len(data) - window_size
    if num_samples < 1:
        raise ValueError(
            "Area {:d} has only {:d} observations; need more than {:d}.".format(
                areaId, len(data), window_size))
    X = np.zeros((num_samples, window_size))
    y = np.zeros(num_samples)  # 1-D target, as expected by fit()

    # Fill every row (the full range, so no all-zero sample is left behind).
    for idx in range(num_samples):
        X[idx] = data[idx:idx + window_size]
        y[idx] = data[idx + window_size]

    # train and predict
    # Fixed seed so a Restart & Run All reproduces the same forecasts.
    reg = RandomForestRegressor(n_estimators=100, random_state=0)
    reg = reg.fit(X, y)

    data = data.tolist()
    for i in range(num_test_days * 24):
        features = np.array(data[-window_size:]).reshape(1, -1)
        # predict() returns a length-1 array; unwrap it so the history list
        # stays a flat list of floats for the next window.
        pred_value = float(reg.predict(features)[0])
        predValues.append(abs(pred_value))
        data.append(pred_value)

    if areaId % 100 == 0:
        print("[Area-{:d}] Finished. Duration: {:.1f} sec.".format(areaId, time.time() - sp))

In [None]:
# Fill the third column of the example submission with the forecasts and
# write the result out without header or index.
submit_table = pd.read_csv("../Data/submit_example/test_submit_example.csv", header=None)

# predValues may contain plain floats or 1-element arrays; flatten to exactly
# one float per submission row before assigning.
pred_column = np.asarray(predValues, dtype=float).ravel()
if len(pred_column) != len(submit_table):
    raise ValueError(
        "Got {:d} predictions for {:d} submission rows.".format(
            len(pred_column), len(submit_table)))
submit_table.iloc[:, 2] = pred_column

submit_path = '../Data/submit_files/submit_rf_window=8_20200310.csv'
# Create the output directory if it is missing so to_csv does not fail.
os.makedirs(os.path.dirname(submit_path), exist_ok=True)
submit_table.to_csv(submit_path, header=False, index=False)