# Libs

In [46]:
import os
import re
import sys
import glob
import time
import datetime
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from functools import reduce

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("d:\softwares\python36\lib\site-packages")
from easyeda import eda
from geohash import encode

from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, LeakyReLU, Input
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import KLD,categorical_crossentropy
from tensorflow.keras.utils import normalize

# Load Data

In [2]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order. Later
# cells pick files by position (filedir[0], filedir[1]), so sort the paths to
# make those positions stable across machines and runs.
filedir = sorted(glob.glob(pathname='../Data/*.csv'))
filedir

['../Data\\area_passenger_index.csv',
 '../Data\\area_passenger_info.csv',
 '../Data\\grid_strength.csv',
 '../Data\\migration_index.csv',
 '../Data\\shortstay_20200117_20200131.csv',
 '../Data\\shortstay_20200201_20200215.csv']

# Feature Engineering

## area info

In [3]:
# Read the area metadata table (the CSV has no header row) and name its columns.
area_passenger_info = pd.read_csv(filedir[1], header=None)
area_passenger_info.columns = ['areaIdx', 'areaName', 'areaType', 'centerLon', 'centerLat',
                               'gridLon', 'gridLat', 'coverage']
area_passenger_info.info()

# Area-type encoding.
# Transportation facilities: 0-2, tourist attractions: 3, education/training: 4,
# shopping: 5, medical: 6, sports/fitness: 7
areaTypes = area_passenger_info['areaType'].unique()
normalTypes = {'旅游景点':3,'教育培训':4,'购物':5,'医疗':6,'运动健身':7}
type_to_idx = {}
idx = 0
type_pattern = re.compile("(.*);(.*)")
for item in areaTypes:
    # Group 1 is everything before the last ';' in the raw label; it is the
    # category used to pick the code.
    preType = type_pattern.match(item)[1]
    if preType in normalTypes:
        # Fixed code shared by every label of this category.
        type_to_idx[item] = normalTypes[preType]
    elif preType == '交通设施':
        # Each distinct transportation label gets the next running index.
        type_to_idx[item] = idx
        idx += 1
    else:
        print("this type does not exist.")

# Swap the raw labels for their codes and rescale coverage by a constant 4e+4.
area_passenger_info['areaType'] = area_passenger_info['areaType'].replace(type_to_idx)
area_passenger_info['coverage'] = area_passenger_info['coverage'].div(4e+4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 997 entries, 0 to 996
Data columns (total 8 columns):
areaIdx      997 non-null int64
areaName     997 non-null object
areaType     997 non-null object
centerLon    997 non-null float64
centerLat    997 non-null float64
gridLon      997 non-null float64
gridLat      997 non-null float64
coverage     997 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 62.4+ KB


## index-stats embedding

In [4]:
# Density index per area and time stamp; the CSV has no header row.
area_passenger_ind = pd.read_csv(filedir[0], header=None)
area_passenger_ind.columns = ['areaIdx', 'datetime', 'Density']
# Time stamps are stored as YYYYMMDDHH; parse them into real timestamps.
area_passenger_ind = area_passenger_ind.assign(
    datetime=pd.to_datetime(area_passenger_ind['datetime'], format="%Y%m%d%H"))
area_passenger_ind.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717840 entries, 0 to 717839
Data columns (total 3 columns):
areaIdx     717840 non-null int64
datetime    717840 non-null datetime64[ns]
Density     717840 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 16.4 MB


In [5]:
# Hour of day and day of week (Monday == 0), taken through the vectorised .dt
# accessor rather than a Python-level lambda per row.
area_passenger_ind['ToD'] = area_passenger_ind['datetime'].dt.hour
area_passenger_ind['DoW'] = area_passenger_ind['datetime'].dt.weekday

# Per-area profile used as the regression target for the embedding: for every
# (area, hour-of-day) pair, the mean, standard deviation, median and
# peak-to-peak range of Density. The result has one row per area and one
# column per (statistic, hour) combination.
embed_label = area_passenger_ind.pivot_table(index='areaIdx',
                                             columns='ToD',
                                             values='Density',
                                             aggfunc=['mean', 'std', 'median', np.ptp])

In [6]:
def get_embedding(embedding_dim, batch_size, epochs):
    """Learn one dense vector per area by regressing its density statistics.

    A tiny network (embedding lookup -> bias-free ReLU dense layer) is trained
    with MSE to reproduce each row of the notebook-level ``embed_label`` table
    after it has been passed through ``normalize``. The input for row ``k`` is
    simply the integer ``k``, so the embedding matrix has one row per row of
    ``embed_label``.

    Args:
        embedding_dim: Width of the embedding vector learned per area.
        batch_size: Mini-batch size forwarded to ``model.fit``.
        epochs: Number of training epochs forwarded to ``model.fit``.

    Returns:
        Tuple ``(areaEmbedding, hist)``: the first entry of
        ``model.get_weights()`` (the embedding matrix) and the object returned
        by ``model.fit``.
    """
    # Targets: one row per area. Sizes are derived from the table instead of
    # being hard-coded (previously 997 areas and 24 * 4 outputs).
    targets = np.asarray(normalize(embed_label.values))
    num_rows, target_dim = targets.shape

    # build model
    x = Input(shape=(1,))
    o = Embedding(input_dim=num_rows, output_dim=embedding_dim,
                  embeddings_initializer=he_normal(), name='embedding')(x)
    h = Dense(target_dim, use_bias=False,
              kernel_initializer=he_normal(), activation='relu')(o)
    model = Model(inputs=x, outputs=h)
    model.compile(loss='mse', optimizer=Adam(3e-4))

    # train embedding weights
    # The embedding lookup keeps the length-1 sequence axis, so the model emits
    # (batch, 1, target_dim). Give the targets the same rank; otherwise the
    # loss either broadcasts to (batch, batch, target_dim) or fails a shape
    # check, depending on the Keras version.
    hist = model.fit(np.arange(num_rows).reshape(-1, 1), targets[:, np.newaxis, :],
                     batch_size=batch_size, epochs=epochs, shuffle=True, verbose=0)

    # output embedding vector
    areaEmbedding = model.get_weights()[0]

    return areaEmbedding, hist

In [7]:
# Train a 100-wide embedding per area using mini-batches of 8 for 1000 epochs.
embedding_dim = 100
areaEmbedding, trainingLog = get_embedding(embedding_dim, batch_size=8, epochs=1000)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


## Historic index (same area)

In [60]:
def get_hitoric_index(area_passenger_ind, window_size, num_samples, num_areas, num_days):
    """Collect, area by area, every sliding window of consecutive Density values.

    For each area id in ``1..num_areas`` the rows of ``area_passenger_ind``
    with that ``areaIdx`` are taken in their existing order, and
    ``24 * num_days - window_size`` windows of length ``window_size`` are
    written to consecutive rows of the result (window ``i`` holds values
    ``i .. i + window_size - 1``).

    Args:
        area_passenger_ind: DataFrame with at least ``areaIdx`` and ``Density``
            columns. Rows of one area are assumed to already be in time order;
            no sorting is done here.
        window_size: Number of consecutive values per window.
        num_samples: Number of rows to allocate in the result.
        num_areas: Highest area id; ids are taken to be ``1..num_areas``.
        num_days: Number of days covered per area (24 values per day).

    Returns:
        ``numpy.ndarray`` of shape ``(num_samples, window_size)``.

    Raises:
        ValueError: If an area has too few rows to fill all of its windows.
        IndexError: If ``num_samples`` is too small to hold all windows.
    """
    # init
    windows_per_area = max(24 * num_days - window_size, 0)
    historicIndex = np.zeros((num_samples, window_size))
    sample_idx = 0
    sp = time.time()

    # main loop
    for area_idx in range(1, num_areas + 1):
        if area_idx % 200 == 0:
            print("[Area-{:d}] started, duration: {:.1f} sec.".format(area_idx, time.time() - sp))
        if windows_per_area == 0:
            continue

        # Extract the area's series once instead of once per window.
        density = area_passenger_ind.loc[area_passenger_ind.areaIdx == area_idx, 'Density'].values

        # The last window ends at index windows_per_area + window_size - 2.
        # Check explicitly so a short series cannot be silently broadcast.
        required = windows_per_area + window_size - 1
        if len(density) < required:
            raise ValueError(
                "area {:d} has {:d} rows, need at least {:d}".format(
                    area_idx, len(density), required))
        if sample_idx + windows_per_area > num_samples:
            raise IndexError(
                "num_samples={:d} is too small for area {:d}".format(num_samples, area_idx))

        # Fill one lag column at a time: column k of window i is density[i + k].
        block = historicIndex[sample_idx:sample_idx + windows_per_area]
        for lag in range(window_size):
            block[:, lag] = density[lag:lag + windows_per_area]
        sample_idx += windows_per_area

    return historicIndex

In [62]:
# sliding-window configuration
window_size = 6      # consecutive readings per history window
num_areas = 997
num_days = 30
num_samples = num_areas * (24 * num_days - window_size)

# one row per (area, window start), filled area by area
historicIndex = get_hitoric_index(area_passenger_ind, window_size, num_samples,
                                  num_areas, num_days)

[Area-200] started, duration: 1.7 sec.
[Area-400] started, duration: 3.5 sec.
[Area-600] started, duration: 5.2 sec.
[Area-800] started, duration: 7.0 sec.


In [64]:
# Row-wise mean and (population) standard deviation of every history window.
histMean = np.mean(historicIndex, axis=1)
histStd = np.std(historicIndex, axis=1)

In [65]:
# One column per position inside the window, followed by the two summaries.
historicIndexDf = pd.DataFrame(
    historicIndex[:, :window_size].copy(),
    columns=['historic_' + str(col) for col in range(window_size)])
historicIndexDf['histMean'] = histMean
historicIndexDf['histStd'] = histStd

## Concat

In [66]:
# Put the embedding matrix in a frame (one column per dimension) and prepend
# the area ids 1..997 as the merge key.
areaEmbeddingDf = pd.DataFrame(
    areaEmbedding[:, :embedding_dim].copy(),
    columns=["embedding_" + str(col) for col in range(embedding_dim)])
areaEmbeddingDf.insert(0, 'areaIdx', np.arange(1,998))

In [67]:
# Join the static area attributes with the learned embedding on the area id,
# then leave out the areaName column.
areaAttr = (
    pd.merge(area_passenger_info, areaEmbeddingDf, on='areaIdx')
    .drop('areaName', axis=1)
)

In [68]:
# Chain-merge every frame on the area id, then leave out the raw time stamp.
dfs = [area_passenger_ind, areaAttr]
AreaDensity = dfs[0]
for other in dfs[1:]:
    AreaDensity = pd.merge(AreaDensity, other, on='areaIdx')
AreaDensity = AreaDensity.drop("datetime", axis=1)

# Build Dataset

In [80]:
# init
X = np.zeros((num_samples, AreaDensity.shape[1]))
sample_idx = 0
sp = time.time()
# Every area contributes its rows window_size .. 24 * num_days - 1, i.e. the
# row that directly follows each history window.
rows_per_area = 24 * num_days - window_size

# main loop
for area_idx in range(1, num_areas + 1):
    if area_idx % 200 == 0:
        print("[Area-{:d}] started, duration: {:.1f} sec.".format(area_idx, time.time() - sp))
    area_df = AreaDensity[AreaDensity.areaIdx == area_idx]
    # Convert the area's frame to an array once; converting inside a per-row
    # loop rebuilt the whole array for every single sample.
    area_values = area_df.values
    if len(area_values) < 24 * num_days:
        raise ValueError(
            "area {:d} has {:d} rows, expected at least {:d}".format(
                area_idx, len(area_values), 24 * num_days))
    X[sample_idx:sample_idx + rows_per_area] = area_values[window_size:24 * num_days]
    sample_idx += rows_per_area

[Area-200] started, duration: 33.8 sec.
[Area-400] started, duration: 67.8 sec.
[Area-600] started, duration: 102.0 sec.
[Area-800] started, duration: 135.4 sec.


In [88]:
# Label the raw sample matrix with the AreaDensity column names and append the
# history-window features column-wise.
# NOTE: X is rebound from an array to a DataFrame here, so this cell is not
# meant to be re-run without first re-running the cell that builds the array.
X = pd.concat([pd.DataFrame(X, columns=AreaDensity.columns), historicIndexDf], axis=1)

# Target is the density itself; the area id and the target are left out of the
# feature matrix.
Y_data = X['Density']
X_data = X.drop(columns=['areaIdx', 'Density'])

# Model Dev

In [85]:
# evaluation metric (defined once; it is not called in this cell)
def score(y_pred, y_test):
    """Return 1 / (1 + RMSE) between predictions and ground truth."""
    rmse = np.sqrt(np.mean(np.square(y_pred - y_test)))
    return 1 / (1 + rmse)

# NOTE: this rebinds the notebook-level window_size, num_samples and X that
# earlier cells also use.
window_size = 8
num_test_days = 9

predValues = []
sp = time.time()
# Area ids are 1-based everywhere else in this notebook (range(1, num_areas + 1)).
# Iterating range(997) would hit the non-existent id 0 and skip the last area.
for areaId in range(1, num_areas + 1):
    data = area_passenger_ind[area_passenger_ind.areaIdx == areaId]['Density'].values
    if len(data) <= window_size:
        raise ValueError(
            "area {:d} has only {:d} readings, need more than {:d}".format(
                areaId, len(data), window_size))

    # build dataset: each sample is window_size consecutive readings and the
    # target is the reading that immediately follows them. This matches the
    # one-step-ahead rollout below, and every allocated row is filled.
    num_samples = len(data) - window_size
    X = np.zeros((num_samples, window_size))
    y = np.zeros(num_samples)
    for idx in range(num_samples):
        X[idx] = data[idx:idx + window_size]
        y[idx] = data[idx + window_size]

    # train and predict
    reg = RandomForestRegressor(n_estimators=100)
    reg = reg.fit(X, y)

    # Recursive forecast: predict the next reading from the latest window,
    # append it to the series and slide forward. Predictions are stored as
    # plain floats so the history list stays homogeneous.
    data = data.tolist()
    for i in range(num_test_days * 24):
        pred_value = float(reg.predict(np.array(data[-window_size:]).reshape(1, -1))[0])
        predValues.append(abs(pred_value))
        data.append(pred_value)

    if areaId % 100 == 0:
        print("[Area-{:d}] Finished. Duration: {:.1f} sec.".format(areaId, time.time() - sp))

[Area-100] Finished. Duration: 91.9 sec.
[Area-200] Finished. Duration: 181.7 sec.
[Area-300] Finished. Duration: 272.7 sec.
[Area-400] Finished. Duration: 362.7 sec.
[Area-500] Finished. Duration: 452.7 sec.
[Area-600] Finished. Duration: 543.6 sec.
[Area-700] Finished. Duration: 631.6 sec.
[Area-800] Finished. Duration: 725.1 sec.
[Area-900] Finished. Duration: 814.8 sec.


In [86]:
# Load the headerless submission template, overwrite its third column with the
# forecasts (same order as they were generated), and write it back without a
# header row or index column.
submit_table = pd.read_csv("../Data/submit_example/test_submit_example.csv", header=None)
submit_table.iloc[:, 2] = predValues
submit_table.to_csv('../Data/submit_files/submit_rf_window=8_20200310.csv',
                    header=False, index=False)