In [1]:
# Given a CSV file with the past 14 days of demand data, predict the next 5 periods for every latitude/longitude and save the predictions to a CSV file
import pandas as pd
import os
import Geohash
import numpy as np
import pickle
import time


# Input CSV and prediction output locations, both anchored at the current working directory
_working_dir = os.getcwd()
DATA_PATH = _working_dir + "/Data/test.csv"
OUTPUT_CSV_PATH = _working_dir + '/Predict/output.csv'


def _load_pickle(path):
    """Returns the object unpickled from the file at path."""
    # NOTE: unpickling can execute arbitrary code; only load artifacts from a trusted source
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load global variables
# Lookup tables mapping a longitude / lattitude value to its index on the demand grid
long_to_idx_dic = _load_pickle("Model/long_to_idx_dic.pkl")
latt_to_idx_dic = _load_pickle("Model/latt_to_idx_dic.pkl")
# Trained models (presumably XGBoost and LightGBM, going by the file names); both are used via .predict()
gbm = _load_pickle("Model/xgb_model.model")
lgbm = _load_pickle("Model/lgb_model.model")

# Sorted coordinate values known to the lookup tables
latt_vals = sorted(latt_to_idx_dic)
long_vals = sorted(long_to_idx_dic)

In [2]:
def parse_raw_data(data_path=DATA_PATH):
    """
    Parses csv file into numpy array

    The csv is read with the columns geohash6, day, timestamp and demand, where
    timestamp is a string of the form "H:M" (e.g. "5:0", "13:45").
    Only the last 9 days present in the file (8 full days plus the last, possibly
    partial, day) are loaded. Periods with no row for a location are left at 0.

    Parameters
    ----------
    data_path : str
        Path of the csv file to parse

    Returns
    -------
    tuple
        (demand_data, (last_day, last_day_period)), where
            demand_data is a numpy array of shape (n_periods, n_latt, n_long)
            last_day is the largest day value found in the csv
            last_day_period is the 15-minute period (0-95) of the last timestamp on last_day

    Raises
    ------
    KeyError
        If a decoded lattitude or longitude is not present in the index dictionaries
    """
    day_periods = 24*4
    df = pd.read_csv(data_path)

    # Add latt and long columns
    # Decode each distinct geohash only once instead of twice per row
    decoded = {code: Geohash.decode(code) for code in df['geohash6'].unique()}
    df['latt'] = df['geohash6'].map(lambda code: decoded[code][0])
    df['long'] = df['geohash6'].map(lambda code: decoded[code][1])

    last_day = max(df['day'].unique())
    # Compare timestamps as (hour, minute) integers so that e.g. "9:45" sorts before "10:0"
    last_day_stamps = df[df['day'] == last_day]['timestamp'].unique()
    last_hour, last_minutes = max(
        tuple(int(part) for part in stamp.split(':')) for stamp in last_day_stamps
    )
    last_day_period = last_hour*4 + last_minutes//15

    # Populate raw demand data onto np array of shape (day_time_periods,latt,long)
    demand_data = []
    for day in range(last_day-8, last_day+1):
        filtered_day = df[df['day'] == day]
        print("Loading Day %d"%(day),end='\r')
        # The last day is only loaded up to (and including) its last recorded period
        hours_gen = range(last_hour+1) if day == last_day else range(24)
        for hour in hours_gen:
            for minute in "0 15 30 45".split():
                if day == last_day and hour == last_hour and int(minute) > last_minutes:
                    break

                timestamp = "%d:%s"%(hour, minute)
                grid = np.zeros((len(latt_vals), len(long_vals)))
                rows = filtered_day[filtered_day['timestamp'] == timestamp]

                # Plain column iteration is much cheaper than DataFrame.iterrows
                for latt, long, demand in zip(rows['latt'], rows['long'], rows['demand']):
                    grid[latt_to_idx_dic[latt], long_to_idx_dic[long]] = demand
                demand_data.append(grid)
    demand_data = np.stack(demand_data, axis=0)

    # Check if enough data was provided
    # NOTE(review): the loop above always emits 8 full days plus the partial last day,
    # so this holds even when the csv covers fewer days (missing days become all-zero grids).
    min_periods = 7*day_periods + 3
    assert demand_data.shape[0] >= min_periods, (
        "Expected at least %d periods of demand data, got %d" % (min_periods, demand_data.shape[0])
    )

    return demand_data, (last_day, last_day_period)
    



In [3]:

def increase_time_period(day,day_period,increase_by=1):
    """
    Advances a (day, day_period) pair by a number of 15-minute periods

    A day holds 96 periods, so any overflow (or underflow, for a negative
    increase_by) of day_period is carried into day.
    Returns the tuple (new_day, new_day_period) with new_day_period in 0-95.
    """
    periods_per_day = 24*4
    days_carried, new_period = divmod(day_period + increase_by, periods_per_day)
    return day + days_carried, new_period


def add_df_entries(df,pred_arr,day,day_period):
    """
    Adds entries from a numpy array into an existing pandas df, and returns the combined df

    The input df is not modified. Rows are emitted in sorted lattitude, then sorted
    longitude order, and keep their own 0-based index (the index is not renumbered
    after combining).

    Parameters
    ----------
    df : Pandas Dataframe
        Pandas dataframe of timestamps, day, geohash6 and demand data
    pred_arr: Numpy Array
        Numpy array of predicted demand with the shape (n_latt,n_long), where
            n_latt refers to the number of lattitude values
            n_long refers to the number of longitude values
    day: int
        Day value of the predicted numpy array
    day_period: int
        Day Period of the predicted numpy array. Valid values are 0-95 inclusive.

    Returns
    -------
    Pandas Dataframe
        Pandas dataframe of combined numpy and df provided
    """
    # Named `timestamp` (not `time`) so the imported `time` module is not shadowed
    timestamp = "%d:%d"%(day_period//4,(day_period%4)*15)

    new_rows = []
    for latt in latt_vals:
        latt_idx = latt_to_idx_dic[latt]
        for long in long_vals:
            long_idx = long_to_idx_dic[long]
            demand = pred_arr[latt_idx,long_idx]
            # Similar to training data, do not include 0 entries
            # (10e-6 equals 1e-5, so anything at or below 1e-5 counts as zero)
            if demand <= 10e-6:
                continue
            new_rows.append({
                'geohash6': Geohash.encode(latt,long,precision=6),
                'day': day,
                'timestamp': timestamp,
                'demand': demand
            })

    # Nothing to add: hand back a copy so the caller still gets a new object
    if not new_rows:
        return df.copy()

    new_rows_df = pd.DataFrame(new_rows)

    # Nothing to extend: return the new rows laid out with df's columns first,
    # which avoids concatenating an empty frame
    if len(df) == 0:
        extra_cols = [col for col in new_rows_df.columns if col not in df.columns]
        return new_rows_df.reindex(columns=list(df.columns) + extra_cols)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    return pd.concat([df, new_rows_df], sort=False)


def extract_features(data,day,day_period):
    """
    Builds the feature planes used to predict a single 15-minute period
    Numpy array should only contain data for the previous 7 days + 3 day_periods
    It is expected that model data is present in the /Model folder before running

    Parameters
    ----------
    data : Numpy array
        3D Numpy array of demand data, with the shape (n_periods, n_latt, n_long), where
            n_periods refers to the number of 15-minute periods
            n_latt refers to the number of lattitude values
            n_long refers to the number of longitude values
    day: int
        Day value of the period to be predicted
    day_period: int
        Day Period to be predicted. Valid values are 0-95 inclusive.

    Returns
    -------
    Numpy Array
        4D Numpy array of features, with the shape (n_features, 1, n_latt, n_long), where
            n_features refers to the number of features for each data sample
            n_latt refers to the number of lattitude values
            n_long refers to the number of longitude values
    """
    # Load normalizers
    with open("Model/all_meta_features.pkl","rb") as meta_file:
        meta = pickle.load(meta_file)

    periods_per_day = 24*4
    n_periods, n_latt, n_long = data.shape[0], data.shape[1], data.shape[2]
    unit_plane = np.ones((1, n_latt, n_long))

    def window(offset):
        # Single-period slice located `offset` periods from the end of data.
        # Adding n_periods to the stop keeps it non-negative, so an offset of -1
        # does not degenerate into the empty slice [-1:0].
        return data[offset:offset + 1 + n_periods, :, :]

    def plane(values):
        # Spread each entry of a 1D array over a full (n_latt, n_long) grid
        return values[:, None, None] * unit_plane

    # Normal Features
    # Current day features: the 7 most recent periods
    features = [window(lag) for lag in range(-7, 0)]
    # Previous day features: periods -3..+4 around the same time 7, 4, 3, 2 and 1 days back
    features += [
        window(days_back*periods_per_day + shift)
        for days_back in (-7, -4, -3, -2, -1)
        for shift in range(-3, 5)
    ]

    # Time period features (sin then cos of the position within the day)
    day_fraction = np.arange(periods_per_day)/periods_per_day
    for wave in (np.sin, np.cos):
        features.append(plane(wave(2*np.pi*day_fraction)[day_period:day_period+1]))

    # Weekday period features (sin then cos of the position within the week)
    week_fraction = np.arange(7)/7
    weekday = day % 7
    for wave in (np.sin, np.cos):
        features.append(plane(wave(2*np.pi*week_fraction)[weekday:weekday+1]))

    # Geospatial features: row / column index scaled by the grid size
    latt_scaled = np.arange(n_latt)[:, None]/n_latt
    long_scaled = np.arange(n_long)[None, :]/n_long
    latt_grid = latt_scaled * np.ones(n_long)[None, :]
    long_grid = np.ones(n_latt)[:, None] * long_scaled
    features.extend([latt_grid[None, :, :], long_grid[None, :, :]])

    # Aggregated demand features by location
    features.append(meta['agg_location'])

    # Aggregated demand features by time period
    # Total demand over the whole grid for every period in data
    totals = data.sum(axis=1, keepdims=True).sum(axis=2, keepdims=True).squeeze()
    # agg_period is presumably one normaliser per period of the day (TODO confirm);
    # it is repeated and shifted so that its entries line up with the periods in totals
    n_totals = totals.shape[0]
    normaliser = np.tile(meta['agg_period'], reps=n_totals//96 + 2)
    first = (day_period - n_totals) % 96
    totals /= normaliser[first:first + n_totals]

    # Current day aggregated demand: the 4 most recent periods
    for lag in range(-4, 0):
        features.append(plane(totals[lag + n_periods:lag + 1 + n_periods]))

    # Past day aggregated demand: periods -1..+1 around the same time 7 days back
    for shift in (-1, 0, 1):
        offset = -7*periods_per_day + shift
        features.append(plane(totals[offset:offset + 1 + n_periods]))

    return np.stack(features, axis=0)



In [4]:
def predict(demand_data,last_day_tup):
    """
    Predicts T+1 to T+5 demand given demand data until period T, and saves prediction as a csv file in OUTPUT_CSV_PATH

    Each predicted period is appended to the demand history, so later predictions
    are made from earlier ones. The folder of OUTPUT_CSV_PATH is created if missing.

    Parameters
    ----------
    demand_data : Numpy array
        3D Numpy array of demand data, with the shape (n_periods, n_latt, n_long), where
            n_periods refers to the number of 15-minute periods
            n_latt refers to the number of lattitude values
            n_long refers to the number of longitude values
    last_day_tup: Tuple
        Tuple of (day, day_period) for the most recent day and day period in the demand data

    Raises
    ------
    ValueError
        If demand_data holds fewer than 7 days + 3 periods of history
    """
    last_day, last_day_period = last_day_tup
    day_periods = 24*4
    history_len = 7*day_periods + 3

    # Fail early with a clear message; a shorter history would otherwise only
    # surface as a shape mismatch while stacking the features
    if demand_data.shape[0] < history_len:
        raise ValueError(
            "demand_data needs at least %d periods, got %d" % (history_len, demand_data.shape[0])
        )

    col_names = "geohash6 day timestamp demand".split()
    output_df = pd.DataFrame(columns = col_names)
    # Grid indices (row indices, column indices) whose prediction is forced to zero
    zero_demand = np.load('Model/zero_demand.npy')

    # Predict next value 5 times for all latt and long
    for _ in range(5):
        # Extract relevant from demand data
        last_day,last_day_period = increase_time_period(last_day,last_day_period)
        history = demand_data[-history_len:,:,:]
        features = extract_features(history,last_day,last_day_period)
        # One row per grid cell, one column per feature
        samples = features.reshape((features.shape[0],-1)).T

        # Predict: average of both models, clipped to [0, 1]
        pred = (gbm.predict(samples).squeeze() + lgbm.predict(samples).squeeze())/2
        pred = pred.clip(0,1)
        pred = pred.reshape((len(latt_vals),len(long_vals)))
        pred[zero_demand[0],zero_demand[1]] = 0. # Set geocode locations which are not in training set to zero
        demand_data = np.concatenate((demand_data,pred[None,:,:]),axis=0)

        # Store output in df
        output_df = add_df_entries(output_df,pred,last_day,last_day_period)

    # Save output to file
    # Create the output folder first so the finished predictions are not lost to a missing directory
    output_dir = os.path.dirname(OUTPUT_CSV_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_df.to_csv(OUTPUT_CSV_PATH,index=False)

In [5]:
def run():
    """Parses the csv at the default data path and writes the predictions for the next 5 periods"""
    # parse_raw_data returns (demand_data, last_day_tup), exactly the arguments predict expects
    predict(*parse_raw_data())

In [7]:
# Run the full parse + predict pipeline only when executed directly (script or notebook cell), not on import
if __name__ == "__main__":
    run()

Loading Day 61