# 1. Load Dependencies

In [1]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torch.nn as nn
import torch
import joblib
import os

from google.colab import drive
drive.mount('/content/drive')

!pip install scikit-tensor-py3
from sktensor import dtensor

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Run on the GPU when CUDA is usable in this runtime, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Load Saved Models and Embeddings

In [3]:
# Load Min-max scaler models (one saved scaler per scaled feature).
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted location.

scaler_dic = dict()
features = ['calories', 'distance']

# Named `scaler_dir` (not `dir`) so the built-in dir() stays usable for the
# rest of the kernel session.
scaler_dir = '/content/drive/My Drive/Colab Notebooks/DATA5703/scaler_model/'

for feature in features:
    # Files follow the pattern scaler_<feature>_2.m inside scaler_dir.
    path = os.path.join(scaler_dir, 'scaler_'+feature+'_2.m')
    scaler_dic[feature] = joblib.load(path)

# Load onehot encoder for categorical features

# This is a file path, so it gets its own name instead of reusing the
# directory variable above.
onehot_path = '/content/drive/My Drive/Colab Notebooks/DATA5703/OneHotEncoder.m'

OneHot_enc = joblib.load(onehot_path)

# Load user embedding and route embedding
# (later cells filter these objects by `userId` / `Route_id` and read their
# `userEmbed` / `routeEmbed` columns).

user_dir = '/content/drive/My Drive/Colab Notebooks/DATA5703/Data/embedding/userEmbed_tensorD_13.m'
route_dir = '/content/drive/My Drive/Colab Notebooks/DATA5703/Data/embedding/routeEmbed_tensorD_13.m'

user_embed_df = joblib.load(user_dir)
route_embed_df = joblib.load(route_dir)

In [4]:
# Define model

class DisReg_MLP_2Layer(nn.Module):
    """MLP regressor with one hidden layer.

    Pipeline: Linear -> ReLU -> Dropout -> Linear -> Sigmoid, producing a
    single value per input row. The final sigmoid bounds that value to (0, 1).
    """

    def __init__(self, input_dim, hidden_dim_1, p):
        """Build the layers.

        Args:
            input_dim: Number of input features per sample.
            hidden_dim_1: Width of the hidden layer.
            p: Dropout probability applied after the hidden activation.
        """
        super().__init__()

        # Attribute names are also the state_dict key prefixes, so they must
        # stay as they are for previously saved checkpoints to load.
        self.fc1 = nn.Linear(input_dim, hidden_dim_1)
        self.fc2 = nn.Linear(hidden_dim_1, 1)

        self.act_1 = nn.ReLU()
        self.act_2 = nn.Sigmoid()

        self.drop_1 = nn.Dropout(p)

    def forward(self, data):
        """Return the sigmoid-activated prediction for ``data``."""
        hidden = self.drop_1(self.act_1(self.fc1(data)))
        return self.act_2(self.fc2(hidden))


# Define function to load model

def load_model(filename, model,
               base_dir="/content/drive/My Drive/Colab Notebooks/DATA5703/"):
    """Restore the saved best weights into ``model`` and return it.

    Args:
        filename: Checkpoint file name, resolved relative to ``base_dir``.
        model: ``nn.Module`` whose layers match the checkpoint; it is updated
            in place.
        base_dir: Directory containing the checkpoint. Defaults to the
            project folder on the mounted Drive, which is where the original
            hard-coded path pointed.

    Returns:
        The same ``model`` object, after loading
        ``checkpoint['best_model_state_dict']`` into it.

    Raises:
        KeyError: If the checkpoint has no ``'best_model_state_dict'`` entry.
    """
    checkpoint_path = os.path.join(base_dir, filename)

    # Remap stored tensors onto the device the model already lives on, so a
    # checkpoint written on a GPU machine still loads on a CPU-only runtime.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        target_device = torch.device('cpu')

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(checkpoint['best_model_state_dict'])

    return model


# Load Model
# Checkpoint name plus the hyper-parameters used to rebuild the network; the
# layer sizes have to match the saved state dict for the restore to succeed.

LOAD_MODEL_NAME = 'DisReg_MLP_2L_13'
INPUT_DIM = 34      # width of the feature vector fed to the model
HIDDEN_DIM = 64     # hidden layer width
DROP_OUT = 0.2      # dropout probability

# Build the network on the selected device, then restore its saved weights.
model = load_model(
    LOAD_MODEL_NAME,
    DisReg_MLP_2Layer(INPUT_DIM, HIDDEN_DIM, DROP_OUT).to(device),
)

# 3. Load Test Data

In [5]:
# Columns needed downstream: workout/user identifiers, the categorical and
# numeric inputs, and the route information used for the lookup.
cols = [
    'id', 'userId', 'gender', 'sport',
    'calories', 'Route_id', 'distance_adjusted_sum',
]
file_path_test = (
    '/content/drive/My Drive/Colab Notebooks/DATA5703/Data/Dataset/'
    '[1109]TestData_adjusted.csv'
)

# Only the first 500 rows are read, so workouts further down the file are
# not available for lookup.
test_df = pd.read_csv(file_path_test, nrows=500, usecols=cols)

# 4. Take User Input and Process Data

In [6]:
# Take User Inputs
# Edit the values below to request a prediction for a different workout.

# Identifiers used to look up the route and the user embedding.
WORKOUT_ID = 153749422
USER_ID = 13577416

# Categorical inputs. data_process recognises 'male' / 'female' / 'unknown'
# for gender and 'run' / 'bike' / 'mountain bike' for sport.
GENDER = 'male'
SPORT = 'bike'

# Calories value passed to the calories scaler.
CALORIES = 500

In [7]:
# Select the rows of the requested workout once and reuse them for both
# lookups below. A missing id is reported explicitly instead of surfacing as
# an "index 0 is out of bounds" error (same exception type, clearer message).
workout_rows = test_df.loc[test_df.id == WORKOUT_ID]
if len(workout_rows) == 0:
    raise IndexError(
        'WORKOUT_ID {} was not found in the loaded test data'.format(WORKOUT_ID))

# find Route_id based on workout id
Route_id = workout_rows['Route_id'].to_numpy()[0]
# find route total distance based on workout id
distance_adjusted_sum = workout_rows['distance_adjusted_sum'].to_numpy()[0]

# find userEmbedding based on userId
user_rows = user_embed_df[user_embed_df.userId == USER_ID]
if len(user_rows) == 0:
    raise IndexError(
        'USER_ID {} has no entry in the user embedding table'.format(USER_ID))
user_embed = np.array(user_rows.userEmbed.values[0])

# find routeEmbedding based on routeId
route_rows = route_embed_df[route_embed_df.Route_id == Route_id]
if len(route_rows) == 0:
    raise IndexError(
        'Route_id {} has no entry in the route embedding table'.format(Route_id))
route_embed = np.array(route_rows.routeEmbed.values[0])

In [8]:
def data_process(gender, sport, calories, Route_id, distance_adjusted_sum, user_embed, route_embed):
    """Assemble the model input vector for a single workout.

    Args:
        gender: 'male', 'female' or 'unknown'; any other value is coded as 0.
        sport: 'run', 'bike' or 'mountain bike'; any other value is coded as 0.
        calories: Calories value, scaled with ``scaler_dic['calories']``.
        Route_id: Accepted as part of the signature; not used in the body.
        distance_adjusted_sum: Route total distance, scaled with
            ``scaler_dic['distance']``.
        user_embed: User embedding vector.
        route_embed: Route embedding vector.

    Returns:
        A numpy array laid out as: scaled calories, scaled distance, user
        embedding, route embedding, one-hot encoding of (gender, sport).
    """

    def _code_for(value, labels):
        # Position of the first label equal to `value`; 0 when none matches.
        return next(
            (code for code, label in enumerate(labels) if value == label), 0)

    # label encode gender and sport
    genderId = _code_for(gender, ('male', 'female', 'unknown'))
    sportId = _code_for(sport, ('run', 'bike', 'mountain bike'))

    # scale calories and distance; each scaler receives a (1, 1) column and
    # the first (only) row of its output is kept
    calories_column = np.array(calories).reshape(-1, 1)
    distance_column = np.array(distance_adjusted_sum).reshape(-1, 1)
    calories_scaled = scaler_dic['calories'].transform(calories_column)[0]
    total_distance_scaled = scaler_dic['distance'].transform(distance_column)[0]

    # one hot encode genderId and sportId as a single (1, 2) row
    category_row = np.array([[genderId, sportId]])
    gender_sport_onehot = OneHot_enc.transform(category_row).toarray()[0]

    # concatenate input features into numpy array
    parts = (
        calories_scaled,
        total_distance_scaled,
        user_embed,
        route_embed,
        gender_sport_onehot,
    )
    return np.hstack(parts)


# Build the feature vector for the workout configured in the inputs cell.
data_input = data_process(
    gender=GENDER,
    sport=SPORT,
    calories=CALORIES,
    Route_id=Route_id,
    distance_adjusted_sum=distance_adjusted_sum,
    user_embed=user_embed,
    route_embed=route_embed,
)

# 5. Inference

In [9]:
# Evaluation mode disables dropout; no_grad skips autograd bookkeeping for
# this single forward pass.
model.eval()
with torch.no_grad():
    input_tensor = torch.Tensor(data_input).to(device)
    pred = model(input_tensor).item()

# convert predicted distance to km by undoing the distance scaler on a
# (1, 1) array holding the prediction
pred_km = scaler_dic['distance'].inverse_transform(np.array([[pred]]))[0][0]

In [10]:
# Report the prediction next to the route's total distance, one per line,
# each formatted to 4 significant digits.
print(
    'Predicted distance: {:.4} km'.format(pred_km),
    'Workout route total distance: {:.4} km'.format(distance_adjusted_sum),
    sep='\n',
)

Predicted distance: 14.87 km
Workout route total distance: 15.06 km
