# Activity Duration Model (ADM)

This is the second part of the Activity Scheduler model. In this analysis, we use the output file `predicted_ATM.csv`, produced by the ATM model, as the input.

## Goal
Our goal is to build a tree-based duration prediction model, conditioned on the activity type we predicted. The method is to treat the subset of samples that falls into each leaf as the definition of an empirical distribution of durations.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the input table for the duration model.
# NOTE(review): the introduction names `predicted_ATM.csv` as the input, but
# this cell loads `df_train_ATM.csv` -- confirm which file is intended.
atm_output_csv = 'df_train_ATM.csv'
df_next = pd.read_csv(atm_output_csv)

In [3]:
def adjust_dep_time(row, period=96):
    """Return the row's departure time, shifted forward if it precedes arrival.

    When ``row['dep_time']`` is strictly smaller than ``row['arr_time']`` the
    departure time is increased by ``period``; otherwise it is returned as is.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            ``'dep_time'`` and ``'arr_time'`` entries.
        period: Amount added to a departure time that is earlier than the
            arrival time. Defaults to 96, the value previously hard-coded
            (presumably the number of 15-minute slots in a day -- confirm).

    Returns:
        The possibly shifted departure time.
    """
    dep_time = row['dep_time']
    if dep_time < row['arr_time']:
        # Departure numerically before arrival: move it one period forward.
        return dep_time + period
    return dep_time

# Shift every departure time that is earlier than its arrival time forward by
# 96 (the same rule adjust_dep_time applies to a single row). This is done
# column-wise so pandas does not have to call a Python function once per row.
dep_time = df_next['dep_time']
df_next['dep_time'] = dep_time.mask(dep_time < df_next['arr_time'], dep_time + 96)
df_next

Unnamed: 0,id,purpose,arr_time,dep_time,gender,education,driver_license,X_transit_pass,employment,X_student,next_purpose,new_next_purpose,predicted_next_purpose
0,10320361,1,12,33,1,6,1,0,1,0,22,others,1
1,10320361,22,34,62,1,6,1,0,1,0,1,1,1
2,10320361,1,63,66,1,6,1,0,1,0,15,others,17
3,10320361,15,67,70,1,6,1,0,1,0,15,others,1
4,10320361,15,71,74,1,6,1,0,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173007,72119444,20,40,47,2,1,0,0,0,0,1,1,20
173008,72119444,1,54,107,2,1,0,0,0,0,,,
173009,72119712,1,12,66,1,6,1,0,1,0,2,2&3,24
173010,72119712,2,66,72,1,6,1,0,1,0,1,1,1


In [4]:
# Activity duration: departure time minus arrival time, element-wise per row.
df_next['duration'] = df_next['dep_time'].sub(df_next['arr_time'])

In [5]:
# Display the frame to inspect the newly added `duration` column.
df_next

Unnamed: 0,id,purpose,arr_time,dep_time,gender,education,driver_license,X_transit_pass,employment,X_student,next_purpose,new_next_purpose,predicted_next_purpose,duration
0,10320361,1,12,33,1,6,1,0,1,0,22,others,1,21
1,10320361,22,34,62,1,6,1,0,1,0,1,1,1,28
2,10320361,1,63,66,1,6,1,0,1,0,15,others,17,3
3,10320361,15,67,70,1,6,1,0,1,0,15,others,1,3
4,10320361,15,71,74,1,6,1,0,1,0,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173007,72119444,20,40,47,2,1,0,0,0,0,1,1,20,7
173008,72119444,1,54,107,2,1,0,0,0,0,,,,53
173009,72119712,1,12,66,1,6,1,0,1,0,2,2&3,24,54
173010,72119712,2,66,72,1,6,1,0,1,0,1,1,1,6


## Empirical tree

In [6]:
# One-hot encode the activity purpose: every distinct `purpose` value becomes
# its own `purpose_<value>` indicator column in `df_model`.
# The dtype is pinned to uint8 so the indicators are 0/1 integers (as in the
# output shown below) regardless of pandas version; pandas >= 2.0 would
# otherwise produce boolean columns.
df_model = pd.get_dummies(df_next, columns=['purpose'], dtype='uint8')
df_model

Unnamed: 0,id,arr_time,dep_time,gender,education,driver_license,X_transit_pass,employment,X_student,next_purpose,...,purpose_17,purpose_18,purpose_19,purpose_20,purpose_21,purpose_22,purpose_23,purpose_24,purpose_25,purpose_26
0,10320361,12,33,1,6,1,0,1,0,22,...,0,0,0,0,0,0,0,0,0,0
1,10320361,34,62,1,6,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,10320361,63,66,1,6,1,0,1,0,15,...,0,0,0,0,0,0,0,0,0,0
3,10320361,67,70,1,6,1,0,1,0,15,...,0,0,0,0,0,0,0,0,0,0
4,10320361,71,74,1,6,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173007,72119444,40,47,2,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
173008,72119444,54,107,2,1,0,0,0,0,,...,0,0,0,0,0,0,0,0,0,0
173009,72119712,12,66,1,6,1,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
173010,72119712,66,72,1,6,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Model inputs: the 26 one-hot purpose indicators (purpose_1 .. purpose_26)
# followed by arr_time, education and driver_license.
# NOTE(review): the original comment described the inputs as "the type of the
# next activity and the end time of the current activity", which does not
# match the columns selected here -- confirm the intended features.
purpose_columns = [f'purpose_{code}' for code in range(1, 27)]
feature_columns = purpose_columns + ['arr_time', 'education', 'driver_license']
features = df_model[feature_columns]

# Regression target: the activity duration.
target = df_model['duration']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
split = train_test_split(features, target, test_size=0.2, random_state=42)
features_train, features_test, target_train, target_test = split

# Fit a single decision tree on the training rows and predict the held-out rows.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(features_train, target_train)
predictions = regressor.predict(features_test)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Model inputs: the 26 one-hot purpose indicators (purpose_1 .. purpose_26)
# followed by arr_time, education and driver_license.
purpose_columns = [f'purpose_{code}' for code in range(1, 27)]
feature_columns = purpose_columns + ['arr_time', 'education', 'driver_license']
features = df_model[feature_columns]

# Regression target: the activity duration.
target = df_model['duration']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
split = train_test_split(features, target, test_size=0.2, random_state=42)
features_train, features_test, target_train, target_test = split

# Random forest regressor with n_estimators=100, max_depth=10,
# min_samples_split=2 and min_samples_leaf=1, chosen to avoid overfitting.
# NOTE(review): min_samples_split=2 and min_samples_leaf=1 appear to be the
# scikit-learn defaults, so max_depth is likely the only setting that actually
# constrains the trees -- confirm against the installed version.
forest_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42,
}
regressor = RandomForestRegressor(**forest_params)
regressor.fit(features_train, target_train)

# Predicted durations for the held-out rows.
predictions = regressor.predict(features_test)

In [9]:
# Compare mean and standard deviation of the predicted durations with those of
# the held-out true durations.
predicted_mean, predicted_std = predictions.mean(), np.std(predictions)
testingset_mean, testingset_std = target_test.mean(), np.std(target_test)

print('Duration')
print('predicted_mean:', predicted_mean, '   predicted_std:', predicted_std)
print('testingset_mean:', testingset_mean, '   testingset_std:', testingset_std)

Duration
predicted_mean: 20.90083129786905    predicted_std: 9.9800829176003
testingset_mean: 20.894980204028553    testingset_std: 16.44411198066082


In [10]:
import dill

# Serialize the fitted regressor to ADM.pkl with dill so it can be reloaded later.
with open('ADM.pkl', mode='wb') as model_file:
    dill.dump(regressor, model_file)

In [11]:
# Show the names and order of the feature columns the regressor was fitted on.
features.columns

Index(['purpose_1', 'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5',
       'purpose_6', 'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10',
       'purpose_11', 'purpose_12', 'purpose_13', 'purpose_14', 'purpose_15',
       'purpose_16', 'purpose_17', 'purpose_18', 'purpose_19', 'purpose_20',
       'purpose_21', 'purpose_22', 'purpose_23', 'purpose_24', 'purpose_25',
       'purpose_26', 'arr_time', 'education', 'driver_license'],
      dtype='object')