In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import eli5

In [2]:
# load data
df_train = pd.read_hdf('../input/tram.train.h5')
df_test = pd.read_hdf('../input/tram.test.h5') #test set without answers

df = pd.concat([df_train, df_test])

In [3]:
#preparing data for algorithm
df['planned_time'] = pd.to_datetime(df['planned_time'])
df['datetime'] = pd.to_datetime(df['datetime'])
#delay_sec - real delay time in seconds
df['delay_sec'] = (df['datetime'] - df['planned_time'])/np.timedelta64(1,'s')
df.loc[df['delay_sec']<0, 'delay_sec'] = 0

df['hour'] = df['planned_time'].dt.hour
df['weekday'] = df['planned_time'].dt.weekday

df['direction_cat'] = df['direction'].factorize()[0]
df['stop_name_cat'] = df['stop_name'].factorize()[0]


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308152 entries, 0 to 308145
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   id             308152 non-null  int64         
 1   delay          175986 non-null  float64       
 2   datetime       175986 non-null  datetime64[ns]
 3   stop           308152 non-null  int64         
 4   stop_name      308152 non-null  object        
 5   number         308152 non-null  int64         
 6   direction      308152 non-null  object        
 7   planned_time   308152 non-null  datetime64[ns]
 8   vehicle_id     308152 non-null  float64       
 9   trip_id        308152 non-null  int64         
 10  seq_num        305770 non-null  float64       
 11  delay_sec      175986 non-null  float64       
 12  hour           308152 non-null  int64         
 13  weekday        308152 non-null  int64         
 14  direction_cat  308152 non-null  int64         
 15  

In [5]:
#set feats to use in algorithm
feats_num = df.select_dtypes('number').columns
feats_drop = ['id', 'delay', 'delay_sec', 'trip_id']

def get_feat(feats_num, feats_drop, feats_add=[]):
#     print(feats_drop)
    feats = [feat for feat in feats_num if feat not in feats_drop]
    return feats + feats_add

In [11]:
#xgboost parameters
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.2,
            'subsample': 0.92,
            'random_state': 21,
            'n_estimators': 100,
            }

def validate_model(df, feats, params, eli=True):
    df_train = df[~df['delay'].isnull()].copy()
    
    X = df_train[feats].values
    y = df_train['delay'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)

    model = XGBRegressor(**params)
    model.fit(X_train, np.sqrt(y_train))
#     model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_absolute_error(y_test, y_pred**2)
#     score = mean_absolute_error(y_test, y_pred)
    print('MAE: {}'.format(score))

    if eli:
        model.fit(X, y)
        display(eli5.show_weights(model, feature_names=feats))
        
def submit(df, feats, params):
    train = df[~df['delay'].isnull()].copy()
    test = df[df['delay'].isnull()]
    test = test.drop('delay', axis=1)
    
    X = train[feats].values
    y = train['delay'].values
    X_test = test[feats].values
    
    model = XGBRegressor(**params)
#     model.fit(X, y)
#     y_pred = model.predict(X_test)
    
    model.fit(X, np.sqrt(y))
    y_pred = (model.predict(X_test))**2
    
    test['delay'] = y_pred
    test[ ['id', 'delay'] ].to_csv("../output/xgb_1.csv", index=False)

In [7]:
feats = get_feat(feats_num, feats_drop)
validate_model(df, feats, params)

MAE: 48.09329381649878


Weight,Feature
0.2767,seq_num
0.2059,number
0.1708,direction_cat
0.1279,hour
0.0985,stop
0.0634,stop_name_cat
0.0568,weekday
0.0,vehicle_id


vehicle_id seems to be irrelevant so we can drop it

stop and stop_name are duplicates so let's check which one gives better result

In [8]:
feats = get_feat(feats_num, feats_drop + ['stop', 'vehicle_id'])
validate_model(df, feats, params, eli=False)

MAE: 48.227096021566226


In [9]:
feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'])
validate_model(df, feats, params, eli=False)

MAE: 48.243967236968054


I will leave stop 

now I check square root transformation on target variable.

In [12]:
#sqrt added to validate_model function
feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'])
validate_model(df, feats, params)

MAE: 46.144653635631606


Weight,Feature
0.316,seq_num
0.1911,direction_cat
0.1846,number
0.1543,hour
0.0915,stop
0.0624,weekday


It gives better score so i will use it further.

seq_num feature has some missing values . I will try to impute some of them based on existing routes for the same line and overlapping direction 

In [13]:
#seq_num column has some missing values
def replace_seq_num(number, direction_nan, direction):
    mask2 = (df['number']==number) & (df['direction']==direction)
    df_t = df[mask2].groupby('stop_name')['seq_num'].max().reset_index()
    for stop in df_t['stop_name'].unique():
        mask1 = (df['number']==number) & (df['direction']==direction_nan) & (df['stop_name']==stop)
        df.loc[mask1 & (df.seq_num.isnull()) , 'seq_num'] = df_t.loc[df_t['stop_name']==stop, 'seq_num'].values[0]

replace_seq_num(3, 'Dworzec Tow.', 'Krowodrza Górka')
replace_seq_num(4, 'Kombinat', 'Wzgórza K.')
replace_seq_num(21, 'Kombinat', 'Kopiec Wandy')
replace_seq_num(22, 'Kombinat', 'Walcownia')
replace_seq_num(44, 'Kombinat', 'Kopiec Wandy')

df['seq_num'] = df['seq_num'].fillna(-1)

In [14]:
feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'])
validate_model(df, feats, params)

MAE: 46.07282379268727


Weight,Feature
0.3067,seq_num
0.1921,direction_cat
0.1825,number
0.1639,hour
0.0868,stop
0.0679,weekday


In [15]:
# group hours into: rush_hours -3 , normal -2 , no_traffic -1
def day_time(x):
    if x>14 and x<18:
        return 3
    elif x>0 and x<7:
        return 1
    return 2
df['day_time'] = df['hour'].map(day_time)

In [17]:
feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'], ['day_time'])
validate_model(df, feats, params)

MAE: 46.11846231260734


Weight,Feature
0.3252,day_time
0.2,seq_num
0.1505,direction_cat
0.1469,number
0.074,stop
0.0606,hour
0.0427,weekday


day_time's feature importance is high but it is not improve score.

In [18]:
# create length of route (number of stops) for each number and direction
df['route_len'] = df.groupby(['number', 'direction'])['seq_num'].transform('max')

feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'], ['route_len'])
validate_model(df, feats, params)

MAE: 46.00828414627362


Weight,Feature
0.2563,seq_num
0.1759,number
0.1754,direction_cat
0.1482,route_len
0.1161,hour
0.071,stop
0.0571,weekday


In [19]:
# group seq_num
cut_labels = [-1, 1, 2, 3, 4, 5, 6]
cut_bins = [-2, 0, 5, 8, 14, 22, 27, 37]
df['seq_num_cut'] = pd.cut(df['seq_num'], bins=cut_bins, labels=cut_labels)


feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'], ['route_len', 'seq_num_cut'])
validate_model(df, feats, params)

MAE: 46.03210530502848


Weight,Feature
0.2504,seq_num
0.1763,number
0.1564,route_len
0.1519,direction_cat
0.1133,hour
0.0597,stop
0.0557,weekday
0.0364,seq_num_cut


In [20]:
# group stops based on average of delay
def group_stops(grp):
    avg = grp['delay'].mean()
    if avg < 20:
        grp['stop_group'] = 1
    elif avg < 40:
        grp['stop_group'] = 2
    elif avg < 60:
        grp['stop_group'] = 3
    elif avg < 80:
        grp['stop_group'] = 4
    else:
        grp['stop_group'] = 5
    return grp

df = df.groupby('stop_name').apply(group_stops)

feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'], ['route_len', 'stop_group'])
validate_model(df, feats, params)

MAE: 45.83170344654855


Weight,Feature
0.3921,stop_group
0.1851,seq_num
0.0944,direction_cat
0.0932,number
0.0917,route_len
0.0768,hour
0.0338,weekday
0.0329,stop


In [21]:
# add to feats day_time and seq_num_cut
feats = get_feat(feats_num, feats_drop + ['stop_name_cat', 'vehicle_id'], ['route_len', 'stop_group', 'day_time', 'seq_num_cut'])
validate_model(df, feats, params)

MAE: 45.900120419186166


Weight,Feature
0.2655,stop_group
0.209,seq_num_cut
0.1563,day_time
0.1027,seq_num
0.0652,number
0.0639,direction_cat
0.0568,route_len
0.033,hour
0.026,weekday
0.0216,stop
