In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import eli5

In [2]:
# load data
df_train = pd.read_hdf('../input/tram.train.h5')
df_test = pd.read_hdf('../input/tram.test.h5') #test set without answers

df = pd.concat([df_train, df_test])

In [3]:
#preparing data for algorithm
df['planned_time'] = pd.to_datetime(df['planned_time'])
df['datetime'] = pd.to_datetime(df['datetime'])
#delay_sec - real delay time in seconds
df['delay_sec'] = (df['datetime'] - df['planned_time'])/np.timedelta64(1,'s')
df.loc[df['delay_sec']<0, 'delay_sec'] = 0

df['hour'] = df['planned_time'].dt.hour
df['weekday'] = df['planned_time'].dt.weekday

df['direction_cat'] = df['direction'].factorize()[0]
# df['stop_name_cat'] = df['stop_name'].factorize()[0]

#seq_num column has some missing values
def replace_seq_num(number, direction_nan, direction):
    mask2 = (df['number']==number) & (df['direction']==direction)
    df_t = df[mask2].groupby('stop_name')['seq_num'].max().reset_index()
    for stop in df_t['stop_name'].unique():
        mask1 = (df['number']==number) & (df['direction']==direction_nan) & (df['stop_name']==stop)
        df.loc[mask1 & (df.seq_num.isnull()) , 'seq_num'] = df_t.loc[df_t['stop_name']==stop, 'seq_num'].values[0]

replace_seq_num(3, 'Dworzec Tow.', 'Krowodrza Górka')
replace_seq_num(4, 'Kombinat', 'Wzgórza K.')
replace_seq_num(21, 'Kombinat', 'Kopiec Wandy')
replace_seq_num(22, 'Kombinat', 'Walcownia')
replace_seq_num(44, 'Kombinat', 'Kopiec Wandy')

df['seq_num'] = df['seq_num'].fillna(-1)

# group hours into: rush_hours -3 , normal -2 , no_traffic -1
def day_time(x):
    if x>14 and x<18:
        return 3
    elif x>0 and x<7:
        return 1
    return 2
df['day_time'] = df['hour'].map(day_time)

# create length of route (number of stops) for each number and direction
df['route_len'] = df.groupby(['number', 'direction'])['seq_num'].transform('max')

# group seq_num
cut_labels = [-1, 1, 2, 3, 4, 5, 6]
cut_bins = [-2, 0, 5, 8, 14, 22, 27, 37]
df['seq_num_cut'] = pd.cut(df['seq_num'], bins=cut_bins, labels=cut_labels)

# group stops based on average of delay
def group_stops(grp):
    avg = grp['delay'].mean()
    if avg < 20:
        grp['stop_group'] = 1
    elif avg < 40:
        grp['stop_group'] = 2
    elif avg < 60:
        grp['stop_group'] = 3
    elif avg < 80:
        grp['stop_group'] = 4
    else:
        grp['stop_group'] = 5
    return grp

df = df.groupby('stop_name').apply(group_stops)

# function calculate statistics for given features
def df_group_delay(df, groupby_feats):
    agg_params = {
        'mean_{}_delay'.format('_'.join(groupby_feats)): ('delay', 'mean'),
        'mean_{}_delay_sec'.format('_'.join(groupby_feats)): ('delay_sec', 'mean'),
        'median_{}_delay_sec'.format('_'.join(groupby_feats)): ('delay_sec', 'median'),
        'std_{}_delay'.format('_'.join(groupby_feats)): ('delay', 'std'),
        'count_zeros_{}_delay'.format('_'.join(groupby_feats)): ('delay', lambda vals: len([x for x in vals if x == 0]) ),
        'prob_zeros_{}_delay'.format('_'.join(groupby_feats)): ('delay', lambda vals: np.mean([x == 0 for x in vals]) ),
    }
    
    return df[groupby_feats + ['delay', 'delay_sec']].groupby(groupby_feats).agg(
        **agg_params
    ).reset_index()

df_tmp = df_group_delay(df, ['stop', 'number', 'direction'])
if 'mean_stop_number_direction_delay' not in df:
    df = pd.merge(df, df_tmp, on=['stop', 'number', 'direction'], how='left')
    
# time difference between arrivals for each stop and day based on planned_time 
df['date'] = df['planned_time'].dt.date
df['diff'] = df.sort_values(['stop_name','planned_time']).groupby(['stop_name', 'date'])['planned_time'].diff().dt.seconds.div(60, fill_value=0)

#time difference not grouped 
df['time_diff'] = df['planned_time'].diff().dt.total_seconds().fillna(0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308152 entries, 0 to 308151
Data columns (total 28 columns):
 #   Column                                   Non-Null Count   Dtype         
---  ------                                   --------------   -----         
 0   id                                       308152 non-null  int64         
 1   delay                                    175986 non-null  float64       
 2   datetime                                 175986 non-null  datetime64[ns]
 3   stop                                     308152 non-null  int64         
 4   stop_name                                308152 non-null  object        
 5   number                                   308152 non-null  int64         
 6   direction                                308152 non-null  object        
 7   planned_time                             308152 non-null  datetime64[ns]
 8   vehicle_id                               308152 non-null  float64       
 9   trip_id                   

In [5]:
#set feats to use in algorithm
feats_num = df.select_dtypes('number').columns
feats_drop = ['id', 'delay', 'delay_sec', 'trip_id', 'vehicle_id', 'stop_name_cat']

def get_feat(feats_num, feats_drop, feats_add=[]):
#     print(feats_drop)
    feats = [feat for feat in feats_num if feat not in feats_drop]
    return feats + feats_add

In [6]:
#xgboost parameters
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.2,
            'subsample': 0.92,
            'random_state': 21,
            'n_estimators': 100,
            }

def validate_model(df, feats, params, eli=True):
    df_train = df[~df['delay'].isnull()].copy()
    
    X = df_train[feats].values
    y = df_train['delay'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)

    model = XGBRegressor(**params)
    model.fit(X_train, np.sqrt(y_train))
#     model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_absolute_error(y_test, y_pred**2)
#     score = mean_absolute_error(y_test, y_pred)
    print('MAE: {}'.format(score))

    if eli:
        model.fit(X, y)
        display(eli5.show_weights(model, feature_names=feats))
        
def submit(df, feats, params):
    train = df[~df['delay'].isnull()].copy()
    test = df[df['delay'].isnull()]
    test = test.drop('delay', axis=1)
    
    X = train[feats].values
    y = train['delay'].values
    X_test = test[feats].values
    
    model = XGBRegressor(**params)
#     model.fit(X, y)
#     y_pred = model.predict(X_test)
    
    model.fit(X, np.sqrt(y))
    y_pred = (model.predict(X_test))**2
    
    test['delay'] = y_pred
    test[ ['id', 'delay'] ].to_csv("../output/xgb_1.csv", index=False)

In [7]:
feats = get_feat(feats_num, feats_drop)
validate_model(df, feats, params)

MAE: 27.646713547827403


Weight,Feature
0.7219,time_diff
0.0878,prob_zeros_stop_number_direction_delay
0.0393,mean_stop_number_direction_delay_sec
0.0303,mean_stop_number_direction_delay
0.0237,day_time
0.0186,median_stop_number_direction_delay_sec
0.0136,diff
0.0117,std_stop_number_direction_delay
0.0101,hour
0.0084,stop_group


In [8]:
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.2,
            'subsample': 0.92,
            'random_state': 21,
            'n_estimators': 400}
feats = get_feat(feats_num, feats_drop)
validate_model(df, feats, params, eli=False)

MAE: 27.754449167406698


In [9]:
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.15,
            'subsample': 0.92,
            'random_state': 21,
            'n_estimators': 400}
feats = get_feat(feats_num, feats_drop)
validate_model(df, feats, params, eli=False)

MAE: 27.63159433815376


In [10]:
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.1,
            'subsample': 0.92,
            'random_state': 21,
            'n_estimators': 400}
feats = get_feat(feats_num, feats_drop)
validate_model(df, feats, params, eli=False)

MAE: 27.54620882949872


In [12]:
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.1,
            'subsample': 0.92,
            'random_state': 21,
            'n_estimators': 600}
feats = get_feat(feats_num, feats_drop)
validate_model(df, feats, params, eli=False)

MAE: 27.571497660343
