In [2]:
import sys
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
sys.path.append("..")
import utility.utility_functions as ut
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the prepared all-zone data and clean it in one idempotent expression:
# discard rows with missing values, discard rows whose trip_time is below the
# threshold, remove the pickup_date column, and renumber the rows from 0.
MIN_TRIP_TIME = 5  # rows with trip_time below this are discarded (units are not shown in this notebook)

df = (
    ut.load_post_data_prep_all_zone()
    .dropna()
    # Boolean mask rather than drop-by-label, so rows that merely share an
    # index label with a short trip are not removed as well.
    .loc[lambda frame: frame['trip_time'] >= MIN_TRIP_TIME]
    .drop(columns='pickup_date')
    .reset_index(drop=True)
)

In [4]:
# Build the modelling frame from `df` with the project helpers, then separate
# the target column 'num_of_taxis' from the predictors.
# NOTE(review): the helpers live in utility.utility_functions and are not
# visible here; presumably get_dummies one-hot encodes the listed columns and
# swap_first_and_last_col exchanges the first and last columns - confirm.
main_df = ut.swap_first_and_last_col(
    ut.get_dummies(['weekday', 'time_binned'], df)
)
# split_X_y is unpacked target-first: y, then X.
y, X = ut.split_X_y(main_df, 'num_of_taxis')

In [5]:
# Min-max scale every predictor column and wrap the result back into a
# DataFrame carrying X's column names and row index, so scaled_X stays
# label-aligned with y as well as positionally aligned.
# NOTE(review): the scaler is fitted on all rows of X, before the train/test
# split performed further down, so the held-out rows influence the scaling.
scaler = MinMaxScaler()
scaled_X = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [6]:
# Feature score
# Fit a plain linear regression on the unscaled predictors and use each
# coefficient as that feature's score.
features = main_df.columns.tolist()  # every column of main_df; kept because later cells may read it
model_lr = LinearRegression()
model_lr.fit(X, y)
importance = model_lr.coef_
# NOTE(review): the recorded output shows coefficients around 8e11 for the
# time-bin columns, which suggests the predictors are perfectly collinear
# (e.g. a full set of dummy columns plus an intercept) - confirm before
# reading these numbers as importances.
# Pair each coefficient with the column it was actually fitted on, instead of
# relying on main_df's column order matching X's.
for name, score in zip(X.columns, importance):
    print(f'Feature: {name}, Score: {score:.2f}')

Feature: Zone, Score: 0.32]
Feature: 00:00 - 00:59, Score: 800672609152.22]
Feature: 01:00 - 01:59, Score: 800672609141.45]
Feature: 02:00 - 02:59, Score: 800672609131.81]
Feature: 03:00 - 03:59, Score: 800672609122.36]
Feature: 04:00 - 04:59, Score: 800672609121.13]
Feature: 05:00 - 05:59, Score: 800672609123.67]
Feature: 06:00 - 06:59, Score: 800672609142.59]
Feature: 07:00 - 07:59, Score: 800672609153.39]
Feature: 08:00 - 08:59, Score: 800672609153.21]
Feature: 09:00 - 09:59, Score: 800672609146.51]
Feature: 10:00 - 10:59, Score: 800672609145.08]
Feature: 11:00 - 11:59, Score: 800672609146.50]
Feature: 12:00 - 12:59, Score: 800672609152.29]
Feature: 13:00 - 13:59, Score: 800672609152.82]
Feature: 14:00 - 14:59, Score: 800672609157.26]
Feature: 15:00 - 15:59, Score: 800672609157.07]
Feature: 16:00 - 16:59, Score: 800672609149.74]
Feature: 17:00 - 17:59, Score: 800672609173.18]
Feature: 18:00 - 18:59, Score: 800672609199.01]
Feature: 19:00 - 19:59, Score: 800672609200.93]
Feature: 20:

In [7]:
# Tabulate (feature name, coefficient) pairs and keep the names ordered by
# ascending score. Names are taken from X - the frame the regression was
# fitted on - so they match `importance` one-to-one regardless of where the
# target column sits in main_df.
# NOTE(review): the sort uses the signed coefficient, so strongly negative
# coefficients come first; confirm that this, rather than magnitude, is the
# intended ordering.
feature_dict = {'Features': X.columns.tolist(), 'Score': importance}
feature_importance_sorted = (
    pd.DataFrame(feature_dict)
    .sort_values(by=['Score'], ascending=True)['Features']
)

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
#Backwards
# Backward elimination: start with every scaled feature, fit an XGBoost
# regressor, record its RMSE, then remove the next feature in
# `feature_importance_sorted` order (ascending score) and repeat.
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(scaled_X, y, test_size=0.3, random_state=123)

# Hyperparameters are identical for every fit, so define them once.
xgb_params = dict(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                  max_depth=5, alpha=10, n_estimators=10)

X_train_scaled_backwards = X_train_scaled.copy()
backward_rmse_list = []
# Baseline to beat: the standard deviation of the full target.
backward_best_rmse = np.std(y)
# Initialised so the name is always bound (and never stale from an earlier
# run); it stays empty if no feature subset beats the baseline.
backward_best_features = []

# NOTE(review): RMSE is measured on the training rows; X_test_scaled / y_test
# are created above but not used for the selection in this cell.
for feature in feature_importance_sorted:
    xg_reg = xgb.XGBRegressor(**xgb_params)
    xg_reg.fit(X_train_scaled_backwards, y_train)
    train_pred = xg_reg.predict(X_train_scaled_backwards)
    rmse_train = np.sqrt(mean_squared_error(y_train, train_pred))
    backward_rmse_list.append(rmse_train)
    if backward_best_rmse > rmse_train:
        backward_best_rmse = rmse_train
        backward_best_features = X_train_scaled_backwards.columns.tolist()
    # Remove this feature before the next fit (reassign instead of mutating in place).
    X_train_scaled_backwards = X_train_scaled_backwards.drop(columns=feature)

In [9]:
# Report the backward pass: the target's standard deviation, the lowest RMSE
# found, and the feature subset that produced it.
print(
    f'Target Feature STD: {np.std(y):.2f}, '
    f'Our best rmse is: {backward_best_rmse:.2f} '
    f'with {len(backward_best_features)} features '
    f'and the features are: {backward_best_features}'
)

Target Feature STD: 136.78, Our best rmse is: 115.40 with 36 features and the features are: ['Zone', '00:00 - 00:59', '01:00 - 01:59', '02:00 - 02:59', '03:00 - 03:59', '04:00 - 04:59', '05:00 - 05:59', '06:00 - 06:59', '07:00 - 07:59', '08:00 - 08:59', '09:00 - 09:59', '10:00 - 10:59', '11:00 - 11:59', '12:00 - 12:59', '13:00 - 13:59', '14:00 - 14:59', '15:00 - 15:59', '16:00 - 16:59', '17:00 - 17:59', '18:00 - 18:59', '19:00 - 19:59', '20:00 - 20:59', '21:00 - 21:59', '22:00 - 22:59', '23:00 - 23:59', 'Tmax', 'Tmin', 'Tdep', 'HDD', 'CDD', 'Precipitation', 'new_snow', 'snow_depth', 'trip_distance', 'trip_time', 'speed']


In [10]:
#Forwards
forward_rmse_list = []
forward_best_rmse = np.std(y)
forward_list = []
X_train_scaled_forward = []
for feature in list(reversed(feature_importance_sorted.tolist())):
    forward_list.append(feature)
    X_train_scaled_forward = X_train_scaled[forward_list]
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
    xg_reg.fit(X_train_scaled_forward,y_train)
    train_pred = xg_reg.predict(X_train_scaled_forward)
    rmse_train = np.sqrt(mean_squared_error(y_train,train_pred))
    forward_rmse_list.append(rmse_train)
    if forward_best_rmse > rmse_train:
        forward_best_rmse = rmse_train
        forward_best_features = X_train_scaled_forward.columns.tolist()

In [11]:
# Report the forward pass: the target's standard deviation, the lowest RMSE
# found, and the feature subset that produced it.
print(
    f'Target Feature STD: {np.std(y):.2f}, '
    f'Our best rmse is: {forward_best_rmse:.2f} '
    f'with {len(forward_best_features)} features '
    f'and the features are: {forward_best_features}'
)

Target Feature STD: 136.78, Our best rmse is: 115.92 with 37 features and the features are: ['Tmax', 'Tmin', '21:00 - 21:59', '19:00 - 19:59', '18:00 - 18:59', '22:00 - 22:59', '20:00 - 20:59', '23:00 - 23:59', '17:00 - 17:59', '14:00 - 14:59', '15:00 - 15:59', '07:00 - 07:59', '08:00 - 08:59', '13:00 - 13:59', '12:00 - 12:59', '00:00 - 00:59', '16:00 - 16:59', '09:00 - 09:59', '11:00 - 11:59', '10:00 - 10:59', '06:00 - 06:59', '01:00 - 01:59', '02:00 - 02:59', '05:00 - 05:59', '03:00 - 03:59', '04:00 - 04:59', 'trip_distance', 'snow_depth', 'CDD', 'Precipitation', 'Zone', 'Tdep', 'HDD', 'new_snow', 'trip_time', 'speed', 'Saturday']


In [12]:
# Persist the feature subset chosen by the backward pass, one name per line,
# to best_features.txt in the parent directory.
with open('../best_features.txt', 'w') as out_fh:
    out_fh.writelines("%s\n" % feature_name for feature_name in backward_best_features)