In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
%matplotlib inline

Mounted at /content/drive


In [2]:
# train/test 데이터셋 preprocessing을 위한 functions

def start_pipeline(df):
  return df.copy()

def concat_columns(df, col1, col2):
  #combine two categorical variables
  df[f"{col1}_{col2}"] = df[col1].astype(str) + df[col2].astype(str)
  return df

def target_encoding(df, variable_to_encode, target_variable):
  # encode a categorical variable with the mean value of each category
  # variable_to_encode is the categorical variable we wish to encode
  # in this case, target_variable is ATA
  means_dict = df.groupby(variable_to_encode)[target_variable].mean().to_dict()
  df[f"mean_{target_variable}_by_{variable_to_encode}"] = df[variable_to_encode].replace(means_dict)
  return df

def distance_column(df):
  # calculate distance from latitude and longitude
  df['distance'] = ((df['pickup_lat'] - df['driver_lat'])**2 +
                    (df['pickup_lng'] - df['driver_lng'])**2)*100000
  return df


def pick_columns(df, column_lst):
  #select the variables (ie columns) to use for model training
  return df[column_lst]

In [3]:
tada_eta = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/data/tada_eta.xlsx')
tada_eta = tada_eta.sample(frac=1, random_state=0).reset_index(drop=True)
train = tada_eta[:12000]
test = tada_eta[12000:]

# Select features to use for model training
use_features = ["ATA", "api_eta", "hour", 'distance', 'mean_ATA_by_pickup_gu_hour']

# Preprocess train & test datasets
train = (train
         .pipe(start_pipeline)
         .pipe(concat_columns, "pickup_gu", "hour")
         .pipe(target_encoding, variable_to_encode="pickup_gu_hour", target_variable='ATA')
         .pipe(distance_column)
         .pipe(pick_columns, column_lst=use_features))
test = (test
         .pipe(start_pipeline)
         .pipe(concat_columns, "pickup_gu", "hour")
         .pipe(target_encoding, variable_to_encode="pickup_gu_hour", target_variable='ATA')
         .pipe(distance_column)
         .pipe(pick_columns, column_lst=use_features))

x_train = np.asarray(train.drop(labels = 'ATA', axis=1))
y_train = np.asarray(train['ATA'])
x_test = np.asarray(test.drop(labels='ATA', axis=1))
y_test = np.asarray(test['ATA'])

# Choose hyperparameters
# GridSearch 사용하지 않고 직접 찾았습니다
hyperparams = {'n_estimators': 186,
               'max_depth': 2,
               'min_samples_leaf': 1,
               'learning_rate': 0.04,
               'loss': 'squared_error'}

gb = ensemble.GradientBoostingRegressor(**hyperparams)
gb.fit(x_train, y_train)
print("Training complete")
print("Training set MSE: ", gb.train_score_[-1])

mse = mean_squared_error(y_test, gb.predict(x_test))
print("Test set MSE",mse)

Training complete
Training set MSE:  7.767936958243358
Test set MSE 7.759516283188541
