In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [3]:
# Read the taxi-fare CSV, parsing the pickup timestamp column as datetimes,
# then preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="data/taxi_fare.csv",
    parse_dates=["pickup_datetime"],
)
df.head(n=5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1.0


In [4]:
# Drop every row that contains a missing value before deriving features.
df = df.dropna()

# Calendar features extracted from the pickup timestamp; the raw timestamp
# column is removed once the parts have been pulled out.
pickup_dt = df["pickup_datetime"].dt
df = df.assign(
    hour=pickup_dt.hour,
    day=pickup_dt.day,
    weekday=pickup_dt.weekday,
    # weekday values of 5 and above are flagged as weekend (1), others as 0
    weekend=lambda frame: (frame["weekday"] >= 5).astype(int),
    month=pickup_dt.month,
    year=pickup_dt.year,
).drop(columns=["pickup_datetime"])

# Trip distance as the sum of the absolute longitude and latitude differences
# (in raw coordinate units); the four coordinate columns are then dropped.
lon_delta = (df["pickup_longitude"] - df["dropoff_longitude"]).abs()
lat_delta = (df["pickup_latitude"] - df["dropoff_latitude"]).abs()
df = df.assign(Distance=lon_delta + lat_delta).drop(
    columns=["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]
)

# Keep only rows with a strictly positive fare and a strictly positive distance.
valid_rows = df["fare_amount"].gt(0) & df["Distance"].gt(0)
df = df[valid_rows]

df.head()

Unnamed: 0,fare_amount,passenger_count,hour,day,weekday,weekend,month,year,Distance
0,4.5,1.0,17,15,0,0,6,2009,0.011742
1,16.9,1.0,16,5,1,0,1,2010,0.107481
2,5.7,2.0,0,18,3,0,8,2011,0.019212
3,7.7,1.0,4,21,5,1,4,2012,0.029386
4,5.3,1.0,7,9,1,0,3,2010,0.027194


<img src="https://media.discordapp.net/attachments/969207152679993414/972863347479433256/unknown.png">

In [5]:
from jcopml.plot import plot_correlation_matrix

# Correlation matrix of the listed numeric features against the fare target,
# rendered as an interactive widget.
plot_correlation_matrix(
    df,
    target_col="fare_amount",
    numeric_col=["Distance", "hour", "day", "weekday", "weekend", "month", "year"],
)

interactive(children=(ToggleButtons(description='method', options=('spearman', 'kendall', 'pearson', 'pearson_…

Hindari fitur-fitur yang saling berkorelasi satu sama lain.

In [6]:
# Separate the target (fare) from the predictor columns.
y = df["fare_amount"]
X = df.drop(columns=["fare_amount"])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((14705, 8), (3677, 8), (14705,), (3677,))

In [7]:
# Bring jcopml's predefined random-search parameter spaces into scope under the
# alias `rsp`, and display the one named `xgb_poly_params`. Its keys use the
# "prep__numeric__..." and "algo__..." prefixes.
from jcopml.tuning import random_search_params as rsp
rsp.xgb_poly_params

{'prep__numeric__poly__degree': Integer(low=1, high=3),
 'prep__numeric__poly__interaction_only': [True, False],
 'algo__max_depth': Integer(low=1, high=10),
 'algo__learning_rate': Real(low=-2, high=0, prior='log-uniform'),
 'algo__n_estimators': Integer(low=100, high=200),
 'algo__subsample': Real(low=0.3, high=0.8, prior='uniform'),
 'algo__gamma': Integer(low=1, high=10),
 'algo__colsample_bytree': Real(low=0.1, high=1, prior='uniform'),
 'algo__reg_alpha': Real(low=-3, high=1, prior='log-uniform'),
 'algo__reg_lambda': Real(low=-3, high=1, prior='log-uniform')}

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Preprocessing: numeric columns get polynomial features (poly=2) and a
# Yeo-Johnson transform; categorical columns are one-hot encoded.
# No `scaling` argument is passed to num_pipe: combined with a transform,
# jcopml warns "Transformer has default standardization, so the scaling
# argument is neglected", so the argument had no effect.
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(poly=2, transform="yeo-johnson"), ['passenger_count', 'year', 'Distance']),
    ('categoric', cat_pipe(encoder='onehot'), ['weekday', 'weekend', 'day', 'month', 'hour']),
])

# The step names "prep" and "algo" (and the "numeric" transformer name above)
# must match the prefixes used by the keys of rsp.xgb_poly_params.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42))
])

# Randomized hyper-parameter search: 100 sampled candidates, 3-fold CV each,
# with a fixed seed so the sampled candidates are reproducible.
model = RandomizedSearchCV(pipeline, rsp.xgb_poly_params, cv=3, n_iter=100, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

print(model.best_params_)
# Printed in order: score on the training set, best cross-validation score,
# score on the held-out test set.
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

  warn("Transformer has default standardization, so the scaling argument is neglected")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'algo__colsample_bytree': 0.8943727088704059, 'algo__gamma': 7, 'algo__learning_rate': 0.04764963542138517, 'algo__max_depth': 3, 'algo__n_estimators': 118, 'algo__reg_alpha': 0.8013508750140631, 'algo__reg_lambda': 3.8765111709116367, 'algo__subsample': 0.7435432121325587, 'prep__numeric__poly__degree': 1, 'prep__numeric__poly__interaction_only': False}
0.8123910670877569 0.7835801137206394 0.7900199958862859


`reg_alpha` adalah regularisasi L1 <br>
`reg_lambda` adalah regularisasi L2 <br>

XGBoost memiliki regularisasi bawaan (built-in).

<img src="https://media.discordapp.net/attachments/969207152679993414/972865170202304612/unknown.png">

https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration