# Basic Feature Engineering

# In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# In [5]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor

# Root-mean-squared error between targets and predictions
def RMSE(y, y_hat):
    """Return the root-mean-squared error between ``y`` and ``y_hat``.

    The element-wise difference ``y - y_hat`` is squared, averaged with
    ``np.mean`` and square-rooted.

    Parameters
    ----------
    y : array-like supporting ``-``
        Observed values.
    y_hat : array-like supporting ``-``
        Predicted values.
    """
    residual = y - y_hat
    mean_squared_error = np.mean(np.square(residual))
    return np.sqrt(mean_squared_error)


def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])


#def _merge_external_data(X):
#    file_path = Path(__file__).parent / "external_data.csv"
#    df_ext = pd.read_csv(file_path, parse_dates=["date"])
#
#    X = X.copy()
#    # When using merge_asof left frame need to be sorted
#    X["orig_index"] = np.arange(X.shape[0])
#    X = pd.merge_asof(
#        X.sort_values("date"), df_ext[["date", "t"]].sort_values("date"), on="date"
#    )
#    # Sort back to the original order
#    X = X.sort_values("orig_index")
#    del X["orig_index"]
#    return X


def get_estimator():
    """Build the unfitted modelling pipeline.

    The pipeline chains three steps via ``make_pipeline``:

    1. a ``FunctionTransformer`` wrapping ``_encode_dates``;
    2. a ``ColumnTransformer`` that one-hot encodes the calendar columns
       (transformer ``"date"``) and the counter/site columns
       (transformer ``"cat"``), both ignoring unknown categories;
    3. a ``CatBoostRegressor`` with the hyper-parameters listed below.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, not-yet-fitted pipeline.
    """
    calendar_features = ["year", "month", "day", "weekday", "hour"]
    identity_features = ["counter_name", "site_name", "coordinates"]

    # One independent encoder per column group.
    one_hot_columns = ColumnTransformer(
        [
            ("date", OneHotEncoder(handle_unknown="ignore"), calendar_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), identity_features),
        ]
    )

    booster_params = {
        "depth": 11,
        "iterations": 1000,
        "rsm": 0.05,
        "sampling_frequency": "PerTree",
        "subsample": 0.9,
        "verbose": 0,
    }

    return make_pipeline(
        FunctionTransformer(_encode_dates),
        one_hot_columns,
        CatBoostRegressor(**booster_params),
    )

# Load the training set and the final test set from the working directory.
data = pd.read_parquet('train.parquet')
X_test = pd.read_parquet('final_test.parquet')

# Target and the raw feature columns consumed by the pipeline.
y = data['log_bike_count']
X = data[['counter_name', 'site_name', 'date', 'coordinates']]
X_test = X_test[['counter_name', 'site_name', 'date', 'coordinates']]

# Build and fit the pipeline on the full training frame.
pipe = get_estimator()
pipe.fit(X, y)

# NOTE(review): predictions are computed on the training features X;
# X_test is prepared above but is not scored in this cell.
pred = pipe.predict(X)





{'counter_id': 56, 'counter_name': 56, 'site_id': 30, 'site_name': 30, 'bike_count': 998, 'date': 8974, 'counter_installation_date': 22, 'coordinates': 30, 'counter_technical_id': 30, 'latitude': 30, 'longitude': 30, 'log_bike_count': 998}
