In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [42]:
data = pd.read_parquet("data/train.parquet")

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455163 entries, 48321 to 928462
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 455163 non-null  category      
 1   counter_name               455163 non-null  category      
 2   site_id                    455163 non-null  int64         
 3   site_name                  455163 non-null  category      
 4   bike_count                 455163 non-null  float64       
 5   date                       455163 non-null  datetime64[ns]
 6   counter_installation_date  455163 non-null  datetime64[ns]
 7   counter_technical_id       455163 non-null  category      
 8   latitude                   455163 non-null  float64       
 9   longitude                  455163 non-null  float64       
 10  log_bike_count             455163 non-null  float64       
dtypes: category(4), datetime64[ns](2), float64(4), i

In [44]:
data["date"] = pd.to_datetime(data["date"])
data["day_of_week"] = data["date"].dt.dayofweek
data["month"] = data["date"].dt.month
data["hour"] = data["date"].dt.hour

In [45]:
categorical_cols = [
    "counter_name",
    "site_name",
    "counter_technical_id",
    "day_of_week",
    "month",
    "hour",
]

In [46]:
numerical_cols = ["latitude", "longitude"]

In [47]:
for feature in categorical_cols:
    data[feature] = data[feature].astype(str)

In [48]:
cols_to_drop_train = ["counter_id", "site_id", "date", "counter_installation_date"]

In [49]:
data = data.drop(cols_to_drop_train, axis=1)
data = data.drop("bike_count", axis=1)
X = data.drop("log_bike_count", axis=1)
y = data["log_bike_count"]

In [50]:
encoder = OneHotEncoder(drop="first", sparse_output=False)

In [51]:
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))
X = pd.concat([X.drop(categorical_cols, axis=1), X_encoded], axis=1)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

ValueError: Found input variables with inconsistent numbers of samples: [724554, 455163]

In [21]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "booster": "gbtree",
        "eval_metric": "rmse",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 1.0),
    }

    xgb_model = XGBRegressor(**params)
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [22]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

[I 2023-12-06 14:29:54,884] A new study created in memory with name: no-name-3d0e20b7-3c93-40b9-941d-e14b8c246730
[W 2023-12-06 14:29:54,896] Trial 0 failed with parameters: {'learning_rate': 0.10996874594655182, 'max_depth': 6, 'n_estimators': 102, 'subsample': 0.9554536686097066, 'colsample_bytree': 0.8719521793150562} because of the following error: ValueError('DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:counter_name: category, site_name: category, counter_technical_id: category, day_of_week: category, month: category, hour: category').
Traceback (most recent call last):
  File "c:\Users\ameya\miniforge-pypy3\envs\bikes-count\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ameya\AppData\Local\Temp\ipykernel_24348\1989577182.py", line 14, in objective
    xgb_model

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:counter_name: category, site_name: category, counter_technical_id: category, day_of_week: category, month: category, hour: category

In [16]:
reg = XGBRegressor(enable_categorical=True)

In [17]:
reg.fit(X_train, y_train)

ValueError: Experimental support for categorical data is not implemented for current tree method yet.

In [13]:
from sklearn.ensemble import AdaBoostRegressor

In [14]:
catboost_predictions = best_cat_reg.predict(X_train)
catboost_predictions_2d = np.expand_dims(catboost_predictions, axis=1)

In [15]:
ada_boost = AdaBoostRegressor(
    base_estimator=best_cat_reg,
    n_estimators=10,
    random_state=42,
    learning_rate=0.1,
)

In [22]:
feature_names = X_train.columns.tolist()

In [24]:
ada_boost.fit(X_train, y_train, feature_names=feature_names)

TypeError: BaseWeightBoosting.fit() got an unexpected keyword argument 'feature_names'

In [21]:
test_data = pd.read_parquet("data/final_test.parquet")
test_data["date"] = pd.to_datetime(test_data["date"])
test_data["day_of_week"] = test_data["date"].dt.dayofweek
test_data["month"] = test_data["date"].dt.month
test_data["hour"] = test_data["date"].dt.hour

In [22]:
cols_to_drop_test = [
    "counter_id",
    "site_id",
    "date",
    "counter_installation_date",
    "coordinates",
]

In [23]:
test_data = test_data.drop(cols_to_drop_test, axis=1)

In [24]:
predictions_cat = best_cat_reg.predict(test_data)

In [25]:
predictions_cat_2d = np.expand_dims(predictions_cat, axis=1)

In [27]:
ada_boost_predictions = ada_boost.predict(predictions_cat_2d)

In [28]:
predictions_df = pd.DataFrame({"log_bike_count": ada_boost_predictions})

In [29]:
predictions_df.to_csv("submissions.csv", index=True, index_label="Id")

In [3]:
bs = pd.read_parquet("data/train_kaggle.parquet")

In [4]:
bs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  floa