In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor



In [2]:
# Load the training set from the project-relative parquet file.
data = pd.read_parquet(path="data/train.parquet")

In [3]:
# Print the column names, dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455163 entries, 48321 to 928462
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 455163 non-null  category      
 1   counter_name               455163 non-null  category      
 2   site_id                    455163 non-null  int64         
 3   site_name                  455163 non-null  category      
 4   bike_count                 455163 non-null  float64       
 5   date                       455163 non-null  datetime64[ns]
 6   counter_installation_date  455163 non-null  datetime64[ns]
 7   counter_technical_id       455163 non-null  category      
 8   latitude                   455163 non-null  float64       
 9   longitude                  455163 non-null  float64       
 10  log_bike_count             455163 non-null  float64       
dtypes: category(4), datetime64[ns](2), float64(4), i

In [4]:
# Make sure `date` is a datetime column, then derive the calendar features
# (weekday number, month number, hour of day) that are used as model inputs.
data = data.assign(date=pd.to_datetime(data["date"]))
data = data.assign(
    day_of_week=data["date"].dt.dayofweek,
    month=data["date"].dt.month,
    hour=data["date"].dt.hour,
)

In [5]:
# Columns handed to CatBoost as categorical features: the counter/site
# descriptors first, then the calendar features derived from `date`.
categorical_cols = ["counter_name", "site_name", "counter_technical_id"]
categorical_cols += ["day_of_week", "month", "hour"]

In [6]:
# Numeric (float) coordinate features.
# NOTE(review): this list is not referenced by any later cell visible here.
numerical_cols = [
    "latitude",
    "longitude",
]

In [7]:
# Cast every categorical feature to `str` in one pass so that the integer
# calendar features and the pandas `category` columns share a string dtype.
data = data.astype(dict.fromkeys(categorical_cols, str))

In [8]:
# Identifier and raw timestamp columns that are removed before training.
cols_to_drop_train = [
    "counter_id",
    "site_id",
    "date",
    "counter_installation_date",
]

In [9]:
# Strip the non-feature columns in a single pass. `bike_count` is dropped too —
# presumably because `log_bike_count` is derived from it (target leakage); confirm.
data = data.drop(columns=[*cols_to_drop_train, "bike_count"])

# Target is the log-count column; the features are everything that is left.
y = data["log_bike_count"]
X = data.drop(columns="log_bike_count")

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Guarantee that the categorical features are strings in both splits.
# `astype` returns new frames, so the frames produced by `train_test_split`
# are not written to column-by-column (which can raise pandas'
# SettingWithCopyWarning on the sliced frames).
categorical_dtypes = {feature: str for feature in categorical_cols}
X_train = X_train.astype(categorical_dtypes)
X_test = X_test.astype(categorical_dtypes)

In [11]:
# Fixed CatBoost hyper-parameters (the values are hard-coded in this cell).
best_params = dict(
    iterations=1402,
    depth=10,
    learning_rate=0.03073417636647055,
    l2_leaf_reg=9.871350035301967,
)

# Wrap each split in a CatBoost Pool so the categorical columns and the
# feature names travel together with the data.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_cols,
    feature_names=X_train.columns.tolist(),
)
test_pool = Pool(
    data=X_test,
    label=y_test,
    cat_features=categorical_cols,
    feature_names=X_test.columns.tolist(),
)

# Regressor configured with the parameters above; training stops early after
# 10 rounds without improvement on the evaluation set passed to `fit`.
best_cat_reg = CatBoostRegressor(
    **best_params,
    cat_features=categorical_cols,
    early_stopping_rounds=10,
    verbose=True,
)

In [12]:
# Train on the training pool; the hold-out pool is the evaluation set that
# drives early stopping and the `test:` / `best:` columns of the log.
# NOTE(review): because the hold-out split also selects the stopping point,
# metrics measured on it afterwards are optimistic.
best_cat_reg.fit(X=train_pool, eval_set=test_pool)

0:	learn: 1.6388929	test: 1.6363574	best: 1.6363574 (0)	total: 361ms	remaining: 8m 26s
1:	learn: 1.6032593	test: 1.6006975	best: 1.6006975 (1)	total: 517ms	remaining: 6m 1s
2:	learn: 1.5689927	test: 1.5663112	best: 1.5663112 (2)	total: 688ms	remaining: 5m 20s
3:	learn: 1.5347047	test: 1.5319411	best: 1.5319411 (3)	total: 884ms	remaining: 5m 9s
4:	learn: 1.5021848	test: 1.4994374	best: 1.4994374 (4)	total: 1.01s	remaining: 4m 41s
5:	learn: 1.4705174	test: 1.4677625	best: 1.4677625 (5)	total: 1.14s	remaining: 4m 25s
6:	learn: 1.4404231	test: 1.4376595	best: 1.4376595 (6)	total: 1.26s	remaining: 4m 11s
7:	learn: 1.4112425	test: 1.4084848	best: 1.4084848 (7)	total: 1.38s	remaining: 3m 59s
8:	learn: 1.3831874	test: 1.3803944	best: 1.3803944 (8)	total: 1.5s	remaining: 3m 51s
9:	learn: 1.3565719	test: 1.3537501	best: 1.3537501 (9)	total: 1.62s	remaining: 3m 44s
10:	learn: 1.3306779	test: 1.3278519	best: 1.3278519 (10)	total: 1.73s	remaining: 3m 39s
11:	learn: 1.3060384	test: 1.3032143	best: 1

<catboost.core.CatBoostRegressor at 0x1edb987dcc0>

In [13]:
from sklearn.ensemble import AdaBoostRegressor

In [14]:
# CatBoost predictions for the training rows, plus a column-vector view of
# shape (n_samples, 1) so they can be used as a single-feature matrix.
catboost_predictions = best_cat_reg.predict(data=X_train)
catboost_predictions_2d = catboost_predictions[:, np.newaxis]

In [15]:
ada_boost = AdaBoostRegressor(
    base_estimator=best_cat_reg,
    n_estimators=10,
    random_state=42,
    learning_rate=0.1,
)

In [22]:
feature_names = X_train.columns.tolist()

In [24]:
ada_boost.fit(X_train, y_train, feature_names=feature_names)

TypeError: BaseWeightBoosting.fit() got an unexpected keyword argument 'feature_names'

In [21]:
# Load the final test set and derive the same calendar features that were
# built for the training frame.
test_data = pd.read_parquet("data/final_test.parquet")
test_data["date"] = pd.to_datetime(test_data["date"])
test_data["day_of_week"] = test_data["date"].dt.dayofweek
test_data["month"] = test_data["date"].dt.month
test_data["hour"] = test_data["date"].dt.hour

# Mirror the training preprocessing: the model was fitted on string-typed
# categorical features, so cast the same columns here to keep the train and
# inference inputs consistent.
for feature in categorical_cols:
    test_data[feature] = test_data[feature].astype(str)

In [22]:
# Non-feature columns removed from the test frame: the same identifier and
# timestamp columns as for training, plus `coordinates`.
cols_to_drop_test = ["counter_id", "site_id", "date", "counter_installation_date"]
cols_to_drop_test.append("coordinates")

In [23]:
# Keep only the model's input features in the test frame.
test_data = test_data.drop(columns=cols_to_drop_test)

In [24]:
# Score the prepared test frame with the fitted CatBoost model.
predictions_cat = best_cat_reg.predict(data=test_data)

In [25]:
# Column-vector view, shape (n_samples, 1), of the CatBoost test predictions.
predictions_cat_2d = predictions_cat[:, np.newaxis]

In [27]:
# Feed the single-column CatBoost predictions through the AdaBoost model.
ada_boost_predictions = ada_boost.predict(X=predictions_cat_2d)

In [28]:
# One-column frame of predictions with a default 0..n-1 index.
predictions_df = pd.Series(ada_boost_predictions, name="log_bike_count").to_frame()

In [29]:
# Write the submission file; the row index is exported as the `Id` column.
predictions_df.to_csv(path_or_buf="submissions.csv", index_label="Id", index=True)

In [3]:
# Load the Kaggle variant of the training data from its parquet file.
bs = pd.read_parquet(path="data/train_kaggle.parquet")

In [4]:
# Print the column names, dtypes and non-null counts of the Kaggle training frame.
bs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[ns]
 6   counter_installation_date  496827 non-null  datetime64[ns]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  floa