In [29]:
import joblib
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

In [3]:
# Load the persisted model that is used for prediction further down.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_model = joblib.load("XGBoost_model.joblib")

In [10]:
# Load the test set for which predictions are generated.
data = pd.read_parquet("data/final_test.parquet")

In [5]:
# Load the external data that is later joined to the test set on "date".
external_data = pd.read_csv("data/external_data.csv")

In [11]:
# Record each row's original position. The frame is later sorted by date for
# the as-of merge and sorted back by "Id" before predicting.
data["Id"] = range(len(data))

In [9]:
# Parse the timestamps so the frame can be sorted and as-of merged on "date".
external_data["date"] = pd.to_datetime(external_data["date"])

# Drop columns that contain no values at all.
external_data = external_data.dropna(axis=1, how="all")

# Keep only columns with at least 80% non-missing values: `thresh` is the
# minimum number of non-NA entries a column needs in order to be kept.
threshold = 0.8
external_data = external_data.dropna(thresh=threshold * len(external_data), axis=1)

# Fill the remaining gaps with each column's mean. numeric_only=True restricts
# the mean to numeric columns, which avoids the pandas warning about
# non-numeric columns and the TypeError raised by pandas >= 2.0.
external_data = external_data.fillna(external_data.mean(numeric_only=True))

# Remove the "numer_sta" column; it is not among the model features selected below.
external_data = external_data.drop("numer_sta", axis=1)

  external_data = external_data.fillna(external_data.mean())


In [13]:
# merge_asof requires both frames to be sorted on the join key.
data_sorted = data.sort_values(by="date")
external_data_sorted = external_data.sort_values(by="date")

# For every test row, attach the most recent external row at or before its
# timestamp (direction="backward"), provided that row is at most 6 hours older.
# Test rows without a match inside that window get NaN in the external columns.
merged_data = pd.merge_asof(
    data_sorted,
    external_data_sorted,
    on="date",
    direction="backward",
    # Built from keyword arguments rather than the string "6H": the uppercase
    # "H" unit alias is deprecated in pandas >= 2.2.
    tolerance=pd.Timedelta(hours=6),
)

In [18]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# every column added to feature_set below is added to merged_data as well.
feature_set = merged_data

In [19]:
# Make sure 'date' holds datetime values before the .dt accessor is used
feature_set["date"] = pd.to_datetime(feature_set["date"])

# Derive calendar features from the timestamp: (new column, .dt attribute)
for feature_name, dt_attribute in (
    ("day_of_week", "dayofweek"),
    ("month", "month"),
    ("hour", "hour"),
):
    feature_set[feature_name] = getattr(feature_set["date"].dt, dt_attribute)

In [20]:
# Columns for which lagged copies are created
lag_variables = [
    "pmer",
    "t",
    "ff",
    "u",
    "pres",
    "raf10",
    "rr1",
]

In [21]:
# Add three lagged copies (shifted by 1, 2 and 3 rows) of every lag variable.
# NOTE(review): shift() moves values by row position in the frame; this equals
# "hours" only if consecutive rows are exactly one hour apart. Confirm this
# matches how the model's training features were built.
for variable in lag_variables:
    base_column = feature_set[variable]
    for i in (1, 2, 3):
        feature_set[f"{variable}_lag_{i}"] = base_column.shift(periods=i)

In [24]:
# Project 'ff' onto x/y components using the angle 'dd' (converted from degrees)
wind_direction_rad = np.radians(feature_set["dd"])
feature_set["wind_x"] = feature_set["ff"] * np.cos(wind_direction_rad)
feature_set["wind_y"] = feature_set["ff"] * np.sin(wind_direction_rad)

# Pairwise products and a one-row lag of 't'
feature_set["temp_humidity_interaction"] = feature_set["t"].mul(feature_set["u"])
feature_set["temp_lag_1"] = feature_set["t"].shift(periods=1)
feature_set["cloud_wind_interaction"] = feature_set["cl"].mul(feature_set["ff"])

# Values of 'rr1' (precipitation over 1 hour) from one and two rows earlier
for lag in (1, 2):
    feature_set[f"precipitation_lag_{lag}"] = feature_set["rr1"].shift(periods=lag)

# Interaction between precipitation and temperature
feature_set["interaction_precipitation_temperature"] = feature_set["rr1"].mul(
    feature_set["t"]
)

# Age of the bike counter: whole days between its installation date and 'date'
time_since_installation = (
    feature_set["date"] - feature_set["counter_installation_date"]
)
feature_set["counter_age"] = time_since_installation.dt.days

In [30]:
# Impute NaN with the column mean. numeric_only=True limits the mean to numeric
# columns: text/categorical columns have no mean, and pandas >= 2.0 raises a
# TypeError for them instead of silently skipping them with a warning.
feature_set_cleaned = feature_set.fillna(feature_set.mean(numeric_only=True))

  feature_set_cleaned = feature_set.fillna(feature_set.mean())
  feature_set_cleaned = feature_set.fillna(feature_set.mean())


In [31]:
# Load the Velib station table and coerce its coordinates to numeric values
# before they are used anywhere.
velib = pd.read_csv("data/velib_cleaned.csv")
velib["latitude"] = pd.to_numeric(velib["latitude"])
velib["longitude"] = pd.to_numeric(velib["longitude"])

# Nearest-station lookup: build a KD-tree over the station coordinates and
# query it with the coordinates of every row in the feature set (k=1 returns
# the single closest station per row).
velib_tree = cKDTree(velib[["latitude", "longitude"]].to_numpy())
feature_set_coords = feature_set_cleaned[["latitude", "longitude"]]
distances, indices = velib_tree.query(feature_set_coords.to_numpy(), k=1)

# NOTE: this is the Euclidean distance in the raw units of the
# latitude/longitude columns, not a geodesic distance in metres.
feature_set_cleaned["distance_to_nearest_velib"] = distances

In [34]:
# Add the calendar year extracted from the 'date' column.
feature_set_cleaned["year"] = feature_set_cleaned["date"].dt.year

In [41]:
# Sort by Id to restore the original row order.

In [37]:
# Put the rows back into ascending "Id" order (the order before the date sort)
feature_set_cleaned = feature_set_cleaned.sort_values("Id", ascending=True)

In [43]:
def _lags(column):
    """Return the names of the three lag columns derived from ``column``."""
    return [f"{column}_lag_{step}" for step in (1, 2, 3)]


# Numeric model inputs. The column order below is significant and must not be
# rearranged: the frame passed to the model is built in exactly this order.
numerical_features_xgb = [
    "counter_age",
    "t",
    *_lags("t"),
    "tend24",
    "distance_to_nearest_velib",
    "td",
    "temp_humidity_interaction",
    "wind_x",
    "wind_y",
    "tend",
    *_lags("u"),
    "hbas",
    "u",
    "rr24",
    "rafper",
    "n",
    "cloud_wind_interaction",
    "nbas",
    "rr12",
    "rr6",
    "pres",
    *_lags("pres"),
    *_lags("raf10"),
    *_lags("ff"),
    "pmer",
    *_lags("pmer"),
    "rr3",
    "raf10",
]

# Categorical model inputs, appended after the numeric ones
categorical_features_xgb = [
    "hour",
    "counter_name",
    "counter_technical_id",
    "site_name",
    "day_of_week",
    "etat_sol",
]

# Full, ordered list of model input columns: numeric first, then categorical
selected_features_xgb = [*numerical_features_xgb, *categorical_features_xgb]

In [44]:
# Take an independent copy of the model input columns, in the listed order
df_xgb = feature_set_cleaned.loc[:, selected_features_xgb].copy()

In [50]:
# Cast these columns to pandas' categorical dtype
for categorical_column in ("hour", "day_of_week", "etat_sol"):
    df_xgb[categorical_column] = df_xgb[categorical_column].astype("category")

In [51]:
# Sanity check: print column dtypes and non-null counts of the model input.
df_xgb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51440 entries, 0 to 51285
Data columns (total 46 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   counter_age                51440 non-null  int64   
 1   t                          51440 non-null  float64 
 2   t_lag_1                    51440 non-null  float64 
 3   t_lag_2                    51440 non-null  float64 
 4   t_lag_3                    51440 non-null  float64 
 5   tend24                     51440 non-null  float64 
 6   distance_to_nearest_velib  51440 non-null  float64 
 7   td                         51440 non-null  float64 
 8   temp_humidity_interaction  51440 non-null  float64 
 9   wind_x                     51440 non-null  float64 
 10  wind_y                     51440 non-null  float64 
 11  tend                       51440 non-null  int64   
 12  u_lag_1                    51440 non-null  float64 
 13  u_lag_2                    5144

In [52]:
# Run the loaded model on the prepared features.
# NOTE(review): presumably returns one prediction per row of df_xgb, in row
# order — confirm for the loaded model type.
predictions = loaded_model.predict(df_xgb)

In [55]:
# Label every prediction with the actual "Id" of the row it was computed from
# instead of relying on an implicit 0..n-1 positional index. df_xgb is a
# column subset of feature_set_cleaned, so both share the same row order. When
# the rows are sorted by "Id" the result is identical to the positional index,
# and it stays correct even if that sort was skipped.
predictions_df = pd.DataFrame(
    {"log_bike_count": predictions},
    index=pd.Index(feature_set_cleaned["Id"].to_numpy(), name="Id"),
)

In [56]:
# Write the predictions to CSV; the DataFrame index is written as the "Id" column.
predictions_df.to_csv("predictions1.csv", index=True, index_label="Id")