# <span style='color:#ff5f27'> Initialization </span>

### Hopsworks Settings

In [None]:
import sys
from pathlib import Path
import os

def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the object returned by ``get_ipython()``.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description

def clone_repository() -> None:
    """Clone the mlfs-book repository from GitHub and change into its directory.

    The body uses IPython shell (``!``) and magic (``%cd``) syntax, so it only
    works inside an IPython/Jupyter session, not as plain Python.
    """
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    """Upgrade ``uv`` with pip, then install the project's requirements with it.

    ``uv pip install`` reads the requirements from ``pyproject.toml`` in the
    current directory, includes all extras, and targets the system Python
    environment (``--system``). Uses IPython ``!`` shell syntax.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root. Locally it is derived from the working
# directory; in Colab the repository is cloned and its dependencies installed first.
if not is_google_colab():
    root_dir = Path().absolute()
    # If the notebook was started from <root>/notebooks/airquality or from
    # <root>/notebooks, step back up so that root_dir is the repository root.
    for trailing_dir in ('airquality', 'notebooks'):
        if root_dir.name == trailing_dir:
            root_dir = root_dir.parent
    root_dir = str(root_dir)
    print("Local environment")
else:
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

# Put the root directory on sys.path so the `mlfs` package can be imported from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")
    
# Load the settings from <root_dir>/.env (when present) and the process environment.
#
# The settings object is created unconditionally. Previously `settings` was only
# bound when the .env file existed, so later cells reading
# `settings.HOPSWORKS_API_KEY` failed with a NameError on machines without one.
# NOTE(review): assumes HopsworksSettings is a pydantic-settings BaseSettings
# (it accepts `_env_file`); pydantic-settings skips a dotenv path that does not exist.
from mlfs import config
env_file = f"{root_dir}/.env"
if not os.path.exists(env_file):
    print(f"No .env file found at {env_file}; using environment variables only")
settings = config.HopsworksSettings(_env_file=env_file)

### Imports

In [None]:
from datetime import date, datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import hopsworks
from mlfs import util
import json

import warnings
warnings.filterwarnings("ignore")

# <span style='color:#ff5f27'> Retrieve Data </span>

### Retrieve Metadata

In [None]:
# If an API key was loaded into the settings (from <root_dir>/.env or the
# environment), export it as HOPSWORKS_API_KEY before logging in.
if settings.HOPSWORKS_API_KEY is not None:
    api_key = settings.HOPSWORKS_API_KEY.get_secret_value()
    os.environ['HOPSWORKS_API_KEY'] = api_key
project = hopsworks.login()
fs = project.get_feature_store()

# The location is read from the Hopsworks secret BIKES_LOCATION_JSON and parsed as JSON.
secrets = hopsworks.get_secrets_api()
location_str = secrets.get_secret("BIKES_LOCATION_JSON").value
location = json.loads(location_str)

country = location['country']
city = location['city']
latitude = location['latitude']
longitude = location['longitude']
today = date.today()
yesterday = today - timedelta(days=1)

print(f"City: {city}")
# Label fixed: it previously printed the misspelling "Contry".
print(f"Country: {country}")
print(f"Latitude: {latitude}")
print(f"Longitude: {longitude}")
print(f"Day: {today}")

### Connect to Feature Groups

In [None]:
# Look up version 1 of the bike and weather feature groups by name.
bikes_fg = fs.get_feature_group(name='bikes_trento', version=1)
weather_fg = fs.get_feature_group(name='weather_trento', version=1)

### Create Feature Views

In [None]:
# Query: three columns from the bikes feature group, joined on 'city' with
# the feature columns of the weather feature group.
bike_columns = bikes_fg.select(['id', 'bikes', 'date'])
weather_columns = weather_fg.select_features()
selected_features = bike_columns.join(weather_columns, on=['city'])

In [None]:
# Get version 1 of the 'bikes_fv' feature view, or create it from the
# `selected_features` query, with 'bikes' declared as the label column.
feature_view = fs.get_or_create_feature_view(
    name='bikes_fv',
    description="Features selected for bike predictions.",
    version=1,
    labels=['bikes'],
    query=selected_features,
)

In [None]:
# Fetch version 1 of the 'bikes_fv' feature view by name and display its schema.
feature_view = fs.get_feature_view(name="bikes_fv", version=1)
feature_view.schema

# <span style='color:#ff5f27'> Create Train/Test Datasets </span>

### Retrieve the Data

In [None]:
# First day of the test split: kept as an ISO date string and parsed into a
# datetime at midnight.
start_date_test_data = "2025-06-01"
test_start = datetime.fromisoformat(start_date_test_data)

In [None]:
# Split the feature view's data into train and test sets; `test_start` is
# passed as the start of the test period.
# NOTE(review): an earlier comment flagged this call as "the line that generates
# the error" — confirm that issue is resolved.

X_train, X_test, y_train, y_test = feature_view.train_test_split(
    test_start=test_start
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

### Remove Date and Parse Categories

In [None]:
# Drop the 'date' column from the model inputs (the unmodified X_train / X_test
# frames are kept, and X_test['date'] is used later for the comparison dataframe).
X_features = X_train.drop(columns=['date'])
X_test_features = X_test.drop(columns=['date'])

# Encode the categorical columns with ONE dtype shared by train and test.
# Calling .astype("category") on each frame separately infers the categories
# from that frame alone, so the same value can receive a different integer code
# in train and test whenever the two value sets differ (for example a weather
# code that occurs in only one of the periods). XGBoost versions that rely on
# the pandas category codes would then misinterpret the test rows.
for categorical_column in ('id', 'weather_code'):
    all_values = pd.concat(
        [X_features[categorical_column], X_test_features[categorical_column]],
        ignore_index=True,
    )
    shared_dtype = pd.CategoricalDtype(all_values.astype("category").cat.categories)
    X_features[categorical_column] = X_features[categorical_column].astype(shared_dtype)
    X_test_features[categorical_column] = X_test_features[categorical_column].astype(shared_dtype)

### Print Info

In [None]:
# Preview the first rows of the prepared training features.
X_features.head()

In [None]:
# Print the summary of the prepared training features.
X_features.info()

In [None]:
# Preview the first rows of the training labels.
y_train.head()

In [None]:
# Print the summary of the training labels.
y_train.info()

# <span style='color:#ff5f27'> Train the Model </span>

### Instantiate and Train an XGBoost Model

In [None]:
# Create the XGBoost regressor. enable_categorical=True is set because the
# feature frame contains pandas 'category' columns ('id', 'weather_code').
xgb_regressor = XGBRegressor(enable_categorical=True)

# Fit the regressor on the prepared training features and the training labels.
xgb_regressor.fit(X_features, y_train)

### Compute the Evaluation Metrics

In [None]:
# Predict on the prepared test features and score against the first column of y_test.
y_pred = xgb_regressor.predict(X_test_features)
y_true = y_test.iloc[:, 0]

# Mean squared error (sklearn)
mse = mean_squared_error(y_true, y_pred)
print("MSE:", mse)

# Coefficient of determination, R squared (sklearn)
r2 = r2_score(y_true, y_pred)
print("R squared:", r2)

### Create Comparison Dataframe

In [None]:
# Build the comparison dataframe on a COPY of the test labels. A plain
# `df = y_test` only creates a second name for the same object, so the columns
# added below would also be written into y_test, and re-running the cell would
# keep mutating the label frame.
df = y_test.copy()
df['predicted_bikes'] = y_pred
# 'date' and 'id' come from the original X_test frame, aligned on the index.
df['date'] = X_test['date']
df['id'] = X_test['id']
df = df.sort_values(by=['date'])
df.head(10)

### Create a directory for the model artifacts

In [None]:
# Local directory for the model artifacts, with an images/ subdirectory for plots.
model_dir = "trento_bikes_model"
images_dir = model_dir + "/images"
# makedirs creates both levels in one call, and exist_ok=True makes re-running
# the cell a no-op without a separate (race-prone) existence check.
os.makedirs(images_dir, exist_ok=True)

### Plot the Hindcast

In [None]:
# One row per distinct id in the comparison dataframe (first occurrence kept).
metadata = df['id'].drop_duplicates().to_frame()
metadata.head()

In [None]:
# Print the summary of the per-id metadata frame.
metadata.info()

In [None]:
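# NOTE: this hindcast plot is disabled. It appears to be carried over from an
# air-quality notebook: it looks up a 'street' column and calls
# util.plot_air_quality_forecast, whereas this notebook's `metadata` frame only
# has an 'id' column. It must be adapted before it can be re-enabled.
# for i in range(metadata.shape[0]):
#     street = metadata.iloc[i]['street']

#     file_path = images_dir + "/pm25_hindcast_" + street + ".png"
#     df_i = df[df['street'] == street]
#     plt = util.plot_air_quality_forecast(city, street, df_i, file_path, hindcast=True) 
#     plt.show()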

### Plotting Feature Importances

In [None]:
# Draw the feature-importance chart for the trained model, write it into the
# images directory, then display it.
feature_importance_path = f"{images_dir}/feature_importance.png"
plot_importance(xgb_regressor)
plt.savefig(feature_importance_path)
plt.show()

# <span style='color:#ff5f27'> Save the Model </span>

### Save the Model Locally

In [None]:
# Save the trained XGBoost regressor as a JSON file inside the local model directory.
xgb_regressor.save_model(model_dir + "/trento_bikes_xgboost_model.json")

### Get Model Registry

In [None]:
# Get the model registry handle from the Hopsworks project.
mr = project.get_model_registry()

### Push the Model to Hopsworks

In [None]:
# Evaluation metrics, converted to strings, to attach to the registered model.
res_dict = {
    metric_name: str(metric_value)
    for metric_name, metric_value in {"MSE": mse, "R squared": r2}.items()
}

In [None]:
# Create the model entry in the registry's Python-model API, attaching the
# evaluation metrics and the feature view the training data came from.
bikes_model = mr.python.create_model(
    name="trento_bikes_xgboost_model", 
    metrics= res_dict,
    feature_view=feature_view,
    description="Bikes Predictor for Trento",
)

In [None]:
# Save the model to the registry, passing the local directory that holds the
# serialized model and the images.
bikes_model.save(model_dir)