## Imports and installations

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
import numpy as np

In [None]:
!pip install hopsworks

In [None]:
!pip install bayesian-optimization

## Hopsworks

In [None]:
import hopsworks

# Log in to Hopsworks; the returned project handle is reused for the feature
# store here and for the model registry later in the notebook.
project = hopsworks.login()

# Feature store handle used to look up the feature view.
fs = project.get_feature_store() 

In [None]:
# Look up version 1 of the 'air_quality_fv' feature view in the feature store.
feature_view = fs.get_feature_view(
    name = 'air_quality_fv',
    version = 1
)

In [None]:
# Fetch the train/test features and labels from the feature view.
# NOTE(review): the positional 1 is presumably the training-dataset version —
# confirm against the hsfs API.
trainX,testX,trainY,testY = feature_view.get_train_test_split(1)

## Data Cleaning

In [None]:
def cleanData(X,Y):
    """Drop the unused columns from X and remove rows with missing features.

    Args:
        X: Feature DataFrame. Any of the columns "date", "pm25", "pm10",
            "weathercode", "no2", "sunrise" and "sunset" that are present
            are removed; the input frame itself is not modified.
        Y: Labels sharing X's index, filtered with the same row mask so
            features and labels stay aligned.

    Returns:
        A tuple ``(X, Y)`` containing only the rows in which every remaining
        feature column is non-null.
    """
    unused_columns = ["date","pm25","pm10","weathercode","no2","sunrise","sunset"]
    # errors="ignore" drops whichever of these columns exist. The previous
    # bare `except:` meant that a single missing column left *all* of them in
    # place (and hid any unrelated error behind a generic message).
    X = X.drop(columns=unused_columns, errors="ignore")

    # True for rows that contain at least one missing feature value.
    has_missing = X.isna().any(axis=1)
    return X[~has_missing], Y[~has_missing]

In [None]:
# Clean the training split; features and labels are filtered with the same
# row mask so they stay aligned.
trainX,trainY = cleanData(trainX,trainY)
trainX.head()

## Transformation

### Train

In [None]:
# Fit the scaler on the training features only; the same fitted scaler is
# reused to transform the test features later on.
scaler = StandardScaler()
scaledTrainX = scaler.fit_transform(trainX)

# Show the fitted per-feature mean and variance, then the scaled matrix and
# its shape.
for _shown in (scaler.mean_, scaler.var_, scaledTrainX, scaledTrainX.shape):
    print(_shown)

In [None]:
# Flatten the training labels to a 1-D array before they are passed to fit().
trainY = np.ravel(trainY)

### Test

In [None]:
# Apply the same cleaning to the test split, then scale it with the scaler
# that was fitted on the training data (transform only, no refit).
testX,testY = cleanData(testX,testY)
testX = scaler.transform(testX)
# Flatten the test labels to a 1-D array, matching trainY.
testY = np.ravel(testY)

## Modeling

### GradientBoostingRegressor

In [None]:
# Baseline gradient-boosting regressor with default hyperparameters, fitted on
# the scaled training features.
gb = GradientBoostingRegressor()
gb.fit(scaledTrainX, trainY)

# A notebook cell only echoes its last expression, so the R^2 score has to be
# printed explicitly; previously it was computed and silently discarded.
print("R^2:", gb.score(testX, testY))

gbPred = gb.predict(testX)
# Round predictions to the nearest integer before measuring the error.
gbRpred = np.rint(gbPred)
mean_squared_error(testY, gbRpred)

### RandomForestRegressor

In [None]:
# Shallow random forest (max_depth=2) with a fixed seed, fitted on the scaled
# training features.
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(scaledTrainX, trainY)

# A notebook cell only echoes its last expression, so the R^2 score has to be
# printed explicitly; previously it was computed and silently discarded.
print("R^2:", rf.score(testX, testY))

rfPred = rf.predict(testX)
# Round predictions to the nearest integer before measuring the error.
rfRpred = np.rint(rfPred)
mean_squared_error(testY, rfRpred)

### Neural network (MLPRegressor)

In [None]:
# Multi-layer perceptron regressor with a fixed seed and up to 500 training
# iterations, fitted on the scaled training features.
NN = MLPRegressor(random_state=1, max_iter=500)
NN.fit(scaledTrainX, trainY)

# A notebook cell only echoes its last expression, so the R^2 score has to be
# printed explicitly; previously it was computed and silently discarded.
print("R^2:", NN.score(testX, testY))

nnPred = NN.predict(testX)
# Round predictions to the nearest integer before measuring the error.
nnRpred = np.rint(nnPred)
mean_squared_error(testY, nnRpred)

## Hyperparameter tuning

In [None]:
from bayes_opt import BayesianOptimization

def rf_evaluate(n_estimators, max_depth):
    """Objective function for Bayesian optimisation of a random forest.

    Args:
        n_estimators: Proposed number of trees; truncated to an int because
            the optimiser proposes floats.
        max_depth: Proposed maximum tree depth; truncated to an int.

    Returns:
        The negated mean squared error of the rounded predictions on a
        validation split held out from the training data. Negated so that
        maximising the objective minimises the error.
    """
    # Local import keeps this cell runnable on its own.
    from sklearn.model_selection import train_test_split

    # Score candidates on a split carved out of the training data. Scoring on
    # testX/testY (as before) selects hyperparameters on the test set and so
    # leaks it into model selection. The fixed random_state keeps the split,
    # and therefore the objective, identical across calls.
    fitX, valX, fitY, valY = train_test_split(
        scaledTrainX, trainY, test_size=0.2, random_state=0
    )

    # Seeded so that the same hyperparameters always give the same score.
    rf = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        random_state=0,
    )
    rf.fit(fitX, fitY)

    # Round predictions to the nearest integer before measuring the error,
    # as the baseline cells do.
    rounded = np.rint(rf.predict(valX))
    return -mean_squared_error(valY, rounded)

# Search n_estimators in [50, 500] and max_depth in [3, 400]. The optimiser is
# seeded so that repeated runs explore the same sequence of candidates.
bo = BayesianOptimization(
    f=rf_evaluate,
    pbounds={'n_estimators': (50, 500), 'max_depth': (3, 400)},
    random_state=0,
)
# Perform the optimization
bo.maximize()

# bo.max holds both the best target and its parameters; print each under its
# own label instead of dumping the whole dict as "Best parameters".
best = bo.max
print("Best parameters: {}".format(best['params']))
print("Best score: {:.2f}".format(best['target']))

In [None]:
from bayes_opt import BayesianOptimization

def gb_evaluate(n_estimators, max_depth):
    """Objective function for Bayesian optimisation of gradient boosting.

    Args:
        n_estimators: Proposed number of boosting stages; truncated to an int
            because the optimiser proposes floats.
        max_depth: Proposed maximum depth of each tree; truncated to an int.

    Returns:
        The negated mean squared error of the rounded predictions on a
        validation split held out from the training data. Negated so that
        maximising the objective minimises the error.
    """
    # Local import keeps this cell runnable on its own.
    from sklearn.model_selection import train_test_split

    # Score candidates on a split carved out of the training data. Scoring on
    # testX/testY (as before) selects hyperparameters on the test set and so
    # leaks it into model selection. The fixed random_state keeps the split,
    # and therefore the objective, identical across calls.
    fitX, valX, fitY, valY = train_test_split(
        scaledTrainX, trainY, test_size=0.2, random_state=0
    )

    # Seeded so that the same hyperparameters always give the same score.
    gb = GradientBoostingRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        random_state=0,
    )
    gb.fit(fitX, fitY)

    # Round predictions to the nearest integer before measuring the error,
    # as the baseline cells do.
    rounded = np.rint(gb.predict(valX))
    return -mean_squared_error(valY, rounded)

# Search n_estimators in [50, 1000] and max_depth in [3, 100]. The optimiser
# is seeded so that repeated runs explore the same sequence of candidates.
bo = BayesianOptimization(
    f=gb_evaluate,
    pbounds={'n_estimators': (50, 1000), 'max_depth': (3, 100)},
    random_state=0,
)
# Perform the optimization
bo.maximize()

# bo.max holds both the best target and its parameters; print each under its
# own label instead of dumping the whole dict as "Best parameters".
best = bo.max
print("Best parameters: {}".format(best['params']))
print("Best score: {:.2f}".format(best['target']))

## Model Registry

In [None]:
# Sanity-check the fitted random forest on the scaled test features.
# The original call passed `xx`, which is not defined anywhere in the visible
# notebook and therefore raised NameError.
rf.predict(testX)

In [None]:
# Model registry handle obtained from the logged-in project.
mr = project.get_model_registry()

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Build the model's input/output schema from the test features and labels.
# testX at this point is the scaled array returned by scaler.transform.
input_schema = Schema(testX)
output_schema = Schema(testY)
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Echo the schema as a dict for inspection.
model_schema.to_dict()

In [None]:
import joblib

# Serialise the fitted gradient-boosting model to 'model.pkl'; the same path
# is passed to model.save() when the model is registered.
joblib.dump(gb, 'model.pkl')

In [None]:
# Compute the reported metric from the model that is actually being
# registered, using the same rounded-prediction MSE as the baseline cell.
# A hard-coded value silently goes stale whenever the data or the fit changes.
gbTestMse = mean_squared_error(testY, np.rint(gb.predict(testX)))

# NOTE(review): input_example is sampled from the unscaled trainX, whereas gb
# was fitted on scaledTrainX and the schema is derived from the scaled testX.
# Confirm whether the example should be scaled as well.
model = mr.sklearn.create_model(
    name="gradient_boost_model",
    # Kept as a two-decimal string, the same format as the previous value.
    metrics={"MSE": f"{gbTestMse:.2f}"},
    description="Gradient Boost Regressor without hyperparameter tuning",
    input_example=trainX.sample().to_numpy(),
    model_schema=model_schema
)

# Upload the serialised model file to the registry.
model.save('model.pkl')