## **Machine learning notebook for FDD Project @ Vasakronan summer internship 2022**

In [30]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, explained_variance_score, d2_tweedie_score
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from pathlib import Path
# Root folder that create_energy_day_models() searches recursively for CSV files.
directory = './'

## Create machine learning models for the generated energy per day (one model per building CSV)

In [None]:
def create_energy_day_models():
    """Train and pickle one daily-energy random forest per building CSV.

    Recursively scans ``directory`` for ``*.csv`` files and skips every path
    containing ``'uppsala'``. Each remaining file is reduced to one row per
    calendar day, and a ``RandomForestRegressor`` is fitted to predict the
    daily ``energy`` from ``value``, ``month``, ``outdoorTemperature`` and
    ``humidity``. The fitted model is written to
    ``./model_<buildingId>_energy_per_day.pkl``.

    Files that are empty, or that yield fewer than two complete days (too
    few for a train/test split), are skipped instead of raising.

    Returns:
        None. Results are printed and persisted as pickle files.
    """
    for filepath in Path(directory).glob('**/*.csv'):
        print(filepath)
        if 'uppsala' in str(filepath):
            continue

        data = pd.read_csv(filepath, skipinitialspace=True)
        if data.empty:
            # No first row to read the building id from, and nothing to train on.
            continue
        # Taken before dropna() so the id always comes from the file's first row.
        building_id = data['buildingId'].iloc[0]

        data = data.dropna(how='any')
        data['date_time'] = pd.to_datetime(data['date_time'])
        # NOTE(review): temperature and humidity are summed per day rather than
        # averaged - confirm this is the intended daily feature.
        new_data = data.groupby(data.date_time.dt.date).agg({
            "value": "sum",
            "energy": "sum",
            "outdoorTemperature": "sum",
            "humidity": "sum",
            "month": "mean",
        })
        print(new_data)
        print(building_id)

        if len(new_data) < 2:
            # train_test_split needs at least one train and one test sample.
            continue

        x = new_data[["value", "month", "outdoorTemperature", "humidity"]]
        # 1-D target: scikit-learn warns when handed a column vector.
        y = new_data["energy"].to_numpy()

        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.1, random_state=13)

        # max_features=1.0 (all features) is what "auto" meant for regressors;
        # the "auto" spelling is no longer accepted by current scikit-learn.
        model = RandomForestRegressor(
            max_features=1.0,
            bootstrap=True,
            max_depth=80,
            min_samples_leaf=5,
            min_samples_split=12,
            n_estimators=100,
        )
        model.fit(x_train, y_train)

        print(model.score(x_test, y_test))

        # Context manager guarantees the pickle file is flushed and closed.
        with open(f'./model_{building_id}_energy_per_day.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)


create_energy_day_models()

In [None]:

def create_effect_hour_models():
    """Fit a random forest on ``./data_biomedit1.csv`` and pickle it.

    Features are ``value`` (scaled by 335), ``month``, ``humidity`` and
    ``outdoorTemperature``; the target is ``energy``. 20 % of the rows are
    held out (unseeded split, so results differ between runs). The test
    score is printed and the model is saved to ``model_biomedi1v2_forest.pkl``.

    Returns:
        tuple: ``(model, preds, y_test)`` - the fitted regressor, its
        predictions for the held-out rows, and the held-out targets as an
        ``(n, 1)`` array.
    """
    data = pd.read_csv('./data_biomedit1.csv')
    print(data)
    # NOTE(review): 335 is an unexplained scale factor - confirm its meaning
    # before reusing this model on other data.
    data['value'] = data['value'] * 335
    x = data[["value", "month", 'humidity', 'outdoorTemperature']]
    # Kept as a column vector so the returned y_test keeps its (n, 1) shape.
    y = np.asarray(data[["energy"]])

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20)

    # Alternative tried earlier:
    # model = MLPRegressor(activation = 'tanh', alpha = 0.05, early_stopping = True, hidden_layer_sizes = (400, 1000, 1200), learning_rate = 'constant', max_iter = 10000, solver = 'adam', verbose=True)

    # max_features=1.0 (all features) is what "auto" meant for regressors;
    # the "auto" spelling is no longer accepted by current scikit-learn.
    model = RandomForestRegressor(
        max_features=1.0,
        bootstrap=True,
        max_depth=500,
        min_samples_leaf=5,
        min_samples_split=12,
        n_estimators=100,
    )

    # Flatten only for fitting: avoids the column-vector conversion warning.
    model.fit(x_train, y_train.ravel())

    print(model.score(x_test, y_test))
    # Context manager guarantees the pickle file is flushed and closed.
    with open('model_biomedi1v2_forest.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    preds = model.predict(x_test)
    return model, preds, y_test


model, preds, y_test = create_effect_hour_models()

In [None]:
plt.rcParams['figure.figsize'] = (30, 10)
# loss_curve_ only exists on iteratively trained estimators such as
# MLPRegressor. The RandomForestRegressor built by create_effect_hour_models()
# has none, so guard the plot instead of raising AttributeError.
if hasattr(model, 'loss_curve_'):
    plt.plot(model.loss_curve_)
    plt.title("Loss Curve", fontsize=14)
    plt.xlabel('Iterations')
    plt.ylabel('Cost')
    plt.show()
else:
    print(f"{type(model).__name__} has no loss_curve_; skipping the loss-curve plot.")

In [None]:

# Plot the first 70 held-out samples (or fewer when the test set is smaller)
# together with a +/-10 % band around the model predictions.
window = min(70, len(preds))
shown_preds = np.asarray(preds[:window])
preds_up = shown_preds * 1.10
preds_down = shown_preds * 0.90
# The x axis must have exactly as many points as the band, otherwise
# fill_between raises on short test sets.
x = np.arange(window)
plt.rcParams['figure.figsize'] = (30, 10)
fig, ax = plt.subplots()
ax.plot(y_test[:window], label='Actual values')
ax.plot(shown_preds, 'r', label='ML prediction')
ax.fill_between(x, preds_up, preds_down, color="gray", alpha=0.5, label='20% interval around ML preds')
leg = ax.legend()
plt.show()

In [None]:
# D^2 Tweedie score of the held-out predictions returned by
# create_effect_hour_models().
print(d2_tweedie_score(y_true=y_test, y_pred=preds))

In [None]:
data = pd.read_csv('./sthlm_csv_test2.csv')
data = data.dropna(how='any')


def mlp_model(X, Y):
    """Grid-search an ``MLPRegressor`` and cross-validate the best setup.

    A 5-fold ``GridSearchCV`` (negative MSE) is run over layer sizes,
    activation, ``alpha`` and ``learning_rate``. A fresh regressor built
    from *all* winning parameters is then evaluated with 10-fold
    ``cross_validate``.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        Y: Target values. A single-column 2-D array is flattened to 1-D.

    Returns:
        The dict returned by ``cross_validate``, requested with train
        scores and fitted estimators for the ``abs_error``,
        ``squared_error`` and ``r2`` metrics.
    """
    # A column-vector target makes scikit-learn emit a conversion warning on
    # every fit; flatten it once up front.
    Y = np.asarray(Y)
    if Y.ndim == 2 and Y.shape[1] == 1:
        Y = Y.ravel()

    param_grid = {
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100, 1)],
        'activation': ['relu', 'tanh', 'logistic'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant', 'adaptive'],
        'solver': ['adam'],
    }

    gsc = GridSearchCV(
        MLPRegressor(),
        param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    grid_result = gsc.fit(X, Y)

    best_params = grid_result.best_params_
    print(best_params)

    # Pass every tuned parameter through: previously alpha and learning_rate
    # were searched but silently dropped from the final model.
    best_mlp = MLPRegressor(**best_params, max_iter=5000, n_iter_no_change=200)

    scoring = {
        'abs_error': 'neg_mean_absolute_error',
        'squared_error': 'neg_mean_squared_error',
        'r2': 'r2',
    }

    scores = cross_validate(
        best_mlp, X, Y, cv=10, scoring=scoring,
        return_train_score=True, return_estimator=True)
    return scores


mlp_model(data[["irradiance"]][:100],
          np.asarray(data[["generated_energy"]][:100]))