In [1]:
import pandas as pd
import numpy as np
from sqlalchemy.orm import session
from sqlalchemy import create_engine

from datetime import datetime, timedelta
import pytz # new import

import gc

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

# Data preparation

In [None]:
# Preview the first rows of the raw measurements.
# NOTE(review): `data` is not assigned in any cell visible before this one — the
# cell that loads it appears to be missing; confirm the notebook still runs with
# Restart Kernel -> Run All.
data.head()

In [None]:
# Keep only the rows that belong to the five devices used in this notebook.
# `isin` expresses "any of these names" directly. The original combined the
# equality masks with XOR (`^`), which selected the same rows only because a
# row can match at most one of the names.
data = data[data["device_name"].isin(["Interruptor Principal", "Lavaloza",
                                      "Tablero B", "Tablero E", "Tablero C"])]

In [None]:
# Discard every measurement taken before 2018-04-01 00:00 (Mexico/General time).
data = data.loc[
    data["measurement_time(UTC)"].ge(pd.to_datetime('2018-04-01').tz_localize('Mexico/General'))
]
data.head()

In [None]:
# Convert the timestamps to Mexico time.
# `.dt.tz_convert` converts the whole column in one vectorised call instead of
# invoking `astimezone` row by row through `apply`.
# NOTE(review): this requires a timezone-aware datetime column — confirm the dtype.
data["measurement_time(UTC)"] = data["measurement_time(UTC)"].dt.tz_convert("Mexico/General")
data.head()

In [None]:
# The timestamps were converted to Mexico time above, so relabel the column to match.
data = data.rename(columns={"measurement_time(UTC)": "measurement_time(MX)"})
data.head()

In [None]:
# Derive calendar/time-of-day columns from the local timestamp.
# The `.dt` accessor computes each field for the whole column at once, replacing
# the five row-wise `apply` lambdas. `assign` returns a new frame (no writes into
# a filtered slice) and appends the columns in the order given, which matches the
# original order: hour, minute, time, weekday, month.
data = data.assign(
    hour=lambda df: df["measurement_time(MX)"].dt.hour,
    minute=lambda df: df["measurement_time(MX)"].dt.minute,
    # Fractional hour of the day, e.g. 13:30 -> 13.5.
    time=lambda df: df["hour"] + df["minute"] / 60,
    weekday=lambda df: df["measurement_time(MX)"].dt.weekday,
    month=lambda df: df["measurement_time(MX)"].dt.month,
)
data.head()

In [None]:
# Split the measurements in two: rows of "Interruptor Principal" go to `main_df`,
# rows of every other device go to `devices_df`.
main_df = data.loc[data["device_name"].eq("Interruptor Principal")]
devices_df = data.loc[data["device_name"].ne("Interruptor Principal")]
# The combined frame is no longer needed; release it before the heavier steps.
del data
gc.collect()

In [None]:
# Preview the "Interruptor Principal" rows.
main_df.head()

In [None]:
# Preview the rows of the remaining devices.
devices_df.head()

In [None]:
# Remove the measurement and time columns that are not needed for the device targets.
# The result is rebound instead of using `inplace=True`: `devices_df` was obtained
# by boolean slicing, and mutating such a frame in place can raise
# SettingWithCopyWarning (hidden here by the global warnings filter).
devices_df = devices_df.drop(columns=["power(W)", "energy(Wh)", "power_factor", "voltage(V)",
                                      "hour", "time", "weekday", "month"])

In [None]:
# Check which columns remain after the drop.
devices_df.head()

In [None]:
# Reshape to one row per timestamp and one column per device, holding the mean
# current; timestamps where a device has no reading are filled with 0.
devices_df = (
    devices_df
    .pivot_table(index="measurement_time(MX)", columns="device_name",
                 values="current(A)", aggfunc="mean")
    .reset_index()
    .sort_values(by="measurement_time(MX)", ascending=True)
    .fillna(0)
)

In [None]:
# Preview the pivoted per-device current table.
devices_df.head()

In [None]:
# Preview `main_df` again; its columns from the third onward become the model features below.
main_df.head()

# Building the model

In [None]:
# Devices whose current is predicted; one model per device is trained below.
devices_to_train = ["Lavaloza", "Tablero B", "Tablero C", "Tablero E"]
# Model inputs: every column of `main_df` except the first two.
# NOTE(review): the selection is positional, so it silently changes if the column
# order of `main_df` changes — confirm which two columns are being skipped.
features = main_df.columns[2:].tolist()

In [None]:
# Show the list of feature column names selected above.
features

In [None]:
# Join the main-device rows with the per-device currents on the shared timestamp
# (default inner join, so only timestamps present in both frames are kept).
model_df = main_df.merge(devices_df, on="measurement_time(MX)")
# Both inputs are now contained in `model_df`; free them.
del main_df, devices_df
gc.collect()

In [None]:
# Inputs come from the feature columns, targets are the per-device currents.
X = model_df[features]
y = model_df[devices_to_train]

# Seeded random split (default train/test proportions).
# NOTE(review): rows are time-ordered measurements; a random split mixes past and
# future samples — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the min-max scaling on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Inspect the "Lavaloza" training target reshaped into a single-column 2-D array.
# NOTE(review): looks like a leftover inspection cell; nothing is assigned here.
y_train["Lavaloza"].values.reshape(-1,1)

## Gradient Boosting Regressor

In [None]:
# Hyper-parameter grid explored for the gradient boosting regressor.
param_grid = {'n_estimators': [30, 40, 50, 60],
              'max_depth': [4, 5, 6],
              'learning_rate': [0.01, 0.1, 0.3]}
# Fixed seed (same value as the train/test split) so repeated runs are comparable.
grid = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, verbose=2, n_jobs=-1)

# Fit the grid search estimator: it trains a GradientBoostingRegressor for each
# combination of parameters and keeps the best one. Only the "Lavaloza" target
# is used for tuning.
# The target is passed as a 1-D array; a single-column 2-D array makes
# scikit-learn emit a DataConversionWarning (hidden by the warnings filter).
grid.fit(X_train_scaled, y_train["Lavaloza"].values)

In [None]:
# Report the parameter combination selected by the grid search fitted above.
print('The best parameters to use are: ', grid.best_params_)

In [None]:
# Train one gradient boosting model per device and print its score (R^2) on the
# training split followed by the test split.
# NOTE(review): the hyper-parameters are hard-coded (n_estimators=100 is outside
# the searched grid) rather than read from `grid.best_params_` — confirm that
# this is intentional.
model_list_gbr = []
for column in devices_to_train:
    # 1-D targets, computed once per device. Dedicated names keep the `y`
    # DataFrame defined earlier from being overwritten.
    target_train = y_train[column].values
    target_test = y_test[column].values
    # Seeded for reproducible fits.
    gbr = GradientBoostingRegressor(max_depth=5, n_estimators=100, learning_rate=0.1,
                                    random_state=42).fit(X_train_scaled, target_train)
    model_list_gbr.append(gbr)
    print("Model for {}".format(column))
    print(gbr.score(X_train_scaled, target_train))
    print(gbr.score(X_test_scaled, target_test))

## Multilayer Perceptrons

In [None]:
# Hyper-parameter grid explored for the multilayer perceptron: activation
# function and one, two or three hidden layers of 100 units each.
param_grid = {'activation': ["relu", "tanh"],
              'hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)]
             }
# MLP training is stochastic (weight initialisation, shuffling); seed it so the
# search gives the same result on every run.
grid = GridSearchCV(MLPRegressor(random_state=42), param_grid, verbose=2, n_jobs=-1)

# Fit the grid search estimator: it trains an MLPRegressor for each combination
# of parameters and keeps the best one. Only the "Lavaloza" target is used for
# tuning.
# The target is passed as a 1-D array; a single-column 2-D array makes
# scikit-learn emit a DataConversionWarning (hidden by the warnings filter).
grid.fit(X_train_scaled, y_train["Lavaloza"].values)

In [None]:
# Report the parameter combination selected by the most recently fitted grid search.
print('The best parameters to use are: ', grid.best_params_)

In [None]:
# Train one multilayer perceptron per device and print its score (R^2) on the
# training split followed by the test split.
model_list_mlp = []
# Iterate `devices_to_train` (the columns `y_train` was built from) so the model
# order matches the gradient boosting loop and the later zip over the device names.
for column in devices_to_train:
    # 1-D targets, computed once per device. Dedicated names keep the `y`
    # DataFrame defined earlier from being overwritten.
    target_train = y_train[column].values
    target_test = y_test[column].values
    # Seeded because MLP fitting is stochastic.
    mlp = MLPRegressor(activation="tanh", hidden_layer_sizes=(100, 100, 100),
                       random_state=42).fit(X_train_scaled, target_train)
    model_list_mlp.append(mlp)
    print("Model for {}".format(column))
    print(mlp.score(X_train_scaled, target_train))
    print(mlp.score(X_test_scaled, target_test))
    print("\n")

# Create CSV

In [None]:
# Scale the features of every row (training and test rows alike) with the
# scaler fitted on the training split.
features_predict = scaler.transform(model_df[features])

# Add two prediction columns per device: "<device>_pred_mlp" and "<device>_pred_gbr".
# The model lists are paired with the device names by position.
for device, model_mlp, model_gbr in zip(devices_to_train, model_list_mlp, model_list_gbr):
    model_df[device + "_pred_mlp"] = model_mlp.predict(features_predict)
    model_df[device + "_pred_gbr"] = model_gbr.predict(features_predict)
# Show a preview rather than the whole frame, as the other cells do.
model_df.head()

In [None]:
# Write the features, targets and predictions to "predicciones.csv" in the
# current working directory.
# NOTE(review): `index` is left at its default, so the row index is written as
# the first column — confirm that this is wanted.
model_df.to_csv("predicciones.csv")

In [None]:
# Display the fitted MLP models.
# NOTE(review): looks like a leftover inspection cell; nothing is assigned here.
model_list_mlp

In [None]:
# Display the feature column names used by the models.
# NOTE(review): looks like a leftover inspection cell; nothing is assigned here.
features