In [None]:
import os
import pandas as pd
import numpy as np
import re
import subprocess
import sys

import plotly.express as px
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler

module_path = re.sub(r'Notebooks','Python Scripts',os.getcwd())
sys.path.append(module_path)
from performance_helper import *
from ml_helper import *

# from warnings import simplefilter
# from sklearn.exceptions import ConvergenceWarning
# simplefilter("ignore", category=ConvergenceWarning)

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

In [None]:
def main(year, month, file, gap_length):
    """Validate the user's selection and load the matching cleaned data file.

    Parameters
    ----------
    year : str
        Exactly four digits, e.g. ``"2022"``.
    month : str
        Must contain at least three consecutive ASCII letters, e.g. ``"jul"``.
    file : str
        Non-empty substring of one of ``Irradiance``, ``Deger`` or ``Fixed``.
    gap_length : int
        Not used here; returned unchanged so the caller can unpack it.

    Returns
    -------
    tuple
        ``(df, file, year, month, gap_length)`` where ``df`` is the result of
        ``df_cleaner`` applied to the first path left after filtering.

    Raises
    ------
    ValueError
        If ``year``, ``month`` or ``file`` fails validation. ``ValueError`` is a
        subclass of ``Exception``, so existing ``except Exception`` handlers
        keep working.
    """
    valid_files = ('Irradiance', 'Deger', 'Fixed')

    # fullmatch: a bare search would accept e.g. "abc2022xyz" or "20223".
    if not re.fullmatch(r'\d{4}', year):
        raise ValueError(f"Incorrect Input: {year}")
    if not re.search(r'[A-Za-z]{3}', month):
        raise ValueError(f"Incorrect Input: {month}")
    # Plain substring test instead of interpolating user text into a regex:
    # an empty string or a metacharacter such as "." used to match every option.
    if not file or not any(file in option for option in valid_files):
        raise ValueError(f"Incorrect Input: {file}")

    path_list = get_file_paths(file)
    path_list = path_function_extended(year, month, None, None, path_list)
    # Only the first matching path is cleaned and returned.
    df = df_cleaner([path_list[0]], file)
    return df, file, year, month, gap_length

In [None]:
def process_validation_df(test_df, file, gap_length):
    """Prepare the validation frames and the gap lookup for one data file.

    Chains three helpers in order: ``missing_value_simulation``,
    ``missing_data_beg_end_month`` and ``map_nan_gaps_indexes``.

    Parameters
    ----------
    test_df : object
        Frame handed to ``missing_value_simulation``.
    file : str
        Forwarded to ``missing_data_beg_end_month``.
    gap_length : int
        Forwarded to ``missing_value_simulation``.

    Returns
    -------
    tuple
        ``(test_df, base_df, test_gaps)`` as produced by the helpers.
    """
    gapped_df, reference_df = missing_value_simulation(test_df, gap_length)
    gapped_df, reference_df, month_label = missing_data_beg_end_month(
        gapped_df, reference_df, file
    )
    gap_lookup = map_nan_gaps_indexes(gapped_df, month_label)
    return gapped_df, reference_df, gap_lookup

In [None]:
# = load all the Data = #
# The prompts are asked in keyword order: year, month, file, gap size.
test_df, file, year, month, gap_length = main(
    year=input("Year (format: YYYY): "),
    month=input("Month (format: jul): "),
    file=input("File (opt: Irradiance/Deger/Fixed): "),
    gap_length=int(input("Gap size (seconds): ")),
)

# = load test data = #
# Rebinds test_df to the frame returned by process_validation_df.
test_df, base_df, test_gaps = process_validation_df(
    test_df=test_df, file=file, gap_length=gap_length
)

In [None]:
# Build the support-file location by swapping "Notebooks" for "Support Files/"
# in the current working directory path.
datapath = re.sub("Notebooks", "Support Files/", os.getcwd())

ml_features_raw = pd.read_csv(datapath + 'ml_features.csv', index_col=0)
ml_features_reshaped = reshape_ml(test_df, ml_features_raw)

# = if reshaping = #
ml_features_resampled = resample_ml(test_df, ml_features_reshaped, freq='20s')
ml_df = interpolate(ml_features_resampled)

# Filled later with one entry per imputation method.
imputation_dict = {}

In [None]:
# px.scatter(test_df, y='DirectIR').show()

In [None]:
# px.scatter(ml_df, y='Direct Shortwave Radiation (W/m²) (sfc)').show()

In [None]:
import neptune
import neptune.integrations.sklearn as npt_utils

def MLP_single(df, ml_df, df_copy = base_df.copy(), test_gaps=test_gaps):
    """Fit a Lasso-filtered MLP for the ``Temperature`` column and impute its gaps.

    Steps, for the ``Temperature`` column only:

    1. Add ``Seconds`` (seconds since midnight) and ``Day`` (day of month)
       columns to ``ml_df`` (done in place on the object passed in).
    2. Split features/target 80/20 with ``random_state=42``.
    3. Fit a ``MinMaxScaler`` on the training split only and reuse it for the
       test split and for the gap rows.
    4. Keep the features whose ``Lasso(alpha=0.1)`` coefficient is non-zero.
    5. Train an ``MLPRegressor`` and log its parameters and summary to Neptune.
    6. If ``test_gaps`` is truthy, predict the rows listed in ``test_gaps[col]``
       and write the predictions into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements containing the target column; modified in place with the
        imputed values.
    ml_df : pandas.DataFrame
        Feature frame whose index holds datetime-like values.
    df_copy : pandas.DataFrame
        Reference frame passed to ``calculate_imputation_errors``. The default
        is captured from the global ``base_df`` when this cell is executed.
    test_gaps : mapping
        Maps a column name to the row positions to impute. The default is
        captured from the global ``test_gaps`` when this cell is executed.

    Returns
    -------
    dict or None
        ``calculate_imputation_errors(df, df_copy, test_gaps).to_dict()`` when
        ``test_gaps`` is truthy, otherwise ``None``.
    """
    ml_df['Seconds'] = [(time - time.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds() for time in ml_df.index]
    ml_df['Day'] = [d.day for d in ml_df.index]

    for col in df.columns:

        if col != 'Temperature':
            continue

        test_ml_df = ml_df.copy()
        test_ml_df[col] = df[df.index.isin(ml_df.index)][col]

        features = test_ml_df.drop([col], axis=1)
        X = features.to_numpy()
        y = test_ml_df[col].to_numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only; re-fitting on the test
        # split would scale it with different min/max values than the model saw.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        lasso = Lasso(alpha=0.1) # range of lasso (0.001 to 10)
        lasso.fit(X_train, y_train)

        selected_indices = np.where(lasso.coef_ != 0)[0]

        # Test the size, not .any(): an array holding only index 0 is falsy
        # under .any() even though one feature was selected.
        if selected_indices.size == 0:
            print(f'No features meet current criteria for: {col}')
            continue

        X_train_selected = X_train[:, selected_indices]
        X_test_selected = X_test[:, selected_indices]

        parameters = {
            "hidden_layer_sizes": (60,85),
            "activation": "tanh",
            "solver": "adam",
            "learning_rate_init": 0.001,
            "max_iter": 300,
            "alpha": 0.0001,
            "beta_1": 0.9,
            "beta_2": 0.999,
            "epsilon": 1e-8,
            # Fixed seed so repeated runs of the cell are comparable.
            "random_state": 42,
        }

        # == Model == #
        mlp = MLPRegressor(**parameters)
        mlp.fit(X_train_selected, y_train)

        # == Experiment tracking == #
        # The token is read from the environment instead of being committed with
        # the notebook. The token that used to be hard-coded here should be
        # revoked, since it has been stored in the file history.
        run = neptune.init_run(
            project="ethanmasters/PV-Solar-MLP",
            api_token=os.environ.get("NEPTUNE_API_TOKEN"),
            name=f"MLP-{col}",
            tags=["MLPRegressor", "regression", col],
        )
        try:
            run["parameters"] = parameters
            run["mlp_summary"] = npt_utils.create_regressor_summary(mlp, X_train_selected, X_test_selected, y_train, y_test)
        finally:
            # Always close the run, even if building the summary fails.
            run.stop()

        if test_gaps:
            # NOTE(review): assumes test_gaps[col] holds positional row indices
            # valid for both df and ml_df - TODO confirm.
            gap_rows = test_gaps[col]
            # Scale with the already-fitted scaler (all features), then keep the
            # Lasso-selected columns, mirroring how the training data was built.
            imputation_values = scaler.transform(features.iloc[gap_rows, :].to_numpy())[:, selected_indices]
            prediction = mlp.predict(imputation_values)

            print(prediction)

            # Single positional assignment; the previous chained form
            # df.iloc[...][col] = ... wrote into a temporary copy.
            df.iloc[gap_rows, df.columns.get_loc(col)] = prediction

    if test_gaps:
        error_df = calculate_imputation_errors(df, df_copy, test_gaps)
        return error_df.to_dict()
# Run on copies of the loaded frames; the cell displays the returned value.
MLP_single(df=test_df.copy(), ml_df=ml_df.copy())

# imputation_dict['MLP'] = MLP_single(test_df.copy(),ml_df.copy())
# print(imputation_dict)

In [None]:
def MLP_multi(df, ml_df, df_copy = base_df.copy(), test_gaps=test_gaps):
    """Train one MLP per known target column, report diagnostics and impute gaps.

    For every column of ``df`` that has an entry in the settings table below,
    the function drops that target's excluded features, splits 80/20
    (``random_state=42``), scales with a ``MinMaxScaler`` fitted on the training
    split, trains an ``MLPRegressor`` with the target's architecture, prints
    model diagnostics and shows the loss curve. Columns without an entry are
    skipped with a message.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements; modified in place with imputed values when ``test_gaps``
        is truthy.
    ml_df : pandas.DataFrame
        Feature frame whose index holds datetime-like values. ``Seconds`` and
        ``Day`` columns are added to it in place.
    df_copy : pandas.DataFrame
        Reference frame passed to ``calculate_imputation_errors``. The default
        is captured from the global ``base_df`` when this cell is executed.
    test_gaps : mapping
        Maps a column name to the row positions to impute. The default is
        captured from the global ``test_gaps`` when this cell is executed.

    Returns
    -------
    dict or None
        ``calculate_imputation_errors(df, df_copy, test_gaps).to_dict()`` when
        ``test_gaps`` is truthy, otherwise ``None``.
    """
    wind_speed = 'Wind Speed (km/h) (10 m)'
    wind_direction = 'Wind Direction (°) (10 m)'
    cloud_cover = 'Cloud Cover Total (%) (sfc)'
    temperature = 'Temperature (°C) (2 m elevation corrected)'
    irradiance_drops = [wind_speed, wind_direction, cloud_cover, 'Day']

    # target column -> (features to drop, hidden layer sizes, activation)
    target_settings = {
        'GlobalIR': (irradiance_drops, (25,25,25), 'tanh'),
        'DirectIR': (irradiance_drops, (70,95), 'tanh'),
        'DiffuseIR': (irradiance_drops + [temperature], (60,85), 'tanh'),
        'Temperature': (irradiance_drops, (60,85), 'tanh'),
        'WindSpeed': ([wind_direction, cloud_cover, 'Day', temperature, 'Seconds'], (70,95), 'relu'),
    }

    ml_df['Seconds'] = [(time - time.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds() for time in ml_df.index]
    ml_df['Day'] = [d.day for d in ml_df.index]

    for col in df.columns:

        # Columns without settings used to fall through every `if` and either
        # raised NameError or silently reused the previous column's model.
        if col not in target_settings:
            print(f'No MLP configuration for: {col}')
            continue

        dropped_features, hidden_layers, activation = target_settings[col]

        print(f'\n\nFor target variable: {col}\n')

        test_ml_df = ml_df.copy()
        test_ml_df[col] = df[df.index.isin(ml_df.index)][col]
        test_ml_df = test_ml_df.drop(dropped_features, axis=1)

        features = test_ml_df.drop([col], axis=1)
        X = features.to_numpy()
        y = test_ml_df[col].to_numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only and reuse it everywhere else.
        scaler = MinMaxScaler()
        X_train_std = scaler.fit_transform(X_train)
        X_test_std = scaler.transform(X_test)

        # Fixed seed so repeated runs of the cell are comparable.
        mlp = MLPRegressor(hidden_layer_sizes=hidden_layers, activation=activation, solver='adam', random_state=42)
        mlp.fit(X_train_std, y_train)
        score = mlp.score(X_test_std, y_test)

        print("Selected features: ", list(features.columns))
        print("Target: ", col)
        print("Number of layers: ", mlp.n_layers_)
        print("Number of outputs: ", mlp.n_outputs_)
        print("Output activation: ", mlp.out_activation_)
        print("Number of iterations:", mlp.n_iter_)
        print("Best loss: ", mlp.best_loss_)
        print("Current loss: ", mlp.loss_)
        print("Number of training samples seen: ", mlp.t_)

        print("\nLoss Curve: ")

        px.line(x = range(mlp.n_iter_), y = mlp.loss_curve_, title = "Loss Curve").show()

        print("\nTest score:", score)

        if test_gaps:
            # NOTE(review): assumes test_gaps[col] holds positional row indices
            # valid for both df and ml_df - TODO confirm.
            gap_rows = test_gaps[col]
            # No feature selection happens in this function, so every remaining
            # feature is used (the old code indexed with an undefined
            # `selected_indices` and raised NameError).
            imputation_values = scaler.transform(features.iloc[gap_rows, :].to_numpy())
            prediction = mlp.predict(imputation_values)

            print(prediction)

            # Single positional assignment; the previous chained form
            # df.iloc[...][col] = ... wrote into a temporary copy.
            df.iloc[gap_rows, df.columns.get_loc(col)] = prediction

        print("\nPermutation Importance: ")

        feature_selection(features, mlp, X_train_std, y_train)

        multilinear_feature_selection(features, X_test_std)

    if test_gaps:
        error_df = calculate_imputation_errors(df, df_copy, test_gaps)
        return error_df.to_dict()
   
# Store the per-column error metrics of the multi-target MLP under the 'MLP' key.
imputation_dict['MLP'] = MLP_multi(df=test_df.copy(), ml_df=ml_df.copy())
# print(imputation_dict)

In [None]:
# help() writes the documentation itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
help(MLPRegressor)
print(dir(MLPRegressor))

In [None]:
def MLP_2(df, ml_df, df_copy = base_df.copy(), test_gaps=test_gaps):
    """Grid-search small ReLU MLP architectures for the ``DiffuseIR`` column.

    Adds ``Seconds`` and ``Day`` columns to ``ml_df`` in place, splits the data
    80/20 (``random_state=42``), scales with a ``MinMaxScaler`` fitted on the
    training split, keeps the features with a non-zero ``Lasso(alpha=0.1)``
    coefficient and runs a 5-fold ``GridSearchCV`` over eight layer layouts.
    The best parameters, the test score and the cross-validation timings are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements containing the target column.
    ml_df : pandas.DataFrame
        Feature frame whose index holds datetime-like values.
    df_copy, test_gaps
        Unused here; kept so the signature matches the other MLP functions.

    Returns
    -------
    None
    """
    ml_df['Seconds'] = [(time - time.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds() for time in ml_df.index]
    ml_df['Day'] = [d.day for d in ml_df.index]

    for col in df.columns:

        if col != 'DiffuseIR':
            continue

        print(f'\n\nFor target variable: {col}\n')

        test_ml_df = ml_df.copy()
        test_ml_df[col] = df[df.index.isin(ml_df.index)][col]

        X = test_ml_df.drop([col], axis = 1).to_numpy()
        y = test_ml_df[col].to_numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only; re-fitting on the test
        # split would scale it with different min/max values than the model saw.
        scaler = MinMaxScaler()
        X_train_std = scaler.fit_transform(X_train)
        X_test_std = scaler.transform(X_test)

        lasso = Lasso(alpha=0.1) # range of lasso (0.001 to 10)
        lasso.fit(X_train_std, y_train)

        selected_indices = np.where(lasso.coef_ != 0)[0]

        # Test the size, not .any(): an array holding only index 0 is falsy
        # under .any() even though one feature was selected.
        if selected_indices.size == 0:
            print(f'No features meet current criteria for: {col}')
            continue

        X_train_selected = X_train_std[:, selected_indices]
        X_test_selected = X_test_std[:, selected_indices]

        param_grid = {
            'hidden_layer_sizes': [(5,),(10,),(15,),(20,),(5,5),(10,10),(15,15),(20,20)],
            'activation': ['relu'],
            'solver': ['adam']
        }

        # Fixed seed so the candidates are compared on equal footing and the
        # cell gives the same ranking when re-run.
        mlp = MLPRegressor(random_state=42)

        grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1)
        grid_search.fit(X_train_selected, y_train)

        print(f"Best hyperparameters for {col}: ", grid_search.best_params_)
        print("Test score: ", grid_search.score(X_test_selected, y_test))

        print("Mean test score: ", grid_search.cv_results_['mean_test_score'])
        print("Mean fit time: ", grid_search.cv_results_['mean_fit_time'])
        print("Mean score time: ", grid_search.cv_results_['mean_score_time'])

# Run the ReLU grid search defined in this cell on copies of the loaded frames.
MLP_2(df=test_df.copy(), ml_df=ml_df.copy())

In [None]:
def MLP_2(df, ml_df, df_copy = base_df.copy(), test_gaps=test_gaps):
    """Grid-search small tanh MLP architectures for the ``DiffuseIR`` column.

    This definition replaces the ``MLP_2`` defined in the previous cell; the
    pipeline is the same, but the activation is ``tanh`` for every target
    except ``WindSpeed`` (``relu``). Adds ``Seconds`` and ``Day`` columns to
    ``ml_df`` in place, splits 80/20 (``random_state=42``), scales with a
    ``MinMaxScaler`` fitted on the training split, keeps the features with a
    non-zero ``Lasso(alpha=0.1)`` coefficient and runs a 5-fold
    ``GridSearchCV`` over eight layer layouts, printing the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements containing the target column.
    ml_df : pandas.DataFrame
        Feature frame whose index holds datetime-like values.
    df_copy, test_gaps
        Unused here; kept so the signature matches the other MLP functions.

    Returns
    -------
    None
    """
    ml_df['Seconds'] = [(time - time.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds() for time in ml_df.index]
    ml_df['Day'] = [d.day for d in ml_df.index]

    for col in df.columns:

        if col != 'DiffuseIR':
            continue

        print(f'\n\nFor target variable: {col}\n')

        test_ml_df = ml_df.copy()
        test_ml_df[col] = df[df.index.isin(ml_df.index)][col]

        X = test_ml_df.drop([col], axis = 1).to_numpy()
        y = test_ml_df[col].to_numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only; re-fitting on the test
        # split would scale it with different min/max values than the model saw.
        scaler = MinMaxScaler()
        X_train_std = scaler.fit_transform(X_train)
        X_test_std = scaler.transform(X_test)

        lasso = Lasso(alpha=0.1) # range of lasso (0.001 to 10)
        lasso.fit(X_train_std, y_train)

        selected_indices = np.where(lasso.coef_ != 0)[0]

        # Test the size, not .any(): an array holding only index 0 is falsy
        # under .any() even though one feature was selected.
        if selected_indices.size == 0:
            print(f'No features meet current criteria for: {col}')
            continue

        X_train_selected = X_train_std[:, selected_indices]
        X_test_selected = X_test_std[:, selected_indices]

        # The layer layouts are shared by every target; only the activation
        # differs. With the DiffuseIR filter above the WindSpeed case is
        # currently never reached; it is kept for when the filter is relaxed.
        # Earlier experiments searched wider layouts such as (60,85), (70,95),
        # (65,90) and, for GlobalIR, (25,25,25).
        activation = 'relu' if col == 'WindSpeed' else 'tanh'
        param_grid = {
            'hidden_layer_sizes': [(5,),(10,),(15,),(20,),(5,5),(10,10),(15,15),(20,20)],
            'activation': [activation],
            'solver': ['adam']
        }

        # Fixed seed so the candidates are compared on equal footing and the
        # cell gives the same ranking when re-run.
        mlp = MLPRegressor(random_state=42)

        grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1)
        grid_search.fit(X_train_selected, y_train)

        print(f"Best hyperparameters for {col}: ", grid_search.best_params_)
        print("Test score: ", grid_search.score(X_test_selected, y_test))

        print("Mean test score: ", grid_search.cv_results_['mean_test_score'])
        print("Mean fit time: ", grid_search.cv_results_['mean_fit_time'])
        print("Mean score time: ", grid_search.cv_results_['mean_score_time'])

# Run the tanh grid search defined in this cell on copies of the loaded frames.
MLP_2(df=test_df.copy(), ml_df=ml_df.copy())

In [None]:
from tslearn.utils import to_time_series_dataset

# Three toy series; the third has one more sample than the other two.
my_first_time_series, my_second_time_series, my_third_time_series = (
    [1, 3, 4, 2],
    [1, 2, 4, 2],
    [1, 2, 4, 2, 2],
)

# Bundle the series into one tslearn dataset and give each series a label.
toy_series = [my_first_time_series, my_second_time_series, my_third_time_series]
X = to_time_series_dataset(toy_series)
y = [0, 1, 1]

In [None]:
def performance(impute_dict):
    """Print the best imputation method per variable and return the score table.

    Parameters
    ----------
    impute_dict : dict
        ``{method: {metric: {variable: score}}}``. The metrics ``'r2'``,
        ``'mae'`` and ``'rmse'`` are compared. A method whose value is ``None``
        or empty (for example an imputer that returned nothing) is skipped.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``(method, metric)``, one column per variable. An empty
        frame is returned when there is nothing to compare.
    """
    flattened = {
        (method, metric): values
        for method, metrics in impute_dict.items()
        if metrics  # skip methods that produced no error table
        for metric, values in metrics.items()
    }
    if not flattened:
        print("No imputation results to compare.")
        return pd.DataFrame()

    imputation_df = pd.DataFrame.from_dict(flattened).T

    def _best(metric, higher_is_better):
        """Return (best score, best method) per variable for one metric."""
        # xs drops the metric level, so the remaining index is the method name.
        scores = imputation_df.xs(metric, level=1)
        if higher_is_better:
            return scores.max(axis=0), scores.idxmax(axis=0)
        return scores.min(axis=0), scores.idxmin(axis=0)

    r2_score_best, r2_method = _best('r2', True)
    mae_score_best, mae_method = _best('mae', False)
    rmse_score_best, rmse_method = _best('rmse', False)

    for row in imputation_df.columns:
        print(row)
        print(f"Optimal Imputation Method for {row}: {r2_method.loc[row]}, R2 score: {round(r2_score_best.loc[row],5)}")
        print(f"Optimal Imputation Method for {row}: {mae_method.loc[row]}, MAE score: {round(mae_score_best.loc[row],5)}")
        print(f"Optimal Imputation Method for {row}: {rmse_method.loc[row]}, RMSE score: {round(rmse_score_best.loc[row],5)}\n")

    return imputation_df

# print(f"\nImputation methods for {file} and error metrics in {year}, {month} modeled with NaN values in {year_2}, {month_2}.\nIncluding gaps smaller than {nan_gap} seconds.\n")
# Compare all collected methods; a shallow copy of the results dict is passed in.
performance(impute_dict=imputation_dict.copy())