In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer, PowerTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.tree import DecisionTreeRegressor  
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

In [None]:
# Read the gzip-compressed dataset and keep only the target column plus the
# features whose name ends in "13".
# NOTE: 'energy' is not anchored in the regex, so any column whose name merely
# contains "energy" is kept as well.
data = (
    pd.read_csv('wind_ava.csv.gz', compression='gzip')
    .filter(regex='13$|energy')
)


In [None]:
# Feature matrix (everything except the target) and the regression target.
x = data.drop(columns=['energy'])
y = data['energy']

# 75th percentile of the target, computed over every row (train and test alike).
third_quartile = np.quantile(y, 0.75)

# Ordered hold-out: rows are not shuffled, so the last 20% become the test set.
# presumably the rows are in chronological order -- confirm against the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=False,
)

# Tag each test target: 'baja' when strictly below the third quartile, else 'alta'.
y_class = np.where(y_test < third_quartile, 'baja', 'alta')


In [None]:

# Regression model: power-transform the features, then fit an RBF-kernel SVR.
pipe = Pipeline([
    ('scaler', PowerTransformer()),
    # NOTE: per the scikit-learn docs, `degree` is only used by the 'poly'
    # kernel, so degree=1 has no effect on this 'rbf' model.
    ('model', SVR(C=719, degree=1, epsilon=0.1, gamma='auto', kernel='rbf'))
])

pipe.fit(X_train, y_train)

y_pred_pipe = pipe.predict(X_test)

# Split predictions and true targets by the class of the TRUE target
# (y_class is derived from y_test, not from the predictions).
is_small = y_class == 'baja'
is_large = y_class == 'alta'

y_pred_pipe_small = y_pred_pipe[is_small]
y_pred_pipe_large = y_pred_pipe[is_large]
y_test_small = y_test[is_small]
y_test_large = y_test[is_large]

# RMSE per group. mean_squared_error's documented signature is
# (y_true, y_pred); the arguments are passed in that order so the call stays
# correct if sample weights or an asymmetric metric are introduced later.
rmse_pipe_small = np.sqrt(mean_squared_error(y_test_small, y_pred_pipe_small))
rmse_pipe_large = np.sqrt(mean_squared_error(y_test_large, y_pred_pipe_large))

print("RMSE small:", rmse_pipe_small)
print("RMSE large:", rmse_pipe_large)

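Como se puede observar en los resultados de la celda anterior, el RMSE de las observaciones cuya energía real es baja (por debajo del tercer cuartil) es mucho menor que el de las observaciones de energía alta; es decir, el modelo predice los valores bajos de forma mucho más fiable que los elevados.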

Ahora abordaremos el problema de clasificación ('baja' / 'alta') con los siguientes modelos:
KNN, SVM, DecisionTreeClassifier y MLPClassifier.

In [None]:
# Binary target for the classification task: 'baja' when the energy is strictly
# below the third quartile, 'alta' otherwise (now computed for every row).
y_class = np.where(y < third_quartile, 'baja', 'alta')

# Same ordered 80/20 hold-out as before, this time with the class labels as
# target. NOTE: this rebinds X_train / X_test / y_train / y_test, so from here
# on y_train and y_test hold string labels, not energy values.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y_class,
    test_size=0.2,
    shuffle=False,
)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# TimeSeriesSplit, GridSearchCV, Pipeline, RobustScaler and `metrics` are
# already imported in the notebook's first cell.

# Ordered cross-validation with the default settings:
# n_splits=5, max_train_size=None, test_size=None, gap=0
cv = TimeSeriesSplit()

param_grid = {
    'n_neighbors': range(5, 30, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': range(1, 25, 5),
    'p': [1, 2],  # Minkowski power: p=1 Manhattan, p=2 Euclidean
}

# The targets are the strings 'baja' / 'alta'. The built-in 'f1', 'precision'
# and 'recall' scorers use pos_label=1, which is not one of those labels, so
# with them the search either raises or scores every candidate as NaN
# (depending on the scikit-learn version). Build those scorers explicitly.
# NOTE(review): 'alta' (energy at or above the third quartile) is assumed to be
# the positive class -- confirm this is the class of interest.
POS_LABEL = 'alta'

scoring = ['accuracy', 'f1', 'precision', 'recall', 'balanced_accuracy']
scorers = {
    'accuracy': 'accuracy',
    'f1': metrics.make_scorer(metrics.f1_score, pos_label=POS_LABEL),
    'precision': metrics.make_scorer(metrics.precision_score, pos_label=POS_LABEL),
    'recall': metrics.make_scorer(metrics.recall_score, pos_label=POS_LABEL),
    'balanced_accuracy': 'balanced_accuracy',
}

# One exhaustive search per metric; `results` maps metric name -> best CV
# score and the hyper-parameters that achieved it.
results = {}
for score in scoring:
    grid_search = GridSearchCV(KNeighborsClassifier(),
                        param_grid, cv=cv, n_jobs=-1, scoring=scorers[score])

    # NOTE(review): the scaler sits outside the search, so it is fitted once on
    # all of X_train and every CV fold sees statistics from its own validation
    # rows. Searching over a Pipeline(scaler, KNN) would avoid that, at the
    # cost of 'model__'-prefixed parameter names.
    pipe = Pipeline([
        ('scaler', RobustScaler()),
        ('model', grid_search)
    ])

    pipe.fit(X_train, y_train)

    fitted_search = pipe.named_steps["model"]
    best_model_pipe = fitted_search.best_estimator_
    best_params_pipe = fitted_search.best_params_
    best_score_pipe = fitted_search.best_score_

    print("Best model", best_model_pipe)
    print("Best params", best_params_pipe)
    print("Best score", best_score_pipe)
    results[score] = {'score' : best_score_pipe, 'params' : best_params_pipe}


print(results)