# **1.- Mini-análisis usando el modelo final**

In [None]:
from google.colab import files
import pandas as pd

# Sube el archivo wind_ava.csv desde tu ordenador
#uploaded = files.upload()

# También es una opción montar el drive para incluir el archivo desde ahí
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.model_selection import TimeSeriesSplit
import re

# Estos son datos que vamos a usar para todos los modelos
df = pd.read_csv("/content/drive/MyDrive/AA_P1_Grupo10/wind_ava.csv")

patron = r'.*\.13$'
columnas_a_mantener = []
for column in df.columns:
  if column == "datetime" or column == "energy":
    columnas_a_mantener.append(column)
  if re.match(patron, column):
    columnas_a_mantener.append(column)
df = df[columnas_a_mantener]

columnas_a_eliminar = ['energy', 'datetime', 'p55.162.13', 'stl4.13', 'cape.13']
y = df['energy']
X = df.drop(columns=columnas_a_eliminar)
tscv = TimeSeriesSplit(n_splits=3)

In [None]:
from joblib import load
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

modelo_lasso = load('/content/drive/MyDrive/AA_P1_Grupo10/modelo_final.pkl')



# Supongamos que tienes tus datos X y y

# Divide tus datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrena tu modelo en el conjunto de entrenamiento

# Realiza predicciones en el conjunto de prueba
y_pred = modelo_lasso.predict(X_test)

# Crea diferentes rangos de valores de la variable objetivo
# Por ejemplo, podrías dividir los valores en cuartiles
quantiles = np.percentile(y_test, [25, 50, 75])

# Calcula las métricas de rendimiento para cada rango
mae_by_quantile = []
for i in range(len(quantiles)):
    if i == 0:
        mask = (y_test <= quantiles[i])
    elif i == len(quantiles) - 1:
        mask = (y_test > quantiles[i-1])
    else:
        mask = (y_test > quantiles[i-1]) & (y_test <= quantiles[i])
    mae = mean_squared_error(y_test[mask], y_pred[mask])
    mae_by_quantile.append(mae)

# Imprime las métricas de rendimiento para cada rango
for i, mae in enumerate(mae_by_quantile):
    print(f"MSE for quantile {i+1}: {np.sqrt(mae)}")


MSE for quantile 1: 258.9253210844432
MSE for quantile 2: 264.24999874017374
MSE for quantile 3: 437.59340039440224


# **2.- Conversión a un problema de clasificación**

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Calculamos el tercer cuantil
tercer_cuantil = np.percentile(y, 75)
print("Tercer cuantil:", tercer_cuantil)
# Creamos la nueva y
new_y = np.where(y > tercer_cuantil, 'alta', 'baja')

#smote = SMOTE()
#rus = RandomUnderSampler(random_state=42)
ros = RandomOverSampler(random_state=42)

# Aplicar SMOTE a tus datos
#X, new_y = smote.fit_resample(X, new_y)
X, new_y = ros.fit_resample(X, new_y)
#X, new_y = rus.fit_resample(X, new_y)


Tercer cuantil: 1089.375


# **3.- Entrenamiento y selección de modelos**

## 3.1 - KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import RobustScaler

param_grid = {
    'kneighborsclassifier__n_neighbors': list(range(2, 30, 2)),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__metric': ['euclidean', 'manhattan', 'chebyshev'],
    'kneighborsclassifier__p': [1, 2]  # Solo se aplica si se usa la distancia de Minkowski
}
knn_model_hpo = make_pipeline(RobustScaler(), KNeighborsClassifier())

# Hacemos nested cross validation
# Hacemos la evaluacion inner usando un RandomizedSearch con un budget de 20
regr = RandomizedSearchCV(estimator=knn_model_hpo, param_distributions=param_grid,
                                   n_iter=20, scoring='accuracy',
                                   cv=tscv, random_state=42)

regr.fit(X, new_y)
print("Best params", regr.best_params_)

# Hacemos la outer evaluation
knn_scores_hpo = cross_val_score(regr, X, new_y, scoring='accuracy', cv = tscv)
knn_accuracy_mean = knn_scores_hpo.mean()
print("El accuracy medio es ", knn_accuracy_mean)

Best params {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 1, 'kneighborsclassifier__n_neighbors': 16, 'kneighborsclassifier__metric': 'manhattan'}
El accuracy medio es  0.9138576779026217


## 3.2 - Árboles de decisión

In [None]:
from scipy.stats import randint as sp_randint
from sklearn import tree

tree_hpo = tree.DecisionTreeClassifier(random_state=42)

param_grid = {'max_depth': sp_randint (1, 16),
              'min_samples_split': sp_randint (2, 40),
              'min_samples_leaf': sp_randint (1, 20)}
regr = RandomizedSearchCV(estimator=tree_hpo, param_distributions=param_grid,
                                   n_iter=20, scoring='accuracy',
                                   cv=tscv, random_state=42)

regr.fit(X, new_y)
print("Best params", regr.best_params_)

tree_scores_hpo = cross_val_score(regr,
                            X, new_y,
                            scoring='accuracy',
                            cv = tscv)

tree_accuracy_mean = tree_scores_hpo.mean()
print("El accuracy medio es ", tree_accuracy_mean)

Best params {'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 13}
El accuracy medio es  0.8256554307116105


## 3.3 - Regresión logística

In [None]:
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform
import warnings

warnings.filterwarnings("ignore")

log_reg_hpo = make_pipeline(RobustScaler(), LogisticRegression(max_iter=1000))

param_grid = {#'logisticregression__penalty': ['l2', None],
              'logisticregression__C': np.logspace(-4, 4, 20)}

regr = RandomizedSearchCV(estimator=log_reg_hpo, param_distributions=param_grid,
                                   n_iter=20, scoring='accuracy',
                                   cv=tscv, random_state=42)
regr.fit(X, new_y)
print("Best params", regr.best_params_)

log_reg_hpo_scores = cross_val_score(regr, X, new_y, cv=tscv, scoring='accuracy')
log_reg_accuracy_mean = log_reg_hpo_scores.mean()
print("El accuracy medio es ", log_reg_accuracy_mean)

warnings.filterwarnings("default")

Best params {'logisticregression__C': 11.288378916846883}
El accuracy medio es  0.6692883895131084


## 3.4 - SVM


In [None]:
from sklearn.svm import SVC


svm_reg_hpo = make_pipeline(RobustScaler(), SVC())

param_grid = {'svc__C': loguniform(2**-5, 2**15),
              'svc__gamma': loguniform(2**-15, 8),
              'svc__kernel': ["linear"]}

regr = RandomizedSearchCV(estimator=svm_reg_hpo, param_distributions=param_grid,
                                   n_iter=20, scoring='accuracy',
                                   cv=tscv, random_state=42)

regr.fit(X, new_y)
print("Best params", regr.best_params_)

svm_hpo_scores = cross_val_score(regr, X, new_y, cv=tscv, scoring='accuracy')
svm_accuracy_mean = svm_hpo_scores.mean()
print("El accuracy medio es ", svm_accuracy_mean)

## 3.5 - Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

# Crear el modelo DummyRegressor que predice la media
dummy_model = DummyClassifier(strategy="most_frequent")

# Realizar la validación cruzada
dummy_scores = cross_val_score(dummy_model, X, new_y, cv=2, scoring='accuracy')
print("Error del modelo dummy:", dummy_scores.mean())

Error del modelo dummy: 0.4962089300758214
