In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR  # Importing SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Load and preprocess data
def load_and_preprocess_data(filepath):
    """Read the CSV at ``filepath`` and split it into features and target.

    The ``Time`` column is parsed as datetimes and used to derive integer
    ``Year`` and ``Month`` columns, which are placed ahead of the weather
    columns in the returned feature frame.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts. The data must
            contain ``Time``, ``Power`` and the weather columns listed below.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame of the selected feature
        columns and ``y`` is the ``Power`` Series.
    """
    frame = pd.read_csv(filepath)
    frame['Time'] = pd.to_datetime(frame['Time'])

    # Calendar parts derived from the parsed timestamp.
    timestamps = frame['Time'].dt
    frame['Year'] = timestamps.year
    frame['Month'] = timestamps.month

    calendar_columns = ['Year', 'Month']
    weather_columns = [
        'temperature_2m',
        'relativehumidity_2m',
        'dewpoint_2m',
        'windspeed_10m',
        'windspeed_100m',
        'winddirection_10m',
        'winddirection_100m',
        'windgusts_10m',
    ]
    return frame[calendar_columns + weather_columns], frame['Power']

# Function to remove outliers
def remove_outliers(X, y, threshold=3.0):
    """Drop rows whose absolute z-score reaches ``threshold`` in any feature.

    Every column of ``X`` except ``'Year'`` and ``'Month'`` is standardised
    with ``scipy.stats.zscore``. A row is kept only when all of its absolute
    z-scores are strictly below ``threshold``.

    A column whose values are all identical has zero variance, so its
    z-scores come out as NaN. NaN never compares below the threshold, which
    would discard every row of the dataset; such columns are therefore
    treated as containing no outliers.

    Args:
        X: Feature DataFrame.
        y: Target Series; it is filtered with the same boolean mask as ``X``,
            so it must share ``X``'s index.
        threshold: Absolute z-score at or above which a value counts as an
            outlier.

    Returns:
        tuple: ``(X_clean, y_clean, outliers_removed)`` where
        ``outliers_removed`` maps each checked feature name to the number of
        rows whose absolute z-score in that feature is ``>= threshold``.
    """
    features_to_check = X.columns.difference(['Year', 'Month'])
    checked = X[features_to_check]
    z_scores = np.abs(checked.apply(zscore))

    # Zero-variance columns cannot contain outliers; overwrite their NaN
    # z-scores so they do not veto every row in the mask below.
    for feature in features_to_check:
        column = checked[feature]
        if column.notna().all() and column.nunique() == 1:
            z_scores[feature] = 0.0

    mask = (z_scores < threshold).all(axis=1)

    # Any value at or above the threshold already forces its row out of the
    # mask, so counting those values gives the rows removed per feature.
    outliers_removed = {
        feature: int((z_scores[feature] >= threshold).sum())
        for feature in features_to_check
    }

    X_clean = X[mask]
    y_clean = y[mask]

    return X_clean, y_clean, outliers_removed

# Function to calculate adjusted R-squared
def adjusted_r2(r2, n, p):
    """Return R-squared adjusted for the number of predictors.

    Args:
        r2: Plain coefficient of determination.
        n: Number of samples the score was computed on.
        p: Number of predictors (features).

    Returns:
        ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.
    """
    unexplained = 1 - r2
    total_dof = n - 1
    residual_dof = n - p - 1
    return 1 - unexplained * total_dof / residual_dof

# Function to train and evaluate SVR model
def train_and_evaluate_svr(X, y):
    """Fit an RBF-kernel SVR on an 80/20 hold-out split and print test metrics.

    The data is split first and the ``StandardScaler`` is fitted on the
    training rows only, so that test-set statistics do not leak into the
    preprocessing. This matches the per-fold scaling done by
    ``kfold_cross_validation_svr``.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, y_pred)`` where
        ``X_train`` and ``X_test`` are the standardised feature arrays and
        ``y_pred`` holds the model's predictions for ``X_test``.
    """
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Learn the scaling parameters from the training rows only, then apply
    # the same transformation to the held-out rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    model = SVR(kernel='rbf')  # You can change kernel to 'linear', 'poly', etc.
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    # Adjusted R^2 uses the test set's row count and the feature count.
    adj_r2 = adjusted_r2(r2, X_test.shape[0], X_test.shape[1])

    print('SVR Model Performance:')
    print('R^2:', r2)
    print('Adjusted R^2:', adj_r2)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('MAE:', mae)

    return X_train, X_test, y_train, y_test, y_pred

# Function for K-Fold Cross-Validation for SVR
def kfold_cross_validation_svr(X, y):
    """Run shuffled 10-fold cross-validation of an RBF-kernel SVR.

    For each fold a fresh ``StandardScaler`` is fitted on the training rows
    and applied to both partitions before the model is trained and scored.
    The mean and standard deviation of every metric across folds are printed.

    Args:
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series (rows are selected with ``.iloc``).

    Returns:
        tuple: ``(y_pred_all, y_test_all)`` as NumPy arrays, holding the
        held-out predictions and the matching true values concatenated in
        fold order.
    """
    metric_names = ('MSE', 'RMSE', 'MAE', 'R^2', 'Adj R^2')
    fold_scores = {name: [] for name in metric_names}
    predicted, actual = [], []

    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    for fit_rows, eval_rows in splitter.split(X):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_eval, y_eval = X.iloc[eval_rows], y.iloc[eval_rows]

        # Scaling parameters come from the training partition only.
        fold_scaler = StandardScaler()
        X_fit_std = fold_scaler.fit_transform(X_fit)
        X_eval_std = fold_scaler.transform(X_eval)

        regressor = SVR(kernel='rbf')
        regressor.fit(X_fit_std, y_fit)
        fold_pred = regressor.predict(X_eval_std)

        fold_mse = mean_squared_error(y_eval, fold_pred)
        fold_r2 = r2_score(y_eval, fold_pred)
        n_eval, n_features = X_eval.shape
        fold_values = (
            fold_mse,
            np.sqrt(fold_mse),
            mean_absolute_error(y_eval, fold_pred),
            fold_r2,
            adjusted_r2(fold_r2, n_eval, n_features),
        )
        for name, value in zip(metric_names, fold_values):
            fold_scores[name].append(value)

        predicted.extend(fold_pred)
        actual.extend(y_eval)

    print("\nSVR Cross-Validation Results (Mean ± Std):")
    for name in metric_names:
        scores = fold_scores[name]
        print(f"{name}: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

    return np.array(predicted), np.array(actual)

# Function to manually plot predictions
def plot_predictions(y_test_all, y_pred_all):
    """Show two diagnostic scatter plots side by side.

    The left panel plots predicted against actual values with a red identity
    line; the right panel plots residuals (actual minus predicted) against
    the predictions with a red zero line.

    Args:
        y_test_all: True target values.
        y_pred_all: Predicted values, in the same order as ``y_test_all``.
    """
    _, (fit_ax, resid_ax) = plt.subplots(ncols=2, figsize=(12, 6))

    # Left panel: actual vs. predicted, with the y = x reference line.
    actual_lo, actual_hi = min(y_test_all), max(y_test_all)
    fit_ax.scatter(y_test_all, y_pred_all, alpha=0.5)
    fit_ax.plot([actual_lo, actual_hi], [actual_lo, actual_hi], color='red')
    fit_ax.set(
        xlabel="Actual values",
        ylabel="Predicted values",
        title="Actual vs. Predicted values",
    )

    # Right panel: residuals vs. predicted, with a zero reference line.
    errors = y_test_all - y_pred_all
    resid_ax.scatter(y_pred_all, errors, alpha=0.5)
    resid_ax.hlines(0, min(y_pred_all), max(y_pred_all), color='red')
    resid_ax.set(
        xlabel="Predicted values",
        ylabel="Residuals",
        title="Residuals vs. Predicted Values",
    )

    plt.tight_layout()
    plt.show()

# Main script execution
if __name__ == "__main__":
    # Input CSV, read relative to the working directory.
    filepath = 'Location1.csv'

    # Build the feature matrix and target, then filter rows by z-score.
    X, y = load_and_preprocess_data(filepath)
    X_clean, y_clean, outliers_count = remove_outliers(X, y)

    # Report how many rows each checked feature flagged.
    print("Number of outliers removed from each variable:")
    for feature in outliers_count:
        count = outliers_count[feature]
        print(f"{feature}: {count} outliers")

    # Single hold-out evaluation; metrics are printed by the callee.
    X_train, X_test, y_train, y_test, y_pred = train_and_evaluate_svr(
        X_clean, y_clean
    )

    # Cross-validated evaluation, then diagnostic plots of the pooled
    # held-out predictions.
    y_pred_all, y_test_all = kfold_cross_validation_svr(X_clean, y_clean)
    plot_predictions(y_test_all, y_pred_all)


Number of outliers removed from each variable:
dewpoint_2m: 0 outliers
relativehumidity_2m: 11 outliers
temperature_2m: 5 outliers
winddirection_100m: 0 outliers
winddirection_10m: 0 outliers
windgusts_10m: 337 outliers
windspeed_100m: 199 outliers
windspeed_10m: 318 outliers
