In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import zscore 

# Load and preprocess data
def load_and_preprocess_data(filepath):
    loc1 = pd.read_csv(filepath)
    loc1['Time'] = pd.to_datetime(loc1['Time'])
    loc1['Year'] = loc1['Time'].dt.year
    loc1['Month'] = loc1['Time'].dt.month
    features = ['Year', 'Month', 'temperature_2m', 'relativehumidity_2m', 
                'dewpoint_2m', 'windspeed_10m', 'windspeed_100m', 
                'winddirection_10m', 'winddirection_100m', 'windgusts_10m']
    X = loc1[features]
    y = loc1['Power']
    return X, y

# Function to remove outliers
def remove_outliers(X, y, threshold=3.0):
    features_to_check = X.columns.difference(['Year', 'Month'])
    z_scores = np.abs(X[features_to_check].apply(zscore))
    mask = (z_scores < threshold).all(axis=1)
    
    outliers_removed = {feature: np.sum(~mask & (z_scores[feature] >= threshold)) for feature in features_to_check}
    
    X_clean = X[mask]
    y_clean = y[mask]
    
    return X_clean, y_clean, outliers_removed

# Function to calculate adjusted R-squared
def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Function to train and evaluate linear regression model with AIC/BIC
def train_and_evaluate_linear_regression(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    # Fit model with statsmodels for AIC/BIC
    X_train_sm = sm.add_constant(X_train)
    X_test_sm = sm.add_constant(X_test)
    model_sm = sm.OLS(y_train, X_train_sm).fit()
    
    y_pred = model_sm.predict(X_test_sm)
    
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, X_test.shape[0], X_test.shape[1])
    aic = model_sm.aic
    bic = model_sm.bic
    
    print('R^2:', r2)
    print('Adjusted R^2:', adj_r2)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('MAE:', mae)
    print('AIC:', aic)
    print('BIC:', bic)
    
    return X_train, X_test, y_train, y_test, y_pred

# Function for K-Fold Cross-Validation
def kfold_cross_validation(X, y, model):
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    results = {'MSE': [], 'RMSE': [], 'MAE': [], 'R^2': [], 'Adj R^2': [], 'AIC': [], 'BIC': []}
    y_pred_all = []
    y_test_all = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Fit model with statsmodels for AIC/BIC
        X_train_sm = sm.add_constant(X_train)
        X_test_sm = sm.add_constant(X_test)
        model_sm = sm.OLS(y_train, X_train_sm).fit()
        
        y_pred = model_sm.predict(X_test_sm)
        
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        adj_r2 = adjusted_r2(r2, X_test.shape[0], X_test.shape[1])
        aic = model_sm.aic
        bic = model_sm.bic
        
        results['MSE'].append(mse)
        results['RMSE'].append(rmse)
        results['MAE'].append(mae)
        results['R^2'].append(r2)
        results['Adj R^2'].append(adj_r2)
        results['AIC'].append(aic)
        results['BIC'].append(bic)
        
        y_pred_all.extend(y_pred)
        y_test_all.extend(y_test)
    
    avg_results = {metric: np.mean(values) for metric, values in results.items()}
    std_results = {metric: np.std(values) for metric, values in results.items()}
    
    print("\nCross-Validation Results (Mean ± Std):")
    for metric, avg_value in avg_results.items():
        print(f"{metric}: {avg_value:.4f} ± {std_results[metric]:.4f}")
    
    return np.array(y_pred_all), np.array(y_test_all)

# Main script execution
if __name__ == "__main__":
    filepath = 'Location1.csv'
    
    # Load and preprocess data
    X, y = load_and_preprocess_data(filepath)
    
    # Remove outliers
    X_clean, y_clean, outliers_count = remove_outliers(X, y)
    print("Number of outliers removed from each variable:")
    for feature, count in outliers_count.items():
        print(f"{feature}: {count} outliers")
    
    # Train and evaluate Linear Regression model
    X_train, X_test, y_train, y_test, y_pred = train_and_evaluate_linear_regression(X_clean, y_clean)
    
    # K-Fold Cross-Validation
    y_pred_all, y_test_all = kfold_cross_validation(X_clean, y_clean, LinearRegression())

    # Manually plot predictions (if needed, else remove this line)
    # plot_predictions(y_test_all, y_pred_all)
    
    # Define features with high correlation to Power
    features_high_corr = ['windspeed_10m', 'windspeed_100m', 'windgusts_10m']
    X_high_corr = X_clean[features_high_corr]
    y_high_corr = y_clean

    # Scale and train Ridge model
    scaler = StandardScaler()
    X_scaled_high_corr = scaler.fit_transform(X_high_corr)
    X_train_high_corr, X_test_high_corr, y_train_high_corr, y_test_high_corr = train_test_split(X_scaled_high_corr, y_high_corr, test_size=0.2, random_state=42)
    
    model_high_corr = sm.OLS(y_train_high_corr, sm.add_constant(X_train_high_corr)).fit()
    
    y_pred_high_corr = model_high_corr.predict(sm.add_constant(X_test_high_corr))
    r_square_high_corr = r2_score(y_test_high_corr, y_pred_high_corr)
    mse_high_corr = mean_squared_error(y_test_high_corr, y_pred_high_corr)
    rmse_high_corr = np.sqrt(mse_high_corr)
    mae_high_corr = mean_absolute_error(y_test_high_corr, y_pred_high_corr)
    adj_r2_high_corr = adjusted_r2(r_square_high_corr, X_test_high_corr.shape[0], X_test_high_corr.shape[1])
    aic_high_corr = model_high_corr.aic
    bic_high_corr = model_high_corr.bic
    
    # Print results
    print("\nResults using highly correlated features:")
    print('R^2:', r_square_high_corr)
    print('Adjusted R^2:', adj_r2_high_corr)
    print('MSE:', mse_high_corr)
    print('RMSE:', rmse_high_corr)
    print('MAE:', mae_high_corr)
    print('AIC:', aic_high_corr)
    print('BIC:', bic_high_corr)


FileNotFoundError: [Errno 2] No such file or directory: 'Location1.csv'