In [None]:
#Importing necessary libraries
from sklearn.svm import SVR, NuSVR
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.exceptions import ConvergenceWarning,DataConversionWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

In [None]:
# Load the London weather dataset (includes the global_radiation column used as target)
DATA_PATH = 'london_weather.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Parse the date column (stored as YYYYMMDD values) into proper datetimes
parsed_dates = pd.to_datetime(data['date'], format='%Y%m%d')
data['date'] = parsed_dates
data

In [None]:
# Summary statistics for the columns of the dataset
data.describe()

In [None]:
# Scatter of solar irradiance against date over the full record
fig, ax = plt.subplots(figsize=(13,8))
ax.scatter(data['date'], data['global_radiation'])
ax.set_xlabel('Date')
ax.set_ylabel('Solar Irradiance W/m2')
ax.set_title('Yearly Trends for Solar Irradiance')
plt.show()

In [None]:
# Display irradiance levels for a single year to identify trends and seasonality
year = 2020  # Year to isolate
is_selected_year = data['date'].dt.year == year
filtered_df = data[is_selected_year]

fig, ax = plt.subplots(figsize=(10,8))
ax.plot(filtered_df['date'], filtered_df['global_radiation'])
ax.set_xlabel('Date')
ax.set_ylabel('Solar Irradiance W/m2')
ax.set_title(f'Trends for {year}')
plt.show()

In [None]:
# Number of missing values in each column
missing_values_count = data.isna().sum()
total_rows = len(data)
# Share of missing values in each column, as a percentage of all rows
missing_values_percentage = missing_values_count.div(total_rows).mul(100)

In [None]:
# Combine the missing-value counts and percentages into one table, one row per column
output_df = pd.concat(
    [missing_values_count, missing_values_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage of Missing Values'],
)

output_df

In [None]:
# Drop the snow depth column: per the original analysis it has little effect on
# irradiance and has over a thousand missing values.
df = data.drop(columns="snow_depth")

# Fill the remaining missing values with the mean of each numeric column.
# numeric_only=True keeps the datetime `date` column out of the mean computation,
# so the result does not depend on how the installed pandas treats non-numeric columns.
# NOTE(review): this also imputes missing values of the target (global_radiation) and
# uses means computed over the full dataset, i.e. before the train/test split.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [None]:
# Columns used as model inputs
features = [
    'cloud_cover',
    'sunshine',
    'max_temp',
    'mean_temp',
    'min_temp',
    'precipitation',
    'pressure',
]

In [None]:
# Target variable selection.
# Kept as a one-element list, so indexing a DataFrame with it yields a single-column DataFrame.
target = ['global_radiation']

In [None]:
# Separate the cleaned data into the input features (X) and the target (y)
X, y = df[features], df[target]

In [None]:
# Standardize features and target using statistics learned from the training rows only.
# Fitting the scalers on the full dataset would leak test-set statistics into training.
# This preliminary split uses the same test_size and random_state as the train/test
# split in the next cell, so it selects exactly the rows that end up in the training set.
X_fit, _, y_fit, _ = train_test_split(X, y, test_size=0.25, random_state=324)

# Separate scalers, so fitting the target does not overwrite the feature scaling
feature_scaler = StandardScaler().fit(X_fit)
target_scaler = StandardScaler().fit(y_fit)

X = feature_scaler.transform(X)
y = target_scaler.transform(y)

# Backward-compatible alias: `scaler` previously ended up fitted on the target
scaler = target_scaler

In [None]:
# Split the data into a training set and a test set (25% of the rows held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=324,
)

In [None]:
# Helper for scoring a model's predictions
def evaluate_model(y_true, y_pred):
    """Score predictions against the true values.

    Args:
        y_true: True target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``, where ``rmse`` is the square root of ``mse``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )



In [None]:
# All candidate models, keyed by the display name used in the result tables and plots.
# The MLP is seeded (same seed as the train/test split) because its weight initialisation
# and batch shuffling are random; without a seed its scores change on every run.
models = {
    "SVR": SVR(),
    "NuSVR": NuSVR(),
    "MLP Regressor": MLPRegressor(random_state=324),
    "LGBM Regressor": LGBMRegressor(),
    "CatBoost Regressor" : CatBoostRegressor(iterations=1000, learning_rate=0.01, depth=6, l2_leaf_reg=1,verbose=False)
}


# Accumulators for the model names and their test-set scores.
# Each score is appended as a single-element list because the results cell
# unwraps every entry with x[0].
model_list = []
r2_list = []
rmse_list = []
mae_list = []
mse_list = []

# Train every model on the training split and score it on the held-out test split
for model_name, model in models.items():
    # The target is a single-column 2-D array; estimators expect a 1-D target,
    # so flatten it explicitly instead of relying on a suppressed conversion warning.
    model.fit(X_train, np.ravel(y_train))

    # Predict on the test split
    y_test_pred = model.predict(X_test)

    # Score the test-set predictions
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    # Record the model name and its scores
    model_list.append(model_name)
    r2_list.append([model_test_r2])
    rmse_list.append([model_test_rmse])
    mae_list.append([model_test_mae])
    mse_list.append([model_test_mse])



In [None]:
# Build a table of evaluation scores, one row per model.
# Every score arrives wrapped in a single-element list, so take its first item.
# NOTE: this rebinds `df`, which held the cleaned weather data up to this point.
df = pd.DataFrame({
    'Model Name': model_list,
    'R2 Scores': [scores[0] for scores in r2_list],
    'MSE Scores': [scores[0] for scores in mse_list],
    'RMSE Scores': [scores[0] for scores in rmse_list],
    'MAE Scores': [scores[0] for scores in mae_list],
})

# Rankings per metric: lower is better for the error metrics, higher is better for R2.
# df_MSE2 is ordered by the MSE column (it was previously ordered by MAE by mistake).
df_MSE2 = df.sort_values(by=["MSE Scores"], ascending=True)
df_R22 = df.sort_values(by=["R2 Scores"], ascending=False)
df_RMSE2 = df.sort_values(by=["RMSE Scores"], ascending=True)
df_MAE2 = df.sort_values(by=["MAE Scores"], ascending=True)

In [None]:
# Display the model score table held in df_MSE2
df_MSE2

In [None]:
# Variable names: reuse the feature and target lists defined for training so the
# plot labels cannot drift out of sync with the columns the models were fitted on.
variable_names = list(features)
target_variable = list(target)

# Empty table with the columns used to store per-model coefficients
coefficients_df = pd.DataFrame(columns=["Model", "Variable", "Coefficient"])

In [None]:
# Rows of {"Model", "Variable", "Coefficient"} collected across models
coefficients_list = []

# Generate a separate plot for each model that exposes per-feature weights.
# The models fitted on the training split are reused as they are: refitting them on the
# full X, y here would let them see the test rows, and every later prediction on X_test
# (e.g. the true-vs-predicted plots) would then be made by a model trained on that data.
for model_name, model in models.items():
    if hasattr(model, 'coef_'):
        coef = model.coef_
        kind = "Coefficient"
    elif hasattr(model, 'feature_importances_'):
        coef = model.feature_importances_
        kind = "Feature Importance"
    else:
        # Model exposes neither attribute; nothing to plot
        continue

    # Flatten so a (1, n_features) array is handled like a plain vector
    coef = np.ravel(coef)

    # Store one row per variable
    for variable, value in zip(variable_names, coef):
        coefficients_list.append({"Model": model_name, "Variable": variable, "Coefficient": value})

    # Bar chart of the values, labelled by what they actually are
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(range(len(coef)), coef)
    ax.set_title(f"{model_name} {kind}s")
    ax.set_xlabel("Variables")
    ax.set_ylabel(f"{kind} Value")
    ax.set_xticks(range(len(coef)))
    ax.set_xticklabels(variable_names[:len(coef)])
    plt.tight_layout()
    plt.show()
    plt.close(fig)

# Store the collected values in the DataFrame
coefficients_df = pd.DataFrame(coefficients_list, columns=["Model", "Variable", "Coefficient"])


In [None]:


# Select a range of test data points for the plot (here positions 50 to 99)
data_range = slice(50, 100)

# Plot the true line and the predicted line for each model.
# The target was standardized before training, so both lines are in standardized
# units rather than W/m2; the y-axis label states this.
for model_name, model in models.items():
    # Predictions for the test split
    y_test_pred = model.predict(X_test)

    # One figure per model
    fig, ax = plt.subplots(figsize=(8, 6))

    # True values in blue, predictions as a dashed red line
    ax.plot(y_test[data_range], color='blue', label='True')
    ax.plot(y_test_pred[data_range], color='red', linestyle='dashed', label='Predicted')

    ax.set_xlabel('Data Points')
    ax.set_ylabel('Solar Irradiance (standardized)')
    ax.set_title(f'{model_name} True vs. Predicted')
    ax.legend()

    # Display the plot for the current model
    plt.show()

