In [None]:
import pandas as pd
import xarray as xr
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from shapely.geometry import Point
import plotly.express as px

In [None]:
### Functions to extract data from netcdf files

def get_variable_from_netcfd(df, lon, lat, var):
    """Return one variable at the nearest grid cell as a half-hourly frame.

    Parameters
    ----------
    df : dataset-like
        Object indexable by variable name whose entries support
        ``.sel(longitude=..., latitude=..., method="nearest")`` and
        ``.to_dataframe()`` (the notebook passes an xarray Dataset).
    lon, lat : float
        Target coordinates; the nearest available grid cell is selected.
    var : str
        Name of the variable to extract.

    Returns
    -------
    pandas.DataFrame
        Single column ``var`` indexed by ``valid_time``, resampled to
        1800-second (30-minute) steps with gaps filled by interpolation.
    """
    nearest_cell = df[var].sel(longitude=lon, latitude=lat, method="nearest")
    # `valid_time` must be a column of the flattened frame for set_index to work.
    by_time = nearest_cell.to_dataframe().set_index("valid_time")
    half_hourly = by_time[[var]].resample("1800s")
    return half_hourly.interpolate()


def get_ssrd_from_netcfd(df, lon, lat):
    """Return the per-step increase of ``ssrd`` at the nearest grid cell.

    The half-hourly ``ssrd`` series is differenced step to step; negative
    differences are clipped to zero and the undefined first difference is
    set to zero.

    NOTE(review): this presumably treats ``ssrd`` as an accumulated quantity
    whose drops mark an accumulation reset — confirm against the data source.

    Parameters
    ----------
    df : dataset-like
        Passed through to ``get_variable_from_netcfd``.
    lon, lat : float
        Target coordinates; the nearest grid cell is used.

    Returns
    -------
    pandas.DataFrame
        Single column ``radiation`` indexed by ``valid_time``.
    """
    ssrd_frame = get_variable_from_netcfd(df, lon, lat, "ssrd")
    step_change = ssrd_frame["ssrd"].diff()
    ssrd_frame["radiation"] = step_change.clip(lower=0).fillna(0)
    return ssrd_frame[["radiation"]]

In [None]:
# Load the training targets: keep the generation column and parse the `dtm`
# timestamps into a `valid_time` column, which replaces `dtm`.

training = (
    pd.read_csv("data/training_data.csv")[["dtm", "solar_generation_MW"]]
    .assign(valid_time=lambda frame: pd.to_datetime(frame["dtm"]))
    .drop(columns=["dtm"])
)

In [None]:
# Load weather data
# The NetCDF file is read fully into memory as an xarray Dataset. Later cells
# read its `latitude` / `longitude` coordinates and its "ssrd" variable.

nwp = xr.load_dataset("data/hres_1day_south_scotland_202101_202306.nc")

In [None]:
# Names of additional weather variables of interest.
# NOTE(review): this list is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
solar_variables = [
    "t2m",
    "d2m",
    "lcc",
    "mcc",
    "hcc",
    "tp",
]

In [None]:
# Candidate coordinates: every latitude and every longitude present on the
# NWP grid. (An earlier 4-point np.linspace subsample was computed here and
# immediately overwritten by these full-grid lists, so it has been removed.)

lat_range = nwp.latitude.values.tolist()
lon_range = nwp.longitude.values.tolist()

In [None]:
# Boundary geometries read from lad.json; the next cell uses them to keep only
# the grid points that fall inside at least one geometry.
# NOTE(review): the file name suggests local authority districts — confirm.
scotland_gdf = gpd.read_file("lad.json")

In [None]:
# Every (latitude, longitude) pairing of the two ranges, one pairing per row.
lat_grid, lon_grid = np.meshgrid(lat_range, lon_range, indexing="ij")
cross_array = np.stack([lat_grid, lon_grid], axis=-1).reshape(-1, 2)

# shapely points take (x, y), i.e. (longitude, latitude).
points = [Point(pair[1], pair[0]) for pair in cross_array]

# Keep only the points contained in at least one geometry of scotland_gdf.
inside_points = []
for candidate in points:
    if scotland_gdf.contains(candidate).any():
        inside_points.append(candidate)

# Back to (latitude, longitude) rows for the extraction loop.
inside_array = np.array([[kept.y, kept.x] for kept in inside_points])

In [None]:
# Build one half-hourly radiation frame per grid point inside the boundary and
# concatenate them in a single call. Growing a frame with pd.concat inside the
# loop copies all accumulated rows on every iteration, and seeding it with an
# empty DataFrame relies on pandas discarding that empty frame to keep the
# `valid_time` index name.
# NOTE(review): latitude is rounded to 1 decimal and longitude to 2; confirm
# this cannot merge distinct grid points.
location_frames = [
    get_ssrd_from_netcfd(nwp, lon, lat).assign(
        latitude=round(lat, 1),
        longitude=round(lon, 2),
    )
    for lat, lon in inside_array
]

if not location_frames:
    raise ValueError("No NWP grid points fall inside the boundary geometries")

ssrd = pd.concat(location_frames).reset_index()
# The extracted timestamps are timezone-naive; label them as UTC.
ssrd["valid_time"] = ssrd["valid_time"].dt.tz_localize("UTC")

In [None]:
# Inner-join generation with radiation on the timestamp, giving one row per
# (timestamp, grid point). The key is named explicitly so the join does not
# silently change if either frame later gains another shared column.
training_radiation = training.merge(ssrd, on="valid_time")

In [None]:
# Scatter of generation against radiation on a random sample of rows. The seed
# is fixed so the figure is reproducible, and the sample size is capped at the
# frame length so a short frame does not raise.
scatter_sample = training_radiation.sample(
    n=min(1000, len(training_radiation)),
    random_state=42,
)
scatter_sample.plot(x="radiation", y="solar_generation_MW", kind="scatter");

In [None]:
# Correlation between generation and radiation for each grid point.
# groupby(...).corr() returns one 2x2 matrix per (latitude, longitude); the
# entry in row "solar_generation_MW", column "radiation" is the value we want.
per_location_corr = training_radiation.groupby(by=["latitude", "longitude"])[
    ["solar_generation_MW", "radiation"]
].corr()

correlations = (
    per_location_corr.xs("solar_generation_MW", level=-1)["radiation"]
    .rename("correlation")
    .reset_index()
)

correlations["correlation"].hist(bins=200)

In [None]:
# Map of the per-grid-point correlation drawn over the boundary geometries,
# with a red rectangle marking the extent of the NWP grid.

# Not used by this cell; kept in case later cells rely on these names.
latitudes = nwp.latitude.values.tolist()
longitudes = nwp.longitude.values.tolist()

fig, ax = plt.subplots(figsize=(10, 10))

scotland_gpd_df = gpd.read_file("lad.json")
scotland_plot = scotland_gpd_df.plot(ax=ax)

# Convert the 0-d xarray reductions to plain floats before handing them to
# matplotlib, then pad the grid extent by 0.05 on every side.
lon_min, lon_max = float(nwp.longitude.min()), float(nwp.longitude.max())
lat_min, lat_max = float(nwp.latitude.min()), float(nwp.latitude.max())

rect = Rectangle(
    (lon_min - 0.05, lat_min - 0.05),
    (lon_max - lon_min) + 0.1,
    (lat_max - lat_min) + 0.1,
    linewidth=1,
    edgecolor="r",
    facecolor="none",
)

correlation_points = scotland_plot.scatter(
    correlations["longitude"],
    correlations["latitude"],
    c=correlations["correlation"],
    s=50,
    cmap="coolwarm",
)
# Without a colorbar the colour encoding of the points cannot be read.
fig.colorbar(correlation_points, ax=ax, shrink=0.6, label="correlation")

# Final view window (the wider limits set before plotting were always
# overridden by these, so only this pair is kept).
ax.set_ylim(54.5, 56.6)
ax.set_xlim(-6, -1.8)

ax.set_xlabel("longitude")
ax.set_ylabel("latitude")
ax.set_title("Correlation of radiation with solar generation by grid point")

ax.add_patch(rect)

plt.show()

In [None]:
# Calendar features derived from the timestamp, plus sine/cosine encodings so
# that the end of a cycle sits next to its start.
timestamps = training_radiation["valid_time"]

day_of_year = timestamps.dt.dayofyear
# Half-hour slot of the day: two slots per hour plus the minute offset.
half_hour_of_day = timestamps.dt.hour * 2 + timestamps.dt.minute / 30

training_radiation["days_since_start_of_year"] = day_of_year
training_radiation["half_hour"] = half_hour_of_day

# Yearly cycle uses a fixed period of 365; daily cycle has 48 half-hours.
day_angle = 2 * np.pi * training_radiation["days_since_start_of_year"] / 365
half_hour_angle = 2 * np.pi * training_radiation["half_hour"] / 48

training_radiation["sin_days"] = np.sin(day_angle)
training_radiation["cos_days"] = np.cos(day_angle)
training_radiation["sin_hh"] = np.sin(half_hour_angle)
training_radiation["cos_hh"] = np.cos(half_hour_angle)

In [None]:
# Single-row frame holding the grid point with the highest correlation.
ranked_locations = correlations.sort_values(by="correlation", ascending=False)
select_location = ranked_locations.head(1).reset_index(drop=True)

In [None]:
# Coordinates of the selected grid point (row label 0 of the one-row frame).
select_latitude = select_location.at[0, "latitude"]
select_longitude = select_location.at[0, "longitude"]

In [None]:
# Rows of the merged frame that belong to the selected grid point. Exact
# equality is used on the coordinate columns, whose values were taken from
# those same columns via `correlations`.
latitude_matches = training_radiation["latitude"] == select_latitude
longitude_matches = training_radiation["longitude"] == select_longitude

select_location_df = training_radiation[
    latitude_matches & longitude_matches
].reset_index(drop=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, SGDRegressor, HuberRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

# Model inputs. The cyclical columns (sin_days, cos_days, sin_hh, cos_hh) are
# present on the frame but are currently left out of the feature set.
selected_features = ["radiation"]

# Number of trailing rows held out for evaluation.
n_test_rows = 2000

# With too few rows the original head()/tail() split would silently overlap
# and leak test rows into training, so fail loudly instead.
if len(select_location_df) <= n_test_rows:
    raise ValueError(
        f"Need more than {n_test_rows} rows to split, got {len(select_location_df)}"
    )

# Hold out the last rows instead of sampling at random.
# NOTE(review): assumes the rows are in chronological order — confirm.
# The test slice is copied so a prediction column can be added to it later
# without pandas raising SettingWithCopyWarning.
select_location_df_test = select_location_df.tail(n_test_rows).copy()
select_location_df_train = select_location_df.iloc[:-n_test_rows]

X_test = select_location_df_test[selected_features]
y_test = select_location_df_test["solar_generation_MW"]

X_train = select_location_df_train[selected_features]
y_train = select_location_df_train["solar_generation_MW"]

In [None]:
# Candidate regressors, all with default hyper-parameters. Estimators that use
# randomness get a fixed seed so the reported errors are reproducible across
# runs. SVR is intentionally left out of the comparison.
RANDOM_STATE = 42

models = [
    # Linear family
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    BayesianRidge(),
    HuberRegressor(),
    # Tree ensembles
    RandomForestRegressor(random_state=RANDOM_STATE),
    # Instance based
    KNeighborsRegressor(),
    # Boosting, neural network and single tree
    AdaBoostRegressor(random_state=RANDOM_STATE),
    MLPRegressor(random_state=RANDOM_STATE),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
]

In [None]:
# Fit every candidate on the training rows and report its mean absolute error
# on the held-out rows. Scores are also collected in `model_mae`, keyed by
# estimator class name, so they can be compared after the loop.
# The former `del model` only removed the loop variable; the fitted estimator
# stays referenced by `models`, so nothing was freed and it has been dropped.
model_mae = {}

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    model_mae[type(model).__name__] = mae
    print(model)
    print(round(mae, 2))
    print()

# After the loop `y_pred` holds the predictions of the LAST model in `models`;
# the following cells plot that model's output.

In [None]:
# Attach the predictions left in `y_pred` by the fitting loop. `assign` builds
# a new frame rather than writing into a slice of select_location_df, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
select_location_df_test = select_location_df_test.assign(y_pred=y_pred)

In [None]:
# Interactive comparison of actual generation and predictions over the
# held-out period.
px.line(
    select_location_df_test,
    x="valid_time",
    y=["solar_generation_MW", "y_pred"],
)