In [25]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Fix the state of NumPy's global random number generator so that random draws
# made later in the notebook are repeatable from run to run.
SEED = 31415
np.random.seed(SEED)



### Ideas:

#### How well does a model trained on one solar farm transfer to another?
#### Does using sine and cosine encodings, as compared to non-cyclical encodings, make a difference?
#### Does stacking improve the results as compared to a baseline model?

# Load PV_01 Data

In [26]:
# Load the PV_01 measurements; the fields in this file are separated by semicolons.
df = pd.read_csv(
    filepath_or_buffer='GermanSolarFarm/data/pv_01.csv',
    sep=';',
)

## First look at the data

In [27]:
# Preview the first 20 rows. The column limit is lifted only inside this context,
# so no column is elided from the rendered table.
with pd.option_context('display.max_columns', None):
    display(df.iloc[:20])

Unnamed: 0,time_idx,hour_of_day,hour_of_day_cos,hour_of_day_sin,month_of_year,month_of_year_cos,month_of_year_sin,season_of_year,season_of_year_cos,season_of_year_sin,sunposition_thetaZ,sunposition_solarAzimuth,sunposition_extraTerr,sunposition_solarHeight,clearsky_diffuse,clearsky_direct,clearsky_global,clearsky_diffuse_agg,clearsky_direct_agg,clearsky_global_agg,Albedo,WindComponentUat0,WindComponentVat0,WindComponentUat100,WindComponentVat100,DewpointTemperatureAt0,TemperatureAt0,PotentialVorticityAt1000,PotentialVorticityAt950,RelativeHumidityAt1000,RelativeHumidityAt950,RelativeHumidityAt0,SnowDensityAt0,SnowDepthAt0,SnowfallPlusStratiformSurfaceAt0,SurfacePressureAt0,SolarRadiationGlobalAt0,SolarRadiationDirectAt0,SolarRadiationDiffuseAt0,TotalCloudCoverAt0,LowerWindSpeed,LowerWindDirection,LowerWindDirectionMath,LowerWindDirectionCos,LowerWindDirectionSin,UpperWindSpeed,UpperWindDirection,UpperWindDirectionMath,UpperWindDirectionCos,UpperWindDirectionSin,power_normed,Unnamed: 51
0,1.0,0.0,1.0,0.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.250553,3.654783e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167936,0.425266,0.653892,0.481074,0.648675,0.654981,0.426538,0.457604,0.211843,0.847222,0.89981,0.897686,9.062803e-08,0.0,3.9947439999999997e-19,0.314882,0.0,0.0,0.000497,0.96875,0.29408,0.558633,0.190752,0.68174,0.965801,0.359087,0.577455,0.172559,0.733785,0.941978,0.0,
1,2.0,0.142857,0.957824,0.399332,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.250817,3.659645e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167554,0.40716,0.75098,0.459453,0.756602,0.661824,0.434919,0.46538,0.257749,0.833333,0.884296,0.892039,9.062803e-08,0.0,3.9947439999999997e-19,0.231609,0.0,0.0,0.000497,1.0,0.399157,0.531501,0.217852,0.600192,0.989859,0.475393,0.545543,0.204424,0.641217,0.979643,0.0,
2,3.0,0.285714,0.838278,0.732544,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.251081,3.664501e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167554,0.544557,0.667997,0.600537,0.643652,0.597057,0.399884,0.529615,0.255517,0.756944,0.760184,0.831692,9.062803e-08,0.0,3.9947439999999997e-19,0.2312,0.0,0.0,0.000497,1.0,0.410575,0.623444,0.12602,0.851167,0.855924,0.472248,0.635529,0.114573,0.875928,0.829663,0.0,
3,4.0,0.428571,0.661157,0.944463,0.909091,0.017332,0.258819,1.0,0.0,0.0,0.708461,0.379841,0.3652606,0.291539,0.449027,0.250852,0.291822,0.268048,0.11097,0.143219,0.167554,0.688942,0.657905,0.708066,0.614184,0.598159,0.406052,0.46102,0.224923,0.743056,0.873953,0.818583,9.062803e-08,0.0,3.9947439999999997e-19,0.248368,0.057197,0.005365,0.117925,0.976562,0.581901,0.669254,0.080266,0.937664,0.741765,0.591015,0.673493,0.076666,0.943093,0.731664,0.035185,
4,5.0,0.571429,0.455788,1.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,0.642045,0.555848,0.4449296,0.357955,0.516983,0.346842,0.383311,0.516742,0.353931,0.389707,0.167554,0.761857,0.593752,0.763622,0.552329,0.5598,0.40703,0.435287,0.214737,0.625,0.837754,0.728758,9.062803e-08,0.0,3.9947439999999997e-19,0.261169,0.244397,0.149812,0.267378,0.703125,0.653922,0.698923,0.050633,0.97485,0.656581,0.641152,0.701128,0.049073,0.976413,0.651756,0.120988,
5,6.0,0.714286,0.256176,0.88996,0.909091,0.017332,0.258819,1.0,0.0,0.0,0.906693,0.715489,0.1185731,0.093307,0.219087,0.054751,0.087215,0.393716,0.207071,0.24613,0.167554,0.696794,0.636432,0.741356,0.607739,0.52314,0.401475,0.499192,0.223085,0.527778,0.718814,0.660145,9.062803e-08,0.0,3.9947439999999997e-19,0.28243,0.174044,0.096567,0.209285,0.421875,0.580611,0.67702,0.072509,0.948921,0.72016,0.634306,0.681408,0.068763,0.954045,0.709385,0.201235,
6,7.0,0.857143,0.095371,0.632563,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.747869,3.683873e-18,0.0,0.0,0.0,0.0,0.027662,0.003605,0.008412,0.167554,0.708199,0.65432,0.750916,0.627016,0.498032,0.369976,0.499398,0.22126,0.5625,0.760184,0.680852,9.062803e-08,0.0,3.9947439999999997e-19,0.29527,0.004348,0.000536,0.009186,0.78125,0.605988,0.673972,0.075553,0.944629,0.728703,0.658522,0.677493,0.072672,0.948766,0.720473,0.004321,
7,8.0,1.0,0.0,0.270427,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.747607,3.688703e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167554,0.661221,0.649628,0.707356,0.621651,0.520023,0.344619,0.468019,0.221271,0.701389,0.879124,0.794304,9.062803e-08,0.0,3.9947439999999997e-19,0.304307,0.000111,0.0,0.000745,0.242188,0.539887,0.665792,0.083723,0.93231,0.751213,0.594543,0.671081,0.079075,0.939536,0.738343,0.0,
8,9.0,0.0,1.0,0.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.252655,3.693527e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167554,0.6089,0.652655,0.667247,0.634165,0.514629,0.332251,0.452582,0.223473,0.715278,0.889467,0.813337,9.062803e-08,0.0,3.9947439999999997e-19,0.292988,0.0,0.0,0.000497,0.320312,0.474159,0.651299,0.098199,0.907709,0.789437,0.548595,0.658273,0.091864,0.918987,0.772855,0.0,
9,10.0,0.142857,0.957824,0.399332,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.252917,3.698346e-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16715,0.445719,0.679126,0.51293,0.715492,0.494087,0.333745,0.484505,0.23965,0.631944,0.755013,0.762055,9.062803e-08,0.0,3.9947439999999997e-19,0.235231,0.0,0.0,0.000497,1.0,0.334415,0.567169,0.182227,0.706414,0.955404,0.458441,0.578866,0.171151,0.737686,0.939892,0.0,


In [28]:
# List the distinct values stored in the 'Unnamed: 51' column
df.loc[:, 'Unnamed: 51'].unique()

array([nan])

After checking the data, this column seems to be a parsing artifact: every row ends with a trailing ';', so an additional column containing only NaN values is created when the file is read. The column can, therefore, be dropped.

In [29]:
# Drop the 'Unnamed: 51' column.
# errors='ignore' keeps this cell re-runnable: once the column has been removed,
# executing the cell again is a no-op instead of raising a KeyError.
df = df.drop(columns='Unnamed: 51', errors='ignore')

First, the entire dataset apart from the target is scaled from 0 to 1 using min-max normalization. The target, on the other hand, was scaled using the output capacity of the entire solar farm. This should, in theory, negate the effects of different nominal capacities.

Explanation for every column (unfortunately, this was not provided; therefore, some interpretations might be flawed):
Every value is scaled between 0 and 1, and some features additionally come with a sine and cosine encoding. We will describe each feature only once, focusing on the base value itself. Some columns also carry a `Math` postfix; it is unclear what is meant by that.


## Dataset Overview

The dataset consists of multiple features (columns) that have been preprocessed to ensure consistency and enable effective modeling. The preprocessing steps include scaling the values and encoding specific features.

### Preprocessing

1. **Min-max normalization**: All features, except for the target variable, have been scaled from 0 to 1 using min-max normalization. This process transforms the original values in each column to a standardized scale, making it easier for machine learning algorithms to converge faster and improve overall performance.

2. **Target variable scaling**: The target variable, which seems to be related to solar farm output, has been scaled using the output capacity of the entire solar farm. This approach normalizes the target variable across different solar farms with varying nominal capacities, enabling fair comparisons and analysis.

### Overall column Explanations

1. **Sine and cosine encoding**: Some of the features in the dataset have been encoded using sine and cosine transformations. This is often done for cyclical features, such as time, to capture their cyclical nature and make it easier for the model to understand the underlying patterns.

2. **Mathematical postfix**: Certain columns in the dataset carry a `Math` postfix (e.g. `LowerWindDirectionMath`). Without additional context, it's difficult to accurately interpret the meaning of this postfix. It could be a notation system used to differentiate between different types of values or measurements in the dataset, but further information would be required to provide a definitive interpretation.

### Column Explanation
| Column Name                      | Description                                                                                                                                                                                                                                                                                                                                                        |
|----------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| time_idx                         | A unique identifier for each point in the data.                                                                                                                                                                                                                                                                                                                    |
| hour_of_day                      | The hour of the day the data was collected ranged from 0 to 1 (3-hour resolution).                                                                                                                                                                                                                                                                                 |
| month_of_year                    | The month of the year the data was collected.                                                                                                                                                                                                                                                                                                                      |
| season_of_year                   | The season of the year the data was collected.                                                                                                                                                                                                                                                                                                                     |
| sunposition_thetaZ               | The zenith angle of the sun in degrees, which is the angle between the sun and the vertical. https://pvpmc.sandia.gov/modeling-steps/1-weather-design-inputs/sun-position/                                                                                                                                                                                         |
| sunposition_solarAzimuth         | The solar azimuth angle is the azimuth (horizontal angle with respect to north) of the Sun's position. This horizontal coordinate defines the Sun's relative direction along the local horizon, whereas the solar zenith angle (or its complementary angle solar elevation) defines the Sun's apparent altitude. https://en.wikipedia.org/wiki/Solar_azimuth_angle |
| sunposition_extraTerr            | Extraterrestrial radiation ($E_a$) is the intensity (power) of the sun at the top of the Earth’s atmosphere. https://pvpmc.sandia.gov/modeling-steps/1-weather-design-inputs/irradiance-and-insolation-2/extraterrestrial-radiation/                                                                                                                               |
| sunposition_solarHeight          | The height of the sun above the horizon in degrees. https://en.wikipedia.org/wiki/Position_of_the_Sun#:~:text=At%20the%20solstices%2C%20the%20angle,at%20the%20southern%20summer%20solstice.                                                                                                                                                                       |
| clearsky_diffuse                 | The amount of diffuse radiation on a horizontal surface under clear sky conditions in W/m^2.                                                                                                                                                                                                                                                                       |
| clearsky_direct                  | The amount of direct radiation on a horizontal surface under clear sky conditions in W/m^2.                                                                                                                                                                                                                                                                        |
| clearsky_global                  | The total amount of radiation on a horizontal surface under clear sky conditions in W/m^2.                                                                                                                                                                                                                                                                         |
| clearsky_diffuse_agg             | The amount of diffuse radiation on a horizontal surface under clear sky conditions in W/m^2, likely aggregated over the day.                                                                                                                                                                                                                                       |
| clearsky_direct_agg              | The amount of direct radiation on a horizontal surface under clear sky conditions in W/m^2, likely aggregated over the day.                                                                                                                                                                                                                                        |
| clearsky_global_agg              | The total amount of radiation on a horizontal surface under clear sky conditions in W/m^2, likely aggregated over the day.                                                                                                                                                                                                                                         |
| Albedo                           | The fraction of solar energy reflected by a surface, ranging from 0 to 1. https://en.wikipedia.org/wiki/Albedo                                                                                                                                                                                                                                                     |
| WindComponentUat0                | Not 100% sure but likely windspeed at the ground in a certain direction in m/s. http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv                                                                                                                                                                                                                           |
| WindComponentVat0                | Not 100% sure but likely windspeed at the ground in a certain direction in m/s.  http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv                                                                                                                                                                                                                          |
| WindComponentUat100              | Not 100% sure but likely windspeed at 100 m above the ground in a certain direction in m/s. http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv                                                                                                                                                                                                               |
| WindComponentVat100              | Not 100% sure but likely windspeed at 100 m above the ground in a certain direction in m/s. http://colaweb.gmu.edu/dev/clim301/lectures/wind/wind-uv                                                                                                                                                                                                               |
| DewpointTemperatureAt0           | The temperature at which dew would form at ground level in degrees Celsius.                                                                                                                                                                                                                                                                                        |
| TemperatureAt0                   | The temperature at ground level.                                                                                                                                                                                                                                                                                                                                   |
| PotentialVorticityAt1000         | Likely: The potential vorticity at a pressure level of 1000 hPa in PVU (potential vorticity unit). https://en.wikipedia.org/wiki/Potential_vorticity                                                                                                                                                                                                               |
| PotentialVorticityAt950          | Likely: The potential vorticity at a pressure level of 950 hPa in PVU (potential vorticity unit).  https://en.wikipedia.org/wiki/Potential_vorticity                                                                                                                                                                                                               |
| RelativeHumidityAt1000           | Likely: The relative humidity at a pressure level of 1000 hPa. https://www.lenntech.com/calculators/humidity/relative-humidity.htm                                                                                                                                                                                                                                 |
| RelativeHumidityAt950            | Likely: The relative humidity at a pressure level of 950 hPa, expressed as a percentage. https://www.lenntech.com/calculators/humidity/relative-humidity.htm                                                                                                                                                                                                       |
| RelativeHumidityAt0              | The relative humidity at ground level.                                                                                                                                                                                                                                                                                                                             |
| SnowDensityAt0                   | The density of snow on the ground.                                                                                                                                                                                                                                                                                                                                 |
| SnowDepthAt0                     | The depth of snow on the ground.                                                                                                                                                                                                                                                                                                                                   |
| SnowfallPlusStratiformSurfaceAt0 | The amount of snowfall and stratiform precipitation at ground level.                                                                                                                                                                                                                                                                                               |
| SurfacePressureAt0               | The air pressure at ground level.                                                                                                                                                                                                                                                                                                                                  |
| SolarRadiationGlobalAt0          | The total amount of solar radiation on a horizontal surface at ground level.                                                                                                                                                                                                                                            |
| SolarRadiationDirectAt0          | The amount of direct radiation on a horizontal surface at ground level.                                                                                                                                                                                                                                                                                            |
| SolarRadiationDiffuseAt0         | The amount of diffuse radiation on a horizontal surface at ground level.                                                                                                                                                                                                                                                                                           |
| TotalCloudCoverAt0               | The total cloud cover at ground level.                                                                                                                                                                                                                                                                                                                             |
| LowerWindSpeed                   | The average wind speed in (likely) the lower atmosphere.                                                                                                                                                                                                                                                                                                           |
| LowerWindDirection               | The average wind direction in the lower atmosphere.                                                                                                                                                                                                                                                                                                                |
| LowerWindDirectionMath           | The mathematical wind direction in the lower atmosphere.                                                                                                                                                                                                                                                                                                           |
| UpperWindSpeed                   | The average wind speed in (likely) the upper atmosphere.                                                                                                                                                                                                                                                                                                           |
| UpperWindDirection               | The average wind direction in the upper atmosphere.                                                                                                                                                                                                                                                                                                                |
| UpperWindDirectionMath           | Likely the mathematical wind direction in the upper atmosphere in degrees.                                                                                                                                                                                                                                                                                         |
| power_normed                     | The normalized power output of the solar panels, ranging from 0 to 1.                                                                                                                                                                                                                                                                                        


In summary, the dataset has undergone various preprocessing steps to ensure consistency and facilitate effective modeling. The features have been scaled and encoded as necessary, but some column descriptions remain unclear due to the lack of information.





### Size of the dataset


In [30]:
# Number of rows = number of measurements in the dataframe
n_measurements = df.shape[0]
print(f"Total number of measurements: {n_measurements}")
# Divide the row count by 8 to get the number of days
# (8 measurements per day = a resolution of 3 hours)
print(f"Number of days measured: {n_measurements / 8}")

Total number of measurements: 6217
Number of days measured: 777.125


This is a little odd, as the lecture (2-Energy, p. 41) stated 990 days; however, the data provided only seems to include 777 days. There does not seem to be a predefined train-test split.

### Datatypes in the dataset

In [31]:
# List the data type of every column.
# The row limit is lifted inside this context so the listing is not truncated.
with pd.option_context('display.max_rows', None):
    column_dtypes = df.dtypes
    display(column_dtypes)

time_idx                            float64
hour_of_day                         float64
hour_of_day_cos                     float64
hour_of_day_sin                     float64
month_of_year                       float64
month_of_year_cos                   float64
month_of_year_sin                   float64
season_of_year                      float64
season_of_year_cos                  float64
season_of_year_sin                  float64
sunposition_thetaZ                  float64
sunposition_solarAzimuth            float64
sunposition_extraTerr               float64
sunposition_solarHeight             float64
clearsky_diffuse                    float64
clearsky_direct                     float64
clearsky_global                     float64
clearsky_diffuse_agg                float64
clearsky_direct_agg                 float64
clearsky_global_agg                 float64
Albedo                              float64
WindComponentUat0                   float64
WindComponentVat0               

The output looks good: we would expect all columns to hold numeric values, which is the case.

In [32]:
# Quick look at the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,time_idx,hour_of_day,hour_of_day_cos,hour_of_day_sin,month_of_year,month_of_year_cos,month_of_year_sin,season_of_year,season_of_year_cos,season_of_year_sin,...,LowerWindDirection,LowerWindDirectionMath,LowerWindDirectionCos,LowerWindDirectionSin,UpperWindSpeed,UpperWindDirection,UpperWindDirectionMath,UpperWindDirectionCos,UpperWindDirectionSin,power_normed
0,1.0,0.0,1.0,0.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,...,0.558633,0.190752,0.68174,0.965801,0.359087,0.577455,0.172559,0.733785,0.941978,0.0
1,2.0,0.142857,0.957824,0.399332,0.909091,0.017332,0.258819,1.0,0.0,0.0,...,0.531501,0.217852,0.600192,0.989859,0.475393,0.545543,0.204424,0.641217,0.979643,0.0
2,3.0,0.285714,0.838278,0.732544,0.909091,0.017332,0.258819,1.0,0.0,0.0,...,0.623444,0.12602,0.851167,0.855924,0.472248,0.635529,0.114573,0.875928,0.829663,0.0
3,4.0,0.428571,0.661157,0.944463,0.909091,0.017332,0.258819,1.0,0.0,0.0,...,0.669254,0.080266,0.937664,0.741765,0.591015,0.673493,0.076666,0.943093,0.731664,0.035185
4,5.0,0.571429,0.455788,1.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,...,0.698923,0.050633,0.97485,0.656581,0.641152,0.701128,0.049073,0.976413,0.651756,0.120988


## Feature reduction
Given the large number of features in the dataset, some of which may be difficult to interpret without domain knowledge in meteorology, it can be challenging to identify meaningful relationships among the variables. Therefore, in order to facilitate model interpretability and potentially improve prediction performance, we propose reducing the number of features. Specifically, we will focus on selecting features that have been previously identified as having a significant impact on model performance, as discussed in the relevant lecture. This process will allow us to mitigate the "curse of dimensionality" and improve the efficiency of our modeling efforts.


We propose excluding certain columns from the dataset for the following reasons:

- <b>"WindComponentUat0, WindComponentVat0, WindComponentUat100, and WindComponentVat100"</b> will be excluded as their meaning and impact on power output prediction are unclear. Instead, we will rely on the more easily interpretable LowerWindSpeed and UpperWindSpeed columns for information about wind speed.

- <b>"clearsky_diffuse, clearsky_direct, clearsky_global, clearsky_diffuse_agg, clearsky_direct_agg, and clearsky_global_agg"</b> will be excluded due to their potential difficulty in interpretation and dependence on cloud cover. Cloud cover, which is measured separately in the TotalCloudCoverAt0 column, can be a more informative metric in predicting solar power output.

- <b>"RelativeHumidityAt1000 and RelativeHumidityAt950"</b> columns, while containing potentially valuable information, may be difficult for the model to interpret due to the use of three different pressure measurements at varying altitudes. Therefore, we will focus solely on the RelativeHumidityAt0 column, which is measured at ground level.

- <b>"PotentialVorticityAt950"</b> will be excluded since the PotentialVorticityAt1000 measurement should be sufficient for modeling purposes.



In [33]:
# Remove every feature excluded in the "Feature reduction" section with a single drop call.
# errors='ignore' keeps this cell re-runnable: columns that were already removed by an
# earlier execution are skipped instead of raising a KeyError.
df = df.drop(
    columns=[
        # WindComponentUat0, WindComponentVat0, WindComponentUat100 and WindComponentVat100
        'WindComponentUat0', 'WindComponentVat0', 'WindComponentUat100', 'WindComponentVat100',
        # clearsky_diffuse, clearsky_direct, clearsky_global and their *_agg counterparts
        'clearsky_diffuse', 'clearsky_direct', 'clearsky_global',
        'clearsky_diffuse_agg', 'clearsky_direct_agg', 'clearsky_global_agg',
        # RelativeHumidityAt1000 and RelativeHumidityAt950
        'RelativeHumidityAt1000', 'RelativeHumidityAt950',
        # PotentialVorticityAt950
        'PotentialVorticityAt950',
    ],
    errors='ignore',
)

In [34]:
# Preview the first 20 rows of the reduced dataframe. The column limit is lifted
# only inside this context, so no column is elided from the rendered table.
with pd.option_context('display.max_columns', None):
    display(df.iloc[:20])

Unnamed: 0,time_idx,hour_of_day,hour_of_day_cos,hour_of_day_sin,month_of_year,month_of_year_cos,month_of_year_sin,season_of_year,season_of_year_cos,season_of_year_sin,sunposition_thetaZ,sunposition_solarAzimuth,sunposition_extraTerr,sunposition_solarHeight,Albedo,DewpointTemperatureAt0,TemperatureAt0,PotentialVorticityAt1000,RelativeHumidityAt0,SnowDensityAt0,SnowDepthAt0,SnowfallPlusStratiformSurfaceAt0,SurfacePressureAt0,SolarRadiationGlobalAt0,SolarRadiationDirectAt0,SolarRadiationDiffuseAt0,TotalCloudCoverAt0,LowerWindSpeed,LowerWindDirection,LowerWindDirectionMath,LowerWindDirectionCos,LowerWindDirectionSin,UpperWindSpeed,UpperWindDirection,UpperWindDirectionMath,UpperWindDirectionCos,UpperWindDirectionSin,power_normed
0,1.0,0.0,1.0,0.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.250553,3.654783e-18,0.0,0.167936,0.654981,0.426538,0.457604,0.897686,9.062803e-08,0.0,3.9947439999999997e-19,0.314882,0.0,0.0,0.000497,0.96875,0.29408,0.558633,0.190752,0.68174,0.965801,0.359087,0.577455,0.172559,0.733785,0.941978,0.0
1,2.0,0.142857,0.957824,0.399332,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.250817,3.659645e-18,0.0,0.167554,0.661824,0.434919,0.46538,0.892039,9.062803e-08,0.0,3.9947439999999997e-19,0.231609,0.0,0.0,0.000497,1.0,0.399157,0.531501,0.217852,0.600192,0.989859,0.475393,0.545543,0.204424,0.641217,0.979643,0.0
2,3.0,0.285714,0.838278,0.732544,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.251081,3.664501e-18,0.0,0.167554,0.597057,0.399884,0.529615,0.831692,9.062803e-08,0.0,3.9947439999999997e-19,0.2312,0.0,0.0,0.000497,1.0,0.410575,0.623444,0.12602,0.851167,0.855924,0.472248,0.635529,0.114573,0.875928,0.829663,0.0
3,4.0,0.428571,0.661157,0.944463,0.909091,0.017332,0.258819,1.0,0.0,0.0,0.708461,0.379841,0.3652606,0.291539,0.167554,0.598159,0.406052,0.46102,0.818583,9.062803e-08,0.0,3.9947439999999997e-19,0.248368,0.057197,0.005365,0.117925,0.976562,0.581901,0.669254,0.080266,0.937664,0.741765,0.591015,0.673493,0.076666,0.943093,0.731664,0.035185
4,5.0,0.571429,0.455788,1.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,0.642045,0.555848,0.4449296,0.357955,0.167554,0.5598,0.40703,0.435287,0.728758,9.062803e-08,0.0,3.9947439999999997e-19,0.261169,0.244397,0.149812,0.267378,0.703125,0.653922,0.698923,0.050633,0.97485,0.656581,0.641152,0.701128,0.049073,0.976413,0.651756,0.120988
5,6.0,0.714286,0.256176,0.88996,0.909091,0.017332,0.258819,1.0,0.0,0.0,0.906693,0.715489,0.1185731,0.093307,0.167554,0.52314,0.401475,0.499192,0.660145,9.062803e-08,0.0,3.9947439999999997e-19,0.28243,0.174044,0.096567,0.209285,0.421875,0.580611,0.67702,0.072509,0.948921,0.72016,0.634306,0.681408,0.068763,0.954045,0.709385,0.201235
6,7.0,0.857143,0.095371,0.632563,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.747869,3.683873e-18,0.0,0.167554,0.498032,0.369976,0.499398,0.680852,9.062803e-08,0.0,3.9947439999999997e-19,0.29527,0.004348,0.000536,0.009186,0.78125,0.605988,0.673972,0.075553,0.944629,0.728703,0.658522,0.677493,0.072672,0.948766,0.720473,0.004321
7,8.0,1.0,0.0,0.270427,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.747607,3.688703e-18,0.0,0.167554,0.520023,0.344619,0.468019,0.794304,9.062803e-08,0.0,3.9947439999999997e-19,0.304307,0.000111,0.0,0.000745,0.242188,0.539887,0.665792,0.083723,0.93231,0.751213,0.594543,0.671081,0.079075,0.939536,0.738343,0.0
8,9.0,0.0,1.0,0.0,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.252655,3.693527e-18,0.0,0.167554,0.514629,0.332251,0.452582,0.813337,9.062803e-08,0.0,3.9947439999999997e-19,0.292988,0.0,0.0,0.000497,0.320312,0.474159,0.651299,0.098199,0.907709,0.789437,0.548595,0.658273,0.091864,0.918987,0.772855,0.0
9,10.0,0.142857,0.957824,0.399332,0.909091,0.017332,0.258819,1.0,0.0,0.0,1.0,0.252917,3.698346e-18,0.0,0.16715,0.494087,0.333745,0.484505,0.762055,9.062803e-08,0.0,3.9947439999999997e-19,0.235231,0.0,0.0,0.000497,1.0,0.334415,0.567169,0.182227,0.706414,0.955404,0.458441,0.578866,0.171151,0.737686,0.939892,0.0


### Check for outliers and missing data

In [35]:
# Summary statistics for every column, rendered without eliding any of them
summary_stats = df.describe()
with pd.option_context("display.max_columns", None):
    display(summary_stats)

Unnamed: 0,time_idx,hour_of_day,hour_of_day_cos,hour_of_day_sin,month_of_year,month_of_year_cos,month_of_year_sin,season_of_year,season_of_year_cos,season_of_year_sin,sunposition_thetaZ,sunposition_solarAzimuth,sunposition_extraTerr,sunposition_solarHeight,Albedo,DewpointTemperatureAt0,TemperatureAt0,PotentialVorticityAt1000,RelativeHumidityAt0,SnowDensityAt0,SnowDepthAt0,SnowfallPlusStratiformSurfaceAt0,SurfacePressureAt0,SolarRadiationGlobalAt0,SolarRadiationDirectAt0,SolarRadiationDiffuseAt0,TotalCloudCoverAt0,LowerWindSpeed,LowerWindDirection,LowerWindDirectionMath,LowerWindDirectionCos,LowerWindDirectionSin,UpperWindSpeed,UpperWindDirection,UpperWindDirectionMath,UpperWindDirectionCos,UpperWindDirectionSin,power_normed
count,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0,6217.0
mean,3109.0,0.49992,0.533149,0.608563,0.544548,0.420717,0.60022,0.548603,0.394155,0.554595,0.800269,0.476646,0.2287822,0.199731,0.280741,0.582749,0.420889,0.479249,0.751955,0.05842311,0.015384,0.01036643,0.559719,0.144934,0.097825,0.142138,0.659605,0.295383,0.564245,0.403288,0.628539,0.608671,0.343724,0.573815,0.411037,0.640362,0.597398,0.109715
std,1794.837644,0.327388,0.365148,0.333564,0.320168,0.358962,0.334571,0.378155,0.385151,0.391409,0.279257,0.287884,0.3065492,0.279257,0.172684,0.151626,0.160634,0.052128,0.17753,0.148668,0.075354,0.05935139,0.150708,0.221917,0.18302,0.195103,0.372568,0.147934,0.230603,0.322571,0.362625,0.300345,0.153723,0.232721,0.328869,0.360847,0.301061,0.178543
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1555.0,0.142857,0.256176,0.270427,0.272727,0.068148,0.258819,0.333333,0.0,0.0,0.656749,0.2423,3.391465e-18,0.0,0.133041,0.471233,0.2935,0.454071,0.648657,9.062803e-08,0.0,3.9947439999999997e-19,0.460421,0.0,0.0,0.000497,0.335938,0.190774,0.387985,0.120118,0.290335,0.378777,0.237028,0.398749,0.118401,0.322074,0.366889,0.0
50%,3109.0,0.428571,0.661157,0.632563,0.545455,0.377014,0.707107,0.666667,0.171573,0.707107,1.0,0.383964,4.996425e-18,0.0,0.283233,0.583336,0.414431,0.468946,0.794304,9.062803e-08,0.0,3.9947439999999997e-19,0.564057,0.017393,0.000805,0.033019,0.84375,0.26072,0.611767,0.292992,0.772748,0.652867,0.331004,0.624248,0.317959,0.790428,0.644975,0.008025
75%,4663.0,0.714286,0.957824,0.88996,0.818182,0.762999,0.866025,1.0,0.585786,0.707107,1.0,0.730422,0.4209835,0.343251,0.445688,0.692286,0.536525,0.497157,0.894825,9.062803e-08,0.0,3.9947439999999997e-19,0.659941,0.218085,0.107296,0.248262,1.0,0.374025,0.731942,0.646943,0.949839,0.88101,0.43657,0.740582,0.704234,0.95391,0.862743,0.148354
max,6217.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.834568


Everything looks good. To make further exploration easier, we drop all math, sin, and cos columns, since these encoded variants are hard for a human to interpret.


In [36]:
# Build a reduced frame without the encoded helper columns, i.e. every column
# whose name contains "cos", "sin" or "math" (case-insensitive)
encoded_markers = ('cos', 'sin', 'math')
encoded_columns = [
    name for name in df.columns
    if any(marker in name.lower() for marker in encoded_markers)
]
df_reduced = df.drop(columns=encoded_columns)

In [37]:
# Preview the first 2000 rows of the reduced frame
df_reduced.iloc[:2000]

Unnamed: 0,time_idx,hour_of_day,month_of_year,season_of_year,sunposition_thetaZ,sunposition_solarAzimuth,sunposition_extraTerr,sunposition_solarHeight,Albedo,DewpointTemperatureAt0,...,SurfacePressureAt0,SolarRadiationGlobalAt0,SolarRadiationDirectAt0,SolarRadiationDiffuseAt0,TotalCloudCoverAt0,LowerWindSpeed,LowerWindDirection,UpperWindSpeed,UpperWindDirection,power_normed
0,1.0,0.000000,0.909091,1.000000,1.000000,0.250553,3.654783e-18,0.000000,0.167936,0.654981,...,0.314882,0.000000,0.000000,0.000497,0.968750,0.294080,0.558633,0.359087,0.577455,0.000000
1,2.0,0.142857,0.909091,1.000000,1.000000,0.250817,3.659645e-18,0.000000,0.167554,0.661824,...,0.231609,0.000000,0.000000,0.000497,1.000000,0.399157,0.531501,0.475393,0.545543,0.000000
2,3.0,0.285714,0.909091,1.000000,1.000000,0.251081,3.664501e-18,0.000000,0.167554,0.597057,...,0.231200,0.000000,0.000000,0.000497,1.000000,0.410575,0.623444,0.472248,0.635529,0.000000
3,4.0,0.428571,0.909091,1.000000,0.708461,0.379841,3.652606e-01,0.291539,0.167554,0.598159,...,0.248368,0.057197,0.005365,0.117925,0.976562,0.581901,0.669254,0.591015,0.673493,0.035185
4,5.0,0.571429,0.909091,1.000000,0.642045,0.555848,4.449296e-01,0.357955,0.167554,0.559800,...,0.261169,0.244397,0.149812,0.267378,0.703125,0.653922,0.698923,0.641152,0.701128,0.120988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996.0,0.428571,0.545455,0.666667,0.184403,0.282726,8.705038e-01,0.815597,0.491479,0.891961,...,0.406723,0.412532,0.220896,0.510179,0.750000,0.210557,0.094284,0.180594,0.088175,0.328189
1996,1997.0,0.571429,0.545455,0.666667,0.016498,0.560383,9.894070e-01,0.983502,0.491479,0.884334,...,0.401599,0.719701,0.548149,0.588381,0.429688,0.366172,0.086747,0.318478,0.090787,0.560905
1997,1998.0,0.714286,0.545455,0.666667,0.344918,0.786007,7.315542e-01,0.655082,0.491479,0.812612,...,0.399199,0.707994,0.609710,0.448361,0.343750,0.462183,0.114897,0.434137,0.116620,0.279835
1998,1999.0,0.857143,0.545455,0.666667,0.793622,0.922593,2.477557e-01,0.206378,0.491479,0.847085,...,0.426075,0.298584,0.218214,0.261420,0.414062,0.315211,0.173271,0.388096,0.179630,0.103086


## Data exploration


Let's first examine the power generation over the different seasons of a year. This is a good indicator to gauge how seasonal our data actually is. As the solar farm data is from Germany, we would expect a big difference between, for example, winter and summer.

In [38]:
# Plot power_normed over time with one coloured trace per season.
#
# season_of_year is scaled to [0, 1] and takes four levels: 0, 1/3, 2/3 and 1.
# The rows displayed earlier pair month_of_year 0.909 (10/11, presumably
# November) with season 1.0, and month_of_year 0.545 (6/11, presumably July)
# with season 0.667. The levels therefore run winter, spring, summer, fall in
# increasing order. The previous mapping labelled 1.0 as winter, 0.0 as spring,
# 0.33 as summer and 0.67 as fall, which put the wrong season name on every row.
SEASON_NAMES = {0: "winter", 1: "spring", 2: "summer", 3: "fall"}

df_name = df_reduced.copy()
# Scale back to the integer level (0..3) before the lookup so that the float
# representation of the thirds cannot send a row to the wrong label.
df_name['season_of_year_name'] = (
    df_reduced['season_of_year'].mul(3).round().astype(int).map(SEASON_NAMES)
)

fig = px.line(df_name, x='time_idx', y='power_normed', color='season_of_year_name')
fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Normalized Power",
    legend_title="Season of Year",
    title="Normalized Power of a Solar Farm over Time",
    font=dict(
        family="Montserrat, monospace",
        size=12,
        color="#7f7f7f"
    )
)
fig.show()

The chart looks like we would expect with peak production taking place in summer and winter being the weakest over all. We should account for the high seasonality of our data in a later step.

Another sanity check we can perform is to look at the snow column, as we would expect snow not to be present in summer.

In [39]:
# Sanity check: SnowDepthAt0 over time, one coloured trace per season label
snow_layout = {
    "xaxis_title": "Time",
    "yaxis_title": "Normalized Snow Depth",
    "legend_title": "Season of Year",
    "title": "Normalized Snow depth of a Solar Farm over Time",
    "font": {
        "family": "Montserrat, monospace",
        "size": 12,
        "color": "#7f7f7f",
    },
}

fig = px.line(df_name, x='time_idx', y='SnowDepthAt0', color='season_of_year_name')
fig.update_layout(**snow_layout)
fig.show()

This is a little odd as there seems to be snow in summer. We can double check if there was snowfall in Germany at that time. TODO: check weather for this time period 

To explore the dataset some more, it will be helpful to get a better understanding of the different correlations in our data. Therefore, we will calculate the correlation of every value with the target.

In [40]:
# Correlation of every column with the target, strongest positive first
target_correlations = df_reduced.corr()['power_normed']
target_correlations.sort_values(ascending=False)

power_normed                        1.000000
SolarRadiationGlobalAt0             0.928351
SolarRadiationDirectAt0             0.892979
sunposition_solarHeight             0.807046
sunposition_extraTerr               0.806146
SolarRadiationDiffuseAt0            0.800645
TemperatureAt0                      0.510028
Albedo                              0.304486
sunposition_solarAzimuth            0.232058
DewpointTemperatureAt0              0.198577
hour_of_day                         0.193858
SurfacePressureAt0                  0.064476
time_idx                            0.033198
LowerWindSpeed                      0.009979
LowerWindDirection                  0.006156
UpperWindDirection                 -0.006571
SnowfallPlusStratiformSurfaceAt0   -0.061711
SnowDepthAt0                       -0.074284
season_of_year                     -0.103832
month_of_year                      -0.108827
SnowDensityAt0                     -0.112075
TotalCloudCoverAt0                 -0.139029
UpperWindS

**Interpretation of the correlations:**<br>
- Albedo (0.304486): Albedo has a weak positive correlation with power generation, indicating that a higher surface reflectivity might result in slightly increased power generation due to increased reflected radiation.


- sunposition_solarAzimuth (0.232058): Solar azimuth has a weaker positive correlation with power generation, suggesting that the sun's position might have a minor impact on power generation, depending on the solar panel's orientation. From this, we can derive that these are likely stationary solar panels. If they could orientate themselves, this should not be a relevant factor.


- DewpointTemperatureAt0 (0.198577): There is a weak positive correlation between dew point temperature and power generation, indicating that atmospheric conditions affecting dew point temperature might have a minor influence on power generation.


- hour_of_day (0.193858): Hour of the day has a weak positive correlation with power generation. This factor is likely not as pronounced as initially expected because of the different seasons. For example, while a solar panel might produce a lot of energy at 6 pm during the summer, it may not produce any during the winter.


- SurfacePressureAt0 (0.064476): Surface pressure shows a very weak positive correlation with power generation, indicating that it might not have a significant impact on solar energy production. This is interesting as I thought the pressure could correlate with cloud cover.


- time_idx (0.033198): The time index feature has a very weak positive correlation with power generation. This is a good sign, as we would not expect this to matter due to the balance in seasons.


- UpperWindDirection (-0.006571) and LowerWindDirection (0.006156): Both upper and lower wind directions show very weak correlations with power generation, indicating that wind direction might not play a significant role in solar energy generation. It might help the model to exclude this variable.


- SnowfallPlusStratiformSurfaceAt0 (-0.061711), SnowDepthAt0 (-0.074284), and SnowDensityAt0 (-0.112075): These snow-related features show weak negative correlations with power generation, suggesting that snow accumulation might have a minor adverse impact on solar energy production. TL;DR: snow-covered panels produce less power.


- season_of_year (-0.103832) and month_of_year (-0.108827): Both season and month of the year show weak negative correlations with power generation, which could be due to seasonal variations in solar radiation and weather conditions. I would have expected these factors to be more prominent.


- TotalCloudCoverAt0 (-0.139029): Total cloud cover has a weak negative correlation with power generation, indicating that increased cloud cover reduces solar energy production. I expected this to be a more significant factor.


- PotentialVorticityAt1000 (-0.350961): Potential vorticity at 1000 hPa has a moderate negative correlation with power generation, implying that specific atmospheric circulation patterns could negatively influence solar energy production. However, one would need more domain knowledge for a detailed explanation.


- RelativeHumidityAt0 (-0.727691): Relative humidity has a strong negative correlation with power generation, indicating that higher humidity levels might significantly reduce solar energy production. This is a little puzzling to me: high humidity by itself does not indicate rain, but it could be an indicator of fog, which would likely have a drastic impact on power generation. https://www.weather.gov/lmk/humidity#:~:text=If%20the%20relative%20humidity%20is,temperature%20the%20air%20is%20at.


- sunposition_thetaZ (-0.807046): The solar zenith angle has a strong negative correlation with power generation, suggesting that as the angle between the sun and the zenith increases, power generation decreases. This is likely due to less direct sunlight reaching the solar panels.


The other factors were not explained because it seems obvious that solar radiation would increase power generation.



In [41]:
# Heatmap of all pairwise correlations, drawn on a diverging colour scale
# pinned to the full [-1, 1] range
corr_matrix = df_reduced.corr()
axis_labels = corr_matrix.columns

fig = px.imshow(
    corr_matrix,
    x=axis_labels,
    y=axis_labels,
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)
fig.show()

## Baseline regression model

In [42]:
# Baseline: a plain linear regression on the reduced feature set.
# The hold-out set is the last 20% of rows in their stored order (no shuffling).
# NOTE(review): X still contains time_idx, so the test rows carry index values
# beyond anything seen in training - confirm this is intended.
y = df_reduced['power_normed']
X = df_reduced.drop(columns='power_normed')

# Row position separating the first 80% of the rows from the remainder
split_index = int(len(X) * 0.8)

X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]

# Fit on the training rows, then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the hold-out set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R-squared: {r2:.4f}")

Mean Squared Error: 0.0038
Root Mean Squared Error: 0.0615
R-squared: 0.8628
