In [1]:
import pickle

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


In [2]:
# Load the raw dataset from a CSV file; the relative path is resolved
# against the current working directory.
data = pd.read_csv('France_wind_data.csv')

In [3]:
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,uvindex,severerisk,sunrise,sunset,moonphase,conditions,description,icon,stations,AVERAGE of Wind_MWh
0,2022-08-10,91.3,53.0,73.4,87.6,53.0,72.1,47.9,46.5,0.0,...,9.0,,2022-08-10T06:38:04,2022-08-10T21:02:35,0.44,Clear,Clear conditions throughout the day.,clear-day,"07460099999,07379099999,LFLC,07374099999,LFLV,...",4992.125
1,2022-08-11,94.0,54.4,74.7,90.7,54.4,73.5,49.3,46.1,0.0,...,9.0,,2022-08-11T06:39:18,2022-08-11T21:01:02,0.48,Clear,Clear conditions throughout the day.,clear-day,"07460099999,07379099999,LFLC,07374099999,07477...",4184.375
2,2022-08-12,93.6,53.6,75.0,89.6,53.6,73.7,48.1,45.0,0.0,...,9.0,,2022-08-12T06:40:32,2022-08-12T20:59:27,0.5,Clear,Clear conditions throughout the day.,clear-day,"07460099999,07379099999,LFLC,07374099999,07477...",3159.208333
3,2022-08-13,92.9,53.7,74.6,88.8,53.7,73.2,46.1,42.5,0.0,...,9.0,,2022-08-13T06:41:47,2022-08-13T20:57:51,0.54,Clear,Clear conditions throughout the day.,clear-day,"07460099999,07379099999,LFLC,07374099999,07477...",2776.208333
4,2022-08-14,76.6,61.8,68.4,76.6,61.8,68.4,58.4,71.5,0.274,...,5.0,,2022-08-14T06:43:02,2022-08-14T20:56:14,0.58,"Rain, Partially cloudy",Partly cloudy throughout the day with rain.,rain,"07460099999,07379099999,LFLC,07374099999,07477...",3285.0


In [4]:
# Calendar features derived from the observation date.
data['datetime'] = pd.to_datetime(data['datetime'])
obs_date = data['datetime']
data['year'] = obs_date.dt.year
data['month'] = obs_date.dt.month
data['day'] = obs_date.dt.day
data['day_of_year'] = obs_date.dt.dayofyear

# Parse the sunrise/sunset strings once and reuse the parsed Series for every
# derived feature instead of re-parsing the same column repeatedly.
sunrise_ts = pd.to_datetime(data['sunrise'])
sunset_ts = pd.to_datetime(data['sunset'])
data['sunrise_hour'] = sunrise_ts.dt.hour
data['sunrise_minute'] = sunrise_ts.dt.minute
data['sunset_hour'] = sunset_ts.dt.hour
data['sunset_minute'] = sunset_ts.dt.minute

# Day length in fractional hours (sunset minus sunrise, seconds / 3600).
data['daylight_hours'] = (sunset_ts - sunrise_ts).dt.total_seconds() / 3600

In [5]:
# (rows, columns) after adding the date and sunrise/sunset features.
data.shape

(995, 42)

In [6]:
# One-hot encode the `conditions` text column; each distinct value becomes
# its own `cond_<value>` indicator column appended to the frame.
conditions_dummies = pd.get_dummies(data['conditions'], prefix='cond')
data = pd.concat((data, conditions_dummies), axis='columns')

In [7]:
# Columns excluded from the modelling frame.
drop_cols = [
    'datetime', 'sunrise', 'sunset',
    'conditions', 'description', 'icon', 'stations',
    'preciptype', 'severerisk', 'solarenergy',
]
data = data.drop(drop_cols, axis=1)

In [8]:
# (rows, columns) after one-hot encoding `conditions` and dropping the excluded columns.
data.shape

(995, 40)

In [9]:
# List the columns that remain after encoding and dropping.
data.columns

Index(['tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
       'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'uvindex', 'moonphase', 'AVERAGE of Wind_MWh', 'year', 'month', 'day',
       'day_of_year', 'sunrise_hour', 'sunrise_minute', 'sunset_hour',
       'sunset_minute', 'daylight_hours', 'cond_Clear', 'cond_Overcast',
       'cond_Partially cloudy', 'cond_Rain', 'cond_Rain, Overcast',
       'cond_Rain, Partially cloudy', 'cond_Snow, Rain, Overcast',
       'cond_Snow, Rain, Partially cloudy'],
      dtype='object')

In [10]:
# Fill each missing value with the mean of its column.
# NOTE(review): the means are computed over all rows, before the train/test
# split further down, so test rows influence the imputed values — consider
# imputing after the split using training statistics only.
column_means = data.mean()
data = data.fillna(column_means)

In [11]:
# Separate the prediction target from the feature columns.
target_col = 'AVERAGE of Wind_MWh'
y = data[target_col]
X = data.drop(columns=[target_col])

In [12]:
# Show both shapes side by side; the row counts should match.
(X.shape, y.shape)

((995, 39), (995,))

<h4>TRAIN TEST SPLIT</h4>

In [13]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same transformation can be reloaded later.
with open("wind_scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

<h4>GRID SEARCH CV</h4>

In [None]:
# Search space: three candidate values for each of six XGBoost settings,
# i.e. 3**6 = 729 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    # NOTE(review): unlike the other lists these values are not ascending —
    # confirm that 0.1 (rather than e.g. 0.01) is intended.
    'gamma': [0.1, 0.02, 0.03],
}

# Base estimator, seeded with random_state=42.
xgb = XGBRegressor(random_state=42)

# Exhaustive search over the grid, scored by negative MSE with 10-fold
# cross-validation; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the best-scoring combination.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Fitting 10 folds for each of 729 candidates, totalling 7290 fits


<h4>TRAINING AND TESTING MODEL</h4>

In [15]:
# Fit a fresh regressor on the scaled training set using the hyperparameters
# selected by the grid search.
best_xgb = XGBRegressor(random_state=42, **best_params)
best_xgb.fit(X_train_scaled, y_train)

# Predict the held-out test rows.
y_pred = best_xgb.predict(X_test_scaled)

# Compute the three regression metrics against the true test targets.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 4830289.36
Mean Absolute Error: 1715.23
R-squared: 0.60


In [16]:
# Persist the tuned model to disk (relative path) so it can be reloaded
# without retraining. joblib.dump returns the list of files it wrote, which
# is displayed as the cell output.
joblib.dump(best_xgb, 'wind_model2.pkl')

['wind_model2.pkl']

In [None]:
# Round-trip check: read the persisted model back from disk.
# NOTE(review): joblib.load unpickles the file, so only load model files
# from a trusted source.
loaded_model = joblib.load('wind_model2.pkl')

# Predict on the scaled test features and show the first five predictions.
new_predictions = loaded_model.predict(X_test_scaled)
first_five = new_predictions[:5]
print(first_five)

[4334.2686 9801.202  3197.6733 2011.5083 5207.754 ]


In [19]:
# First five true target values, for comparison with the predictions printed above.
print(y_test.head(5))

920    4647.708333
525    6504.416667
567    2964.000000
657    4233.833333
633    2489.416667
Name: AVERAGE of Wind_MWh, dtype: float64
