In [1]:
from sqlalchemy import create_engine, text
from processing_functions import prepare_time_features, get_pivoted_by_provider
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pickle

In [2]:
engine = create_engine("mysql+mysqlconnector://example/new_schema")

connection = engine.connect()
with open('condition-mappings/visualCrossingConditions.json', 'r') as file:
    visualCrossingConditions = json.load(file)
    

In [3]:
def not_in_list(toCheck, againstCheck):    
    not_in_l = [x for x in toCheck if x not in againstCheck]
    for i in not_in_l:
        print(i+'\n')
        
        
def not_in_list_comma(toCheck, againstCheck):
    not_in_l = []
    for x in toCheck:
        x = x.split(', ')
        not_in_l+=[s for s in x if s not in againstCheck]
    for i in not_in_l:
        print(i+'\n')
        
        
def print_regression_scores(model, X_t, y_t, ):
    y_pred = model.predict(X_t)

    mae = mean_absolute_error(y_t, y_pred)
    mse = mean_squared_error(y_t, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_t, y_pred)
    
    print(f'Mean Absolute Error (MAE): {mae}')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'Root Mean Squared Error (RMSE): {rmse}')
    print(f'R-squared (R²): {r2}')
    
    
def print_classification_score(model, X_test, y_test):
    y_pred_ = model.predict(X_test)

    accuracy_ = f1_score(y_test, y_pred_, average='weighted', zero_division=0.0)
    report = classification_report(y_test, y_pred_, zero_division=0.0)
    
    print(f'Accuracy: {accuracy_}')
    print('Classification Report:')
    print(report)
    
    
def class_score(y_pred_, y_t):
    accuracy_ = f1_score(y_t, y_pred_, average='weighted', zero_division=0.0)
    report = classification_report(y_t, y_pred_, zero_division=0.0)
    
    print(f'Accuracy: {accuracy_}')
    print('Classification Report:')
    print(report)
    

def write_model_to_file(file_name:str, model):
    data = {"model": model}
    with open(f'../static/{file_name}.pkl', 'wb') as saved:
        pickle.dump(data, saved)
        
def expand_conditions(df:pd.DataFrame, col_name:str):
    df_expanded = df[col_name].str.split(',', expand=True).stack().reset_index(level=1, drop=True)
    df_expanded = df_expanded.str.strip()
    df_expanded = df.drop(columns=[col_name]).join(df_expanded.rename(col_name))
    return df_expanded

In [4]:
forecast_data = pd.read_sql(text("select * from forecast"), connection)
forecast_data

Unnamed: 0,id,conditions,humidity,pressure,temperature,time,time_stamp,wind_direction,wind_speed,city_id,forecast_type_id,provider_id
0,487930,"Clear, Sunny",86.27,1010.36,20.67,2024-05-25 12:00:00,2024-05-24 12:56:05,349.18,4.52,222,2,2
1,487931,"Clear, Sunny",86.78,1010.37,20.64,2024-05-25 13:00:00,2024-05-24 12:56:05,349.61,4.41,222,2,2
2,487932,"Clear, Sunny",88.66,1010.20,20.57,2024-05-25 14:00:00,2024-05-24 12:56:05,3.54,3.78,222,2,2
3,487933,"Clear, Sunny",89.69,1010.00,20.39,2024-05-25 15:00:00,2024-05-24 12:56:05,20.33,3.28,222,2,2
4,487934,"Clear, Sunny",90.38,1009.72,20.15,2024-05-25 16:00:00,2024-05-24 12:56:05,36.57,2.86,222,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
104549,717879,Partially cloudy,33.30,1010.00,29.40,2024-05-31 20:00:00,2024-05-31 22:34:42,171.50,14.40,79,2,1
104550,717880,Partially cloudy,46.48,1011.00,27.70,2024-05-31 21:00:00,2024-05-31 22:34:42,175.10,14.80,79,2,1
104551,717881,Partially cloudy,52.65,1012.00,27.10,2024-05-31 22:00:00,2024-05-31 22:34:42,186.20,11.90,79,2,1
104552,717882,Partially cloudy,55.19,1012.00,26.30,2024-05-31 23:00:00,2024-05-31 22:34:42,200.10,7.60,79,2,1


In [5]:
historical_data = pd.read_sql(text("select * from historical_data"), connection)
historical_data

Unnamed: 0,id,conditions,humidity,pressure,temperature,time,time_stamp,wind_direction,wind_speed,city_id,forecast_type_id
0,1068353,Mainly clear,84.0,1009.9,27.40,2024-05-25 00:00:00,2024-06-04 02:57:08,201.0,11.2,146,2
1,1068354,Clear sky,84.0,1008.8,27.20,2024-05-25 01:00:00,2024-06-04 02:57:08,209.0,11.1,146,2
2,1068355,Clear sky,84.0,1008.4,27.10,2024-05-25 02:00:00,2024-06-04 02:57:08,207.0,11.3,146,2
3,1068356,Mainly clear,85.0,1007.5,26.90,2024-05-25 03:00:00,2024-06-04 02:57:08,213.0,11.2,146,2
4,1068357,Mainly clear,86.0,1007.4,26.80,2024-05-25 04:00:00,2024-06-04 02:57:08,213.0,11.2,146,2
...,...,...,...,...,...,...,...,...,...,...,...
39195,1107548,Rain:Slight,,,23.20,2024-05-27 00:00:00,2024-06-04 02:57:08,88.0,12.6,97,1
39196,1107549,Drizzle:moderate,,,19.05,2024-05-28 00:00:00,2024-06-04 02:57:08,84.0,22.4,97,1
39197,1107550,Rain:moderate,,,19.40,2024-05-29 00:00:00,2024-06-04 02:57:08,32.0,19.7,97,1
39198,1107551,Drizzle:Light,,,18.15,2024-05-30 00:00:00,2024-06-04 02:57:08,326.0,22.6,97,1


## Preprocessing 

In [6]:
prepared_historical_data = historical_data.copy()
prepared_historical_data['conditions'] = historical_data['conditions'].fillna('Unknown')
prepared_historical_data

Unnamed: 0,id,conditions,humidity,pressure,temperature,time,time_stamp,wind_direction,wind_speed,city_id,forecast_type_id
0,1068353,Mainly clear,84.0,1009.9,27.40,2024-05-25 00:00:00,2024-06-04 02:57:08,201.0,11.2,146,2
1,1068354,Clear sky,84.0,1008.8,27.20,2024-05-25 01:00:00,2024-06-04 02:57:08,209.0,11.1,146,2
2,1068355,Clear sky,84.0,1008.4,27.10,2024-05-25 02:00:00,2024-06-04 02:57:08,207.0,11.3,146,2
3,1068356,Mainly clear,85.0,1007.5,26.90,2024-05-25 03:00:00,2024-06-04 02:57:08,213.0,11.2,146,2
4,1068357,Mainly clear,86.0,1007.4,26.80,2024-05-25 04:00:00,2024-06-04 02:57:08,213.0,11.2,146,2
...,...,...,...,...,...,...,...,...,...,...,...
39195,1107548,Rain:Slight,,,23.20,2024-05-27 00:00:00,2024-06-04 02:57:08,88.0,12.6,97,1
39196,1107549,Drizzle:moderate,,,19.05,2024-05-28 00:00:00,2024-06-04 02:57:08,84.0,22.4,97,1
39197,1107550,Rain:moderate,,,19.40,2024-05-29 00:00:00,2024-06-04 02:57:08,32.0,19.7,97,1
39198,1107551,Drizzle:Light,,,18.15,2024-05-30 00:00:00,2024-06-04 02:57:08,326.0,22.6,97,1


In [7]:
prepared_forecast_data = forecast_data.copy()
prepared_forecast_data['conditions'] = prepared_forecast_data['conditions'].fillna('Unknown')
prepared_forecast_data

Unnamed: 0,id,conditions,humidity,pressure,temperature,time,time_stamp,wind_direction,wind_speed,city_id,forecast_type_id,provider_id
0,487930,"Clear, Sunny",86.27,1010.36,20.67,2024-05-25 12:00:00,2024-05-24 12:56:05,349.18,4.52,222,2,2
1,487931,"Clear, Sunny",86.78,1010.37,20.64,2024-05-25 13:00:00,2024-05-24 12:56:05,349.61,4.41,222,2,2
2,487932,"Clear, Sunny",88.66,1010.20,20.57,2024-05-25 14:00:00,2024-05-24 12:56:05,3.54,3.78,222,2,2
3,487933,"Clear, Sunny",89.69,1010.00,20.39,2024-05-25 15:00:00,2024-05-24 12:56:05,20.33,3.28,222,2,2
4,487934,"Clear, Sunny",90.38,1009.72,20.15,2024-05-25 16:00:00,2024-05-24 12:56:05,36.57,2.86,222,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
104549,717879,Partially cloudy,33.30,1010.00,29.40,2024-05-31 20:00:00,2024-05-31 22:34:42,171.50,14.40,79,2,1
104550,717880,Partially cloudy,46.48,1011.00,27.70,2024-05-31 21:00:00,2024-05-31 22:34:42,175.10,14.80,79,2,1
104551,717881,Partially cloudy,52.65,1012.00,27.10,2024-05-31 22:00:00,2024-05-31 22:34:42,186.20,11.90,79,2,1
104552,717882,Partially cloudy,55.19,1012.00,26.30,2024-05-31 23:00:00,2024-05-31 22:34:42,200.10,7.60,79,2,1


#### Open weather

In [8]:
with open('condition-mappings/openMeteoMappings.json', 'r') as file:
    openMeteoMappings = json.load(file)

cond = list(prepared_forecast_data[prepared_forecast_data['provider_id'] == 3]['conditions'].unique())

not_in_cond = [x for x in cond if x not in openMeteoMappings.keys()]
for item in not_in_cond:
    print(item)

In [9]:
prepared_forecast_data.loc[prepared_forecast_data['provider_id'] == 3, 'conditions'] = (
    prepared_forecast_data.loc[prepared_forecast_data['provider_id'] == 3, 'conditions'].map(openMeteoMappings))
not_in_list(prepared_forecast_data[prepared_forecast_data['provider_id'] == 3]['conditions'].unique().tolist(), visualCrossingConditions)

In [10]:
prepared_historical_data['conditions'] = (
    prepared_historical_data['conditions'].map(openMeteoMappings))
not_in_list(prepared_historical_data['conditions'].unique().tolist(), visualCrossingConditions)

#### Tomorrow IO

In [11]:
with open('condition-mappings/tomorrowIOMappings.json', 'r') as file:
    tomorrowIOMappings = json.load(file)
    
def mapTomorrowIO(row, TIOproviderId=2):
    if row['provider_id'] == TIOproviderId:
        conditions = row['conditions'].split(' and ')
        replaced_conditions = [tomorrowIOMappings.get(condition) for condition in conditions]
        return ', '.join(replaced_conditions)
    else:
        return row['conditions']

In [12]:
prepared_forecast_data['conditions'] = prepared_forecast_data.apply(mapTomorrowIO, axis=1)
prepared_forecast_data[prepared_forecast_data['provider_id']==2].conditions.unique()

array(['Clear', 'Drizzle', 'Rain', 'Overcast', 'Partially cloudy',
       'Light Rain', 'Fog', 'Heavy Rain'], dtype=object)

In [13]:
not_in_list_comma(prepared_forecast_data[prepared_forecast_data['provider_id'] == 2]['conditions'].unique().tolist(), visualCrossingConditions)

In [14]:
not_in_list_comma(prepared_forecast_data['conditions'].unique(), visualCrossingConditions)

#### Prepare history numerical

In [15]:
prepared_historical_data.isnull().any()

id                  False
conditions          False
humidity             True
pressure             True
temperature         False
time                False
time_stamp          False
wind_direction      False
wind_speed          False
city_id             False
forecast_type_id    False
dtype: bool

In [16]:
prepared_historical_data['humidity'] = (prepared_historical_data.groupby([prepared_historical_data['time'].dt.date, 'city_id'])['humidity']
                                        .transform(lambda x: x.fillna(x.mean())))
prepared_historical_data['humidity'].isnull().any()

False

In [17]:
prepared_historical_data['pressure'] = (prepared_historical_data.groupby([prepared_historical_data['time'].dt.date, 'city_id'])['pressure']
                                        .transform(lambda x: x.fillna(x.mean())))
prepared_historical_data['pressure'].isnull().any()

False

#### Prepare forecast numerical

In [18]:
prepared_forecast_data.isnull().any()

id                  False
conditions          False
humidity             True
pressure             True
temperature         False
time                False
time_stamp          False
wind_direction      False
wind_speed          False
city_id             False
forecast_type_id    False
provider_id         False
dtype: bool

In [19]:
prepared_forecast_data['humidity'] = (prepared_forecast_data.groupby([prepared_forecast_data['time'].dt.date, 'city_id'])['humidity']
                                        .transform(lambda x: x.fillna(x.mean())))
prepared_forecast_data['humidity'].isnull().any()

False

In [20]:
prepared_forecast_data['pressure'] = (prepared_forecast_data.groupby([prepared_forecast_data['time'].dt.date, 'city_id'])['pressure']
                                        .transform(lambda x: x.fillna(x.mean())))
prepared_forecast_data['pressure'].isnull().any()

False

In [21]:
prepared_forecast_data = prepared_forecast_data.dropna()
prepared_forecast_data.isnull().any()

id                  False
conditions          False
humidity            False
pressure            False
temperature         False
time                False
time_stamp          False
wind_direction      False
wind_speed          False
city_id             False
forecast_type_id    False
provider_id         False
dtype: bool

## Build models

### Temperature

In [23]:
temperature_forecasts = get_pivoted_by_provider(prepared_forecast_data, prepared_historical_data, 'temperature')
temperature_forecasts

Unnamed: 0,city_id,forecast_type_id,time,temperature1,temperature2,temperature3,temperature,month,day,hour
0,1,1,2024-05-25 00:00:00,20.8,23.77,20.40,20.30,5,25,0
1,1,1,2024-05-26 00:00:00,21.2,24.02,22.30,21.70,5,26,0
2,1,1,2024-05-27 00:00:00,21.8,24.73,22.15,22.65,5,27,0
3,1,1,2024-05-28 00:00:00,22.4,24.32,21.55,21.80,5,28,0
4,1,1,2024-05-29 00:00:00,21.6,24.76,22.00,20.60,5,29,0
...,...,...,...,...,...,...,...,...,...,...
26649,259,2,2024-05-31 19:00:00,20.2,18.01,19.30,17.90,5,31,19
26650,259,2,2024-05-31 20:00:00,19.4,17.42,18.70,17.40,5,31,20
26651,259,2,2024-05-31 21:00:00,18.3,16.97,18.30,17.30,5,31,21
26652,259,2,2024-05-31 22:00:00,17.4,16.81,17.80,17.00,5,31,22


In [24]:
prepare_time_features(temperature_forecasts)
temperature_forecasts

Unnamed: 0,city_id,forecast_type_id,time,temperature1,temperature2,temperature3,temperature,month,day,hour
0,1,1,2024-03-03 00:00:00,2.2,31.77,6.85,5.85,3,3,0
1,1,1,2024-04-14 00:00:00,18.3,31.55,17.15,16.50,4,14,0
2,1,1,2024-04-18 00:00:00,29.0,30.94,16.55,16.70,4,18,0
3,1,1,2024-04-19 00:00:00,7.2,30.59,16.40,15.55,4,19,0
4,1,1,2024-04-19 00:00:00,7.2,30.59,16.40,15.55,4,19,0
...,...,...,...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,20.3,16.93,18.20,17.60,5,22,21
47943,259,2,2024-05-22 22:00:00,19.1,16.21,17.70,16.90,5,22,22
47944,259,2,2024-05-22 23:00:00,18.2,15.58,17.20,16.20,5,22,23
47945,259,2,2024-05-24 00:00:00,18.5,15.58,17.20,15.80,5,24,0


In [25]:
temp_features = [col for col in temperature_forecasts.columns if (col != 'temperature' and col !='time')]
X_temp = temperature_forecasts[temp_features]
y_temp = temperature_forecasts['temperature']
X_temp

Unnamed: 0,city_id,forecast_type_id,temperature1,temperature2,temperature3,month,day,hour
0,1,1,2.2,31.77,6.85,3,3,0
1,1,1,18.3,31.55,17.15,4,14,0
2,1,1,29.0,30.94,16.55,4,18,0
3,1,1,7.2,30.59,16.40,4,19,0
4,1,1,7.2,30.59,16.40,4,19,0
...,...,...,...,...,...,...,...,...
47942,259,2,20.3,16.93,18.20,5,22,21
47943,259,2,19.1,16.21,17.70,5,22,22
47944,259,2,18.2,15.58,17.20,5,22,23
47945,259,2,18.5,15.58,17.20,5,24,0


In [26]:
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X_temp, y_temp, test_size=0.30)

In [28]:
GBR_temp = GradientBoostingRegressor()

parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR_temp = GridSearchCV(estimator=GBR_temp, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR_temp.fit(X_train_temp, y_train_temp)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR_temp.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR_temp.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR_temp.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.04, max_depth=8, n_estimators=1500,
                          subsample=0.9)

 The best score across ALL searched params:
 0.9781723521659625

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 8, 'n_estimators': 1500, 'subsample': 0.9}
Mean Absolute Error (MAE): 0.5856721919067772
Mean Squared Error (MSE): 0.726793821035852
Root Mean Squared Error (RMSE): 0.8525220355133655
R-squared (R²): 0.9813344723275295


In [29]:
write_model_to_file('temperature_regressor', grid_GBR_temp.best_estimator_)

In [27]:
RF_temp = RandomForestRegressor(random_state=0)

parameters = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [1,2,3,4],
}

grid_RF_temp = GridSearchCV(estimator=RF_temp, param_grid = parameters, cv = 2, n_jobs=-1)
grid_RF_temp.fit(X_train_temp, y_train_temp)

print(" Results from Random Forest " )
print("\n The best estimator across ALL searched params:\n",grid_RF_temp.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_RF_temp.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_RF_temp.best_params_)
print_regression_scores(grid_RF_temp.best_estimator_, X_test_temp, y_test_temp)

 Results from Random Forest 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=4, n_estimators=250, random_state=0)

 The best score across ALL searched params:
 0.9422569620587663

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 250}
Mean Absolute Error (MAE): 1.0854642110970563
Mean Squared Error (MSE): 2.499448044117192
Root Mean Squared Error (RMSE): 1.5809642766733194
R-squared (R²): 0.935809145203134


### Conditions

In [23]:
conditions1 = prepared_forecast_data[prepared_forecast_data['provider_id']==1][['city_id', 'forecast_type_id', 'time', 'conditions']]
conditions1.rename(columns={'conditions':'conditions1'}, inplace=True)
conditions1['conditions1'].unique()

array(['Rain, Partially cloudy', 'Clear', 'Partially cloudy',
       'Rain, Overcast', 'Rain', 'Overcast'], dtype=object)

In [24]:
conditions2 = prepared_forecast_data[prepared_forecast_data['provider_id']==2][['city_id', 'forecast_type_id', 'time', 'conditions']]
conditions2.rename(columns={'conditions':'conditions2'}, inplace=True)

conditions3 = prepared_forecast_data[prepared_forecast_data['provider_id']==3][['city_id', 'forecast_type_id', 'time', 'conditions']]
conditions3.rename(columns={'conditions':'conditions3'}, inplace=True)

cond_final = pd.merge(conditions1, conditions2[['city_id', 'forecast_type_id', 'time', 'conditions2']],
                       on=['city_id', 'forecast_type_id', 'time'])
cond_final = pd.merge(cond_final, conditions3[['city_id', 'forecast_type_id', 'time', 'conditions3']],
                       on=['city_id', 'forecast_type_id', 'time'])
cond_final = pd.merge(cond_final, prepared_historical_data[['city_id', 'forecast_type_id', 'time', 'conditions']],
                       on=['city_id', 'forecast_type_id', 'time'])
cond_final

Unnamed: 0,city_id,forecast_type_id,time,conditions1,conditions2,conditions3,conditions
0,146,1,2024-05-25 00:00:00,"Rain, Partially cloudy",Overcast,Unknown,Rain
1,146,2,2024-05-25 12:00:00,"Rain, Partially cloudy",Drizzle,Unknown,Light Drizzle/Rain
2,146,2,2024-05-25 13:00:00,"Rain, Partially cloudy",Rain,Unknown,Light Drizzle/Rain
3,146,2,2024-05-25 14:00:00,"Rain, Partially cloudy",Overcast,Overcast,Rain
4,146,2,2024-05-25 15:00:00,"Rain, Partially cloudy",Overcast,Light Rain And Snow,Heavy Drizzle/Rain
...,...,...,...,...,...,...,...
26649,79,2,2024-05-31 20:00:00,Partially cloudy,Overcast,Overcast,Overcast
26650,79,2,2024-05-31 21:00:00,Partially cloudy,Overcast,Overcast,Overcast
26651,79,2,2024-05-31 22:00:00,Partially cloudy,Overcast,Overcast,Overcast
26652,79,2,2024-05-31 23:00:00,Partially cloudy,Clear,Overcast,Partially cloudy


In [34]:
cond_final_exp = expand_conditions(cond_final, 'conditions1')
cond_final_exp = expand_conditions(cond_final_exp, 'conditions2')
cond_final_exp

Unnamed: 0,city_id,forecast_type_id,time,conditions3,conditions,conditions1,conditions2
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Rain,Overcast
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Rain,Overcast
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Partially cloudy,Overcast
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Partially cloudy,Overcast
1,146,2,2024-05-25 12:00:00,Unknown,Light Drizzle/Rain,Rain,Drizzle
...,...,...,...,...,...,...,...
26649,79,2,2024-05-31 20:00:00,Overcast,Overcast,Partially cloudy,Overcast
26650,79,2,2024-05-31 21:00:00,Overcast,Overcast,Partially cloudy,Overcast
26651,79,2,2024-05-31 22:00:00,Overcast,Overcast,Partially cloudy,Overcast
26652,79,2,2024-05-31 23:00:00,Overcast,Partially cloudy,Partially cloudy,Clear


In [35]:
with open('condition-mappings/visualCrossingEncodings.json', 'r') as file:
    visualCrossingEncodings = json.load(file)

cond_final_exp['c1_enc'] = cond_final_exp['conditions1'].map(visualCrossingEncodings)
cond_final_exp['c2_enc'] = cond_final_exp['conditions2'].map(visualCrossingEncodings)
cond_final_exp['c3_enc'] = cond_final_exp['conditions3'].map(visualCrossingEncodings)
cond_final_exp['c_h_enc'] = cond_final_exp['conditions'].map(visualCrossingEncodings)
cond_final_exp

Unnamed: 0,city_id,forecast_type_id,time,conditions3,conditions,conditions1,conditions2,c1_enc,c2_enc,c3_enc,c_h_enc
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Rain,Overcast,90,160,999,90
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Rain,Overcast,90,160,999,90
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Partially cloudy,Overcast,161,160,999,90
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Partially cloudy,Overcast,161,160,999,90
1,146,2,2024-05-25 12:00:00,Unknown,Light Drizzle/Rain,Rain,Drizzle,90,80,999,84
...,...,...,...,...,...,...,...,...,...,...,...
26649,79,2,2024-05-31 20:00:00,Overcast,Overcast,Partially cloudy,Overcast,161,160,160,160
26650,79,2,2024-05-31 21:00:00,Overcast,Overcast,Partially cloudy,Overcast,161,160,160,160
26651,79,2,2024-05-31 22:00:00,Overcast,Overcast,Partially cloudy,Overcast,161,160,160,160
26652,79,2,2024-05-31 23:00:00,Overcast,Partially cloudy,Partially cloudy,Clear,161,162,160,161


In [36]:
cond_final_exp['month'] = cond_final_exp['time'].dt.month
cond_final_exp['day'] = cond_final_exp['time'].dt.day
cond_final_exp['hour'] = cond_final_exp['time'].dt.hour
cond_final_exp

Unnamed: 0,city_id,forecast_type_id,time,conditions3,conditions,conditions1,conditions2,c1_enc,c2_enc,c3_enc,c_h_enc,month,day,hour
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Rain,Overcast,90,160,999,90,5,25,0
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Rain,Overcast,90,160,999,90,5,25,0
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Partially cloudy,Overcast,161,160,999,90,5,25,0
0,146,1,2024-05-25 00:00:00,Unknown,Rain,Partially cloudy,Overcast,161,160,999,90,5,25,0
1,146,2,2024-05-25 12:00:00,Unknown,Light Drizzle/Rain,Rain,Drizzle,90,80,999,84,5,25,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26649,79,2,2024-05-31 20:00:00,Overcast,Overcast,Partially cloudy,Overcast,161,160,160,160,5,31,20
26650,79,2,2024-05-31 21:00:00,Overcast,Overcast,Partially cloudy,Overcast,161,160,160,160,5,31,21
26651,79,2,2024-05-31 22:00:00,Overcast,Overcast,Partially cloudy,Overcast,161,160,160,160,5,31,22
26652,79,2,2024-05-31 23:00:00,Overcast,Partially cloudy,Partially cloudy,Clear,161,162,160,161,5,31,23


In [38]:
X_f = cond_final_exp[['city_id', 'forecast_type_id', 'c1_enc', 'c2_enc', 'c3_enc', 'month', 'day', 'hour']]
y_f = cond_final_exp['c_h_enc']
X_train_cond_f, X_test_cond_f, y_train_cond_f, y_test_cond_f = train_test_split(X_f, y_f, test_size=0.30)

In [62]:
y_f

0        161
0        161
0        161
0        161
1         84
        ... 
47942    162
47943    162
47944    161
47945    162
47946    162
Name: c_h_enc, Length: 110307, dtype: int64

In [63]:
GBC_cond_f = GradientBoostingClassifier(random_state=42)

param_grid_f = {
    'n_estimators': [150, 200],
    'learning_rate': [0.2, 0.3, 0.5],
    'max_depth': [5, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_GBC_cond_f = GridSearchCV(estimator=GBC_cond_f, param_grid = param_grid_f, cv = 2, n_jobs=-1, verbose=2, scoring='accuracy')
grid_GBC_cond_f.fit(X_train_cond_f, y_train_cond_f)

print(" Results from Gradient Boosting " )
print("\n The best estimator across ALL searched params:\n",grid_GBC_cond_f.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBC_cond_f.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBC_cond_f.best_params_)

Fitting 2 folds for each of 108 candidates, totalling 216 fits
 Results from Gradient Boosting 

 The best estimator across ALL searched params:
 GradientBoostingClassifier(learning_rate=0.2, max_depth=8, min_samples_split=5,
                           n_estimators=200, random_state=42)

 The best score across ALL searched params:
 0.7994275649493615

 The best parameters across ALL searched params:
 {'learning_rate': 0.2, 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [66]:
print_classification_score(grid_GBC_cond_f.best_estimator_, X_test_cond_f, y_test_cond_f)

Accuracy: 0.7770099814896663
Classification Report:
              precision    recall  f1-score   support

          81       0.58      0.75      0.65       414
          83       0.62      0.59      0.61      1103
          84       0.74      0.67      0.70      4151
          90       0.62      0.59      0.60       987
          94       0.47      0.58      0.52       185
          95       0.73      0.59      0.65       841
         120       0.00      0.00      0.00         5
         124       0.00      0.00      0.00         5
         160       0.76      0.63      0.69      3512
         161       0.76      0.84      0.80     11392
         162       0.87      0.87      0.87     10498

    accuracy                           0.78     33093
   macro avg       0.56      0.55      0.55     33093
weighted avg       0.78      0.78      0.78     33093



In [69]:
write_model_to_file('conditions_classifier2', grid_GBC_cond_f.best_estimator_)

### Humidity

In [22]:
humidity_forecasts = get_pivoted_by_provider(prepared_forecast_data, prepared_historical_data, 'humidity')
humidity_forecasts

Unnamed: 0,city_id,forecast_type_id,time,humidity1,humidity2,humidity3,humidity
0,1,1,2024-03-03 00:00:00,74.80,92.18,75.135400,61.416667
1,1,1,2024-04-14 00:00:00,60.00,95.44,71.023585,80.458333
2,1,1,2024-04-18 00:00:00,76.90,96.68,77.895400,70.000000
3,1,1,2024-04-19 00:00:00,75.50,96.16,77.778136,52.791667
4,1,1,2024-04-19 00:00:00,75.50,96.16,77.778136,52.791667
...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,60.46,63.77,69.000000,68.000000
47943,259,2,2024-05-22 22:00:00,65.99,66.47,71.000000,72.000000
47944,259,2,2024-05-22 23:00:00,69.81,68.31,74.000000,75.000000
47945,259,2,2024-05-24 00:00:00,66.30,82.49,69.000000,75.000000


In [23]:
prepare_time_features(humidity_forecasts)
humidity_forecasts

Unnamed: 0,city_id,forecast_type_id,time,humidity1,humidity2,humidity3,humidity,month,day,hour
0,1,1,2024-03-03 00:00:00,74.80,92.18,75.135400,61.416667,3,3,0
1,1,1,2024-04-14 00:00:00,60.00,95.44,71.023585,80.458333,4,14,0
2,1,1,2024-04-18 00:00:00,76.90,96.68,77.895400,70.000000,4,18,0
3,1,1,2024-04-19 00:00:00,75.50,96.16,77.778136,52.791667,4,19,0
4,1,1,2024-04-19 00:00:00,75.50,96.16,77.778136,52.791667,4,19,0
...,...,...,...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,60.46,63.77,69.000000,68.000000,5,22,21
47943,259,2,2024-05-22 22:00:00,65.99,66.47,71.000000,72.000000,5,22,22
47944,259,2,2024-05-22 23:00:00,69.81,68.31,74.000000,75.000000,5,22,23
47945,259,2,2024-05-24 00:00:00,66.30,82.49,69.000000,75.000000,5,24,0


In [24]:
hum_features = [col for col in humidity_forecasts.columns if (col != 'humidity' and col !='time')]
X_hum = humidity_forecasts[hum_features]
y_hum = humidity_forecasts['humidity']
X_hum

Unnamed: 0,city_id,forecast_type_id,humidity1,humidity2,humidity3,month,day,hour
0,1,1,74.80,92.18,75.135400,3,3,0
1,1,1,60.00,95.44,71.023585,4,14,0
2,1,1,76.90,96.68,77.895400,4,18,0
3,1,1,75.50,96.16,77.778136,4,19,0
4,1,1,75.50,96.16,77.778136,4,19,0
...,...,...,...,...,...,...,...,...
47942,259,2,60.46,63.77,69.000000,5,22,21
47943,259,2,65.99,66.47,71.000000,5,22,22
47944,259,2,69.81,68.31,74.000000,5,22,23
47945,259,2,66.30,82.49,69.000000,5,24,0


In [25]:
X_train_hum, X_test_hum, y_train_hum, y_test_hum = train_test_split(X_hum, y_hum, test_size=0.30)

In [30]:
RF_hum = RandomForestRegressor(random_state=0)

parameters_hum = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [1,2,3,4],
}

grid_RF_hum = GridSearchCV(estimator=RF_hum, param_grid = parameters_hum, cv = 2, n_jobs=-1)
grid_RF_hum.fit(X_train_hum, y_train_hum)

print(" Results from Random Forest " )
print("\n The best estimator across ALL searched params:\n",grid_RF_hum.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_RF_hum.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_RF_hum.best_params_)
print_regression_scores(grid_RF_hum.best_estimator_, X_test_hum, y_test_hum)

 Results from Random Forest 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=4, n_estimators=150, random_state=0)

 The best score across ALL searched params:
 0.8902737035891893

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 150}
Mean Absolute Error (MAE): 5.561371469996174
Mean Squared Error (MSE): 56.882803923734905
Root Mean Squared Error (RMSE): 7.542068941857725
R-squared (R²): 0.8937036177255979


In [31]:
GBR_hum = GradientBoostingRegressor()

parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR_hum = GridSearchCV(estimator=GBR_hum, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR_hum.fit(X_train_hum, y_train_hum)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR_hum.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR_hum.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR_hum.best_params_)
print_regression_scores(grid_GBR_hum.best_estimator_, X_test_hum, y_test_hum)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.04, max_depth=8, n_estimators=1500,
                          subsample=0.9)

 The best score across ALL searched params:
 0.9410166173640073

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 8, 'n_estimators': 1500, 'subsample': 0.9}
Mean Absolute Error (MAE): 3.5470560341981696
Mean Squared Error (MSE): 24.497180575725913
Root Mean Squared Error (RMSE): 4.949462655251165
R-squared (R²): 0.9542223397669767


In [32]:
write_model_to_file('humidity_regressor', grid_GBR_hum.best_estimator_)

### Pressure

In [33]:
pressure_forecasts = get_pivoted_by_provider(prepared_forecast_data, prepared_historical_data, 'pressure')
pressure_forecasts

Unnamed: 0,city_id,forecast_type_id,time,pressure1,pressure2,pressure3,pressure
0,1,1,2024-03-03 00:00:00,1019.0,1011.44,1009.774800,1012.270833
1,1,1,2024-04-14 00:00:00,1022.0,1012.60,1020.336415,1020.120833
2,1,1,2024-04-18 00:00:00,1008.1,1010.17,1007.297400,1001.854167
3,1,1,2024-04-19 00:00:00,1004.7,1011.05,998.489492,1006.070833
4,1,1,2024-04-19 00:00:00,1004.7,1011.05,998.489492,1006.070833
...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,1016.0,1000.01,998.400000,998.300000
47943,259,2,2024-05-22 22:00:00,1017.0,1000.26,998.500000,998.500000
47944,259,2,2024-05-22 23:00:00,1017.0,1000.41,998.400000,998.500000
47945,259,2,2024-05-24 00:00:00,1021.0,1004.84,1003.400000,1002.900000


In [34]:
prepare_time_features(pressure_forecasts)
pressure_forecasts

Unnamed: 0,city_id,forecast_type_id,time,pressure1,pressure2,pressure3,pressure,month,day,hour
0,1,1,2024-03-03 00:00:00,1019.0,1011.44,1009.774800,1012.270833,3,3,0
1,1,1,2024-04-14 00:00:00,1022.0,1012.60,1020.336415,1020.120833,4,14,0
2,1,1,2024-04-18 00:00:00,1008.1,1010.17,1007.297400,1001.854167,4,18,0
3,1,1,2024-04-19 00:00:00,1004.7,1011.05,998.489492,1006.070833,4,19,0
4,1,1,2024-04-19 00:00:00,1004.7,1011.05,998.489492,1006.070833,4,19,0
...,...,...,...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,1016.0,1000.01,998.400000,998.300000,5,22,21
47943,259,2,2024-05-22 22:00:00,1017.0,1000.26,998.500000,998.500000,5,22,22
47944,259,2,2024-05-22 23:00:00,1017.0,1000.41,998.400000,998.500000,5,22,23
47945,259,2,2024-05-24 00:00:00,1021.0,1004.84,1003.400000,1002.900000,5,24,0


In [36]:
press_features = [col for col in pressure_forecasts.columns if (col != 'pressure' and col !='time')]
X_pres = pressure_forecasts[press_features]
y_pres = pressure_forecasts['pressure']
X_pres

Unnamed: 0,city_id,forecast_type_id,pressure1,pressure2,pressure3,month,day,hour
0,1,1,1019.0,1011.44,1009.774800,3,3,0
1,1,1,1022.0,1012.60,1020.336415,4,14,0
2,1,1,1008.1,1010.17,1007.297400,4,18,0
3,1,1,1004.7,1011.05,998.489492,4,19,0
4,1,1,1004.7,1011.05,998.489492,4,19,0
...,...,...,...,...,...,...,...,...
47942,259,2,1016.0,1000.01,998.400000,5,22,21
47943,259,2,1017.0,1000.26,998.500000,5,22,22
47944,259,2,1017.0,1000.41,998.400000,5,22,23
47945,259,2,1021.0,1004.84,1003.400000,5,24,0


In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_pres, y_pres, test_size=0.30)

In [39]:
RF = RandomForestRegressor(random_state=0)

parameters = {
    'n_estimators': [150, 200, 250],
    'max_depth': [1,2,3,4],
}

grid_RF = GridSearchCV(estimator=RF, param_grid = parameters, cv = 2, n_jobs=-1)
grid_RF.fit(X_train, y_train)

print(" Results from Random Forest " )
print("\n The best estimator across ALL searched params:\n",grid_RF.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_RF.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_RF.best_params_)
print_regression_scores(grid_RF.best_estimator_, X_test, y_test)

 Results from Random Forest 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=4, n_estimators=250, random_state=0)

 The best score across ALL searched params:
 0.9940183244016493

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 250}
Mean Absolute Error (MAE): 2.398058475700446
Mean Squared Error (MSE): 13.268151445818434
Root Mean Squared Error (RMSE): 3.642547384155549
R-squared (R²): 0.9940676241353579


In [40]:
GBR = GradientBoostingRegressor()

parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print_regression_scores(grid_GBR.best_estimator_, X_test, y_test)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.04, max_depth=6, n_estimators=1500,
                          subsample=0.5)

 The best score across ALL searched params:
 0.9996911476645115

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 6, 'n_estimators': 1500, 'subsample': 0.5}
Mean Absolute Error (MAE): 0.41236504362327103
Mean Squared Error (MSE): 0.554037488370148
Root Mean Squared Error (RMSE): 0.7443369454555833
R-squared (R²): 0.9997522820991653


In [41]:
write_model_to_file('pressure_regressor', grid_GBR.best_estimator_)


### Wind speed

In [24]:
win_speed_forecasts = get_pivoted_by_provider(prepared_forecast_data, prepared_historical_data, 'wind_speed')
win_speed_forecasts

Unnamed: 0,city_id,forecast_type_id,time,wind_speed1,wind_speed2,wind_speed3,wind_speed
0,1,1,2024-03-03 00:00:00,13.3,4.94,8.7,24.1
1,1,1,2024-04-14 00:00:00,20.5,3.16,4.9,19.6
2,1,1,2024-04-18 00:00:00,25.0,3.04,14.1,43.9
3,1,1,2024-04-19 00:00:00,19.4,3.29,15.7,49.6
4,1,1,2024-04-19 00:00:00,19.4,3.29,15.7,49.6
...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,6.5,2.45,4.7,11.9
47943,259,2,2024-05-22 22:00:00,5.8,2.31,4.3,11.0
47944,259,2,2024-05-22 23:00:00,5.0,2.26,3.3,9.6
47945,259,2,2024-05-24 00:00:00,5.4,2.68,5.4,11.9


In [25]:
prepare_time_features(win_speed_forecasts)
win_speed_forecasts

Unnamed: 0,city_id,forecast_type_id,time,wind_speed1,wind_speed2,wind_speed3,wind_speed,month,day,hour
0,1,1,2024-03-03 00:00:00,13.3,4.94,8.7,24.1,3,3,0
1,1,1,2024-04-14 00:00:00,20.5,3.16,4.9,19.6,4,14,0
2,1,1,2024-04-18 00:00:00,25.0,3.04,14.1,43.9,4,18,0
3,1,1,2024-04-19 00:00:00,19.4,3.29,15.7,49.6,4,19,0
4,1,1,2024-04-19 00:00:00,19.4,3.29,15.7,49.6,4,19,0
...,...,...,...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,6.5,2.45,4.7,11.9,5,22,21
47943,259,2,2024-05-22 22:00:00,5.8,2.31,4.3,11.0,5,22,22
47944,259,2,2024-05-22 23:00:00,5.0,2.26,3.3,9.6,5,22,23
47945,259,2,2024-05-24 00:00:00,5.4,2.68,5.4,11.9,5,24,0


In [26]:
win_speed_features = [col for col in win_speed_forecasts.columns if (col != 'wind_speed' and col !='time')]
X_win_speed = win_speed_forecasts[win_speed_features]
y_win_speed = win_speed_forecasts['wind_speed']
X_win_speed

Unnamed: 0,city_id,forecast_type_id,wind_speed1,wind_speed2,wind_speed3,month,day,hour
0,1,1,13.3,4.94,8.7,3,3,0
1,1,1,20.5,3.16,4.9,4,14,0
2,1,1,25.0,3.04,14.1,4,18,0
3,1,1,19.4,3.29,15.7,4,19,0
4,1,1,19.4,3.29,15.7,4,19,0
...,...,...,...,...,...,...,...,...
47942,259,2,6.5,2.45,4.7,5,22,21
47943,259,2,5.8,2.31,4.3,5,22,22
47944,259,2,5.0,2.26,3.3,5,22,23
47945,259,2,5.4,2.68,5.4,5,24,0


In [27]:
X_train_w_speed, X_test_w_speed, y_train_w_speed, y_test_w_speed = train_test_split(X_win_speed, y_win_speed, test_size=0.30)

In [28]:
RF_win_speed = RandomForestRegressor(random_state=0)

parameters = {
    'n_estimators': [150, 200, 250],
    'max_depth': [1,2,3,4],
}

grid_RF_w_speed = GridSearchCV(estimator=RF_win_speed, param_grid = parameters, cv = 2, n_jobs=-1)
grid_RF_w_speed.fit(X_train_w_speed, y_train_w_speed)

print(" Results from Random Forest " )
print("\n The best estimator across ALL searched params:\n",grid_RF_w_speed.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_RF_w_speed.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_RF_w_speed.best_params_)
print_regression_scores(grid_RF_w_speed.best_estimator_, X_test_w_speed, y_test_w_speed)

 Results from Random Forest 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=4, n_estimators=200, random_state=0)

 The best score across ALL searched params:
 0.70463525901735

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 200}
Mean Absolute Error (MAE): 2.5549820549372364
Mean Squared Error (MSE): 12.321949266666685
Root Mean Squared Error (RMSE): 3.510263418415587
R-squared (R²): 0.6867773354046554


In [35]:
GBR = GradientBoostingRegressor()

parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train_w_speed, y_train_w_speed)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print_regression_scores(grid_GBR.best_estimator_, X_test_w_speed, y_test_w_speed)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.03, max_depth=6, n_estimators=1500,
                          subsample=0.9)

 The best score across ALL searched params:
 0.7914195417869951

 The best parameters across ALL searched params:
 {'learning_rate': 0.03, 'max_depth': 6, 'n_estimators': 1500, 'subsample': 0.9}
Mean Absolute Error (MAE): 2.0403426347918354
Mean Squared Error (MSE): 7.712668637342105
Root Mean Squared Error (RMSE): 2.7771691769393714
R-squared (R²): 0.8039447680356541


In [50]:
write_model_to_file('win_speed_regressor', grid_GBR.best_estimator_)

In [38]:
with open('win_speed_regressor.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
print_regression_scores(regressor_loaded, X_test_w_speed, y_test_w_speed)

Mean Absolute Error (MAE): 1.699027029485098
Mean Squared Error (MSE): 5.203611545951439
Root Mean Squared Error (RMSE): 2.2811425965843166
R-squared (R²): 0.8677247374852303


### Wind direction

In [30]:
win_dir_forecasts = get_pivoted_by_provider(prepared_forecast_data, prepared_historical_data, 'wind_direction')
win_dir_forecasts

Unnamed: 0,city_id,forecast_type_id,time,wind_direction1,wind_direction2,wind_direction3,wind_direction
0,1,1,2024-03-03 00:00:00,95.3,253.82,227.0,250.0
1,1,1,2024-04-14 00:00:00,159.6,103.79,180.0,189.0
2,1,1,2024-04-18 00:00:00,262.9,100.20,352.0,1.0
3,1,1,2024-04-19 00:00:00,265.7,91.52,326.0,329.0
4,1,1,2024-04-19 00:00:00,265.7,91.52,326.0,329.0
...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,71.1,97.80,94.0,104.0
47943,259,2,2024-05-22 22:00:00,78.0,104.94,90.0,109.0
47944,259,2,2024-05-22 23:00:00,79.4,107.98,96.0,110.0
47945,259,2,2024-05-24 00:00:00,67.0,81.60,70.0,90.0


In [31]:
prepare_time_features(win_dir_forecasts)
win_dir_forecasts

Unnamed: 0,city_id,forecast_type_id,time,wind_direction1,wind_direction2,wind_direction3,wind_direction,month,day,hour
0,1,1,2024-03-03 00:00:00,95.3,253.82,227.0,250.0,3,3,0
1,1,1,2024-04-14 00:00:00,159.6,103.79,180.0,189.0,4,14,0
2,1,1,2024-04-18 00:00:00,262.9,100.20,352.0,1.0,4,18,0
3,1,1,2024-04-19 00:00:00,265.7,91.52,326.0,329.0,4,19,0
4,1,1,2024-04-19 00:00:00,265.7,91.52,326.0,329.0,4,19,0
...,...,...,...,...,...,...,...,...,...,...
47942,259,2,2024-05-22 21:00:00,71.1,97.80,94.0,104.0,5,22,21
47943,259,2,2024-05-22 22:00:00,78.0,104.94,90.0,109.0,5,22,22
47944,259,2,2024-05-22 23:00:00,79.4,107.98,96.0,110.0,5,22,23
47945,259,2,2024-05-24 00:00:00,67.0,81.60,70.0,90.0,5,24,0


In [32]:
win_dir_features = [col for col in win_dir_forecasts.columns if (col != 'wind_direction' and col !='time')]
X_win_dir = win_dir_forecasts[win_dir_features]
y_win_dir = win_dir_forecasts['wind_direction']
X_win_dir

Unnamed: 0,city_id,forecast_type_id,wind_direction1,wind_direction2,wind_direction3,month,day,hour
0,1,1,95.3,253.82,227.0,3,3,0
1,1,1,159.6,103.79,180.0,4,14,0
2,1,1,262.9,100.20,352.0,4,18,0
3,1,1,265.7,91.52,326.0,4,19,0
4,1,1,265.7,91.52,326.0,4,19,0
...,...,...,...,...,...,...,...,...
47942,259,2,71.1,97.80,94.0,5,22,21
47943,259,2,78.0,104.94,90.0,5,22,22
47944,259,2,79.4,107.98,96.0,5,22,23
47945,259,2,67.0,81.60,70.0,5,24,0


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_win_dir, y_win_dir, test_size=0.30)

In [34]:
RF = RandomForestRegressor(random_state=0)

parameters = {
    'n_estimators': [150, 200, 250],
    'max_depth': [1,2,3,4],
}

grid_RF = GridSearchCV(estimator=RF, param_grid = parameters, cv = 2, n_jobs=-1)
grid_RF.fit(X_train, y_train)

print(" Results from Random Forest " )
print("\n The best estimator across ALL searched params:\n",grid_RF.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_RF.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_RF.best_params_)
print_regression_scores(grid_RF.best_estimator_, X_test, y_test)

 Results from Random Forest 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=4, n_estimators=200, random_state=0)

 The best score across ALL searched params:
 0.4146943636692946

 The best parameters across ALL searched params:
 {'max_depth': 4, 'n_estimators': 200}
Mean Absolute Error (MAE): 46.09919112289741
Mean Squared Error (MSE): 5221.3347744047405
Root Mean Squared Error (RMSE): 72.25880413074064
R-squared (R²): 0.4223537765933856


In [39]:
GBR = GradientBoostingRegressor()

parameters = {'learning_rate': [0.01,0.02,0.03,0.04],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [100,500,1000, 1500],
                  'max_depth'    : [4,6,8,10]
                 }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print_regression_scores(grid_GBR.best_estimator_, X_test, y_test)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.01, max_depth=8, n_estimators=1500,
                          subsample=0.9)

 The best score across ALL searched params:
 0.4506403631300671

 The best parameters across ALL searched params:
 {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1500, 'subsample': 0.9}
Mean Absolute Error (MAE): 40.63331120011522
Mean Squared Error (MSE): 4608.632001814421
Root Mean Squared Error (RMSE): 67.88690596731023
R-squared (R²): 0.49013825277608913


In [40]:
write_model_to_file('win_direction_regressor', grid_GBR.best_estimator_)