In [154]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
# Step 1: Read the historical weather observations into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
csv_path = 'weather.csv'
data = pd.read_csv(csv_path)

In [155]:
# Show the column labels of the loaded frame (the cell's last expression is displayed).
data.columns

Index(['Year', 'Month', 'Rainfall', 'Minimum Temperature', 'Wind Speed',
       'Wind Direction', 'Maximum Temperature', 'Relative Humidity'],
      dtype='object')

In [156]:
# Label Encoding for the 'Month' feature.
# The codes are written into the feature frame X instead of back into `data`,
# so this cell can be re-run safely: `data['Month']` keeps its raw values and
# `label_encoder` is always fitted on the original labels (re-fitting it on
# already-encoded integers would make later `label_encoder.transform([...])`
# calls on month names fail).
# NOTE(review): LabelEncoder numbers classes in sorted order, so if 'Month'
# holds month names the codes are alphabetical, not calendar order - confirm
# that this is intended before changing it (saved models depend on the codes).
label_encoder = LabelEncoder()
month_codes = label_encoder.fit_transform(data['Month'])

# Split the data into features and target variables.
# X: calendar features only; y: one column per quantity to predict.
X = data[['Year', 'Month']].assign(Month=month_codes)
y = data[['Minimum Temperature', 'Wind Speed', 'Wind Direction', 'Maximum Temperature', 'Relative Humidity', 'Rainfall']]

# Split the data into train, validation, and test sets.
# First hold out 30% of the rows, then halve that remainder into validation
# and test; both splits use a fixed random_state so they are reproducible.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)


In [157]:
# Step 6: Model Selection and Training for each target variable.
# One independent RandomForestRegressor is fitted per target column on X_train.
# random_state is pinned (same seed as the train/validation/test split) because
# forest construction is randomized; without it the MSE values printed below
# change on every Restart & Run All.
models = {}
for target_column in y.columns:
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train[target_column])
    models[target_column] = model

# Step 7: Evaluation on validation set for each target variable.
# mse_val maps target column name -> mean squared error on (X_val, y_val).
mse_val = {}
for target_column, model in models.items():
    y_val_pred = model.predict(X_val)
    mse_val[target_column] = mean_squared_error(y_val[target_column], y_val_pred)
    print(f"Mean Squared Error on Validation Set for {target_column}:", mse_val[target_column])

# Step 8: Evaluation on test set for each target variable.
# mse_test maps target column name -> mean squared error on (X_test, y_test).
mse_test = {}
for target_column, model in models.items():
    y_test_pred = model.predict(X_test)
    mse_test[target_column] = mean_squared_error(y_test[target_column], y_test_pred)
    print(f"Mean Squared Error on Test Set for {target_column}:", mse_test[target_column])


Mean Squared Error on Validation Set for Minimum Temperature: 2.2238925555555533
Mean Squared Error on Validation Set for Wind Speed: 0.49270000000000014
Mean Squared Error on Validation Set for Wind Direction: 1508.4466666666665
Mean Squared Error on Validation Set for Maximum Temperature: 0.6027749999999952
Mean Squared Error on Validation Set for Relative Humidity: 60.68560000000001
Mean Squared Error on Validation Set for Rainfall: 4504.716094852225
Mean Squared Error on Test Set for Minimum Temperature: 1.3195705000000102
Mean Squared Error on Test Set for Wind Speed: 0.6138600000000001
Mean Squared Error on Test Set for Wind Direction: 1497.3020000000001
Mean Squared Error on Test Set for Maximum Temperature: 1.5608482000000004
Mean Squared Error on Test Set for Relative Humidity: 29.76901
Mean Squared Error on Test Set for Rainfall: 7489.478217663005


In [158]:
# Tabulate the random-forest errors: one row per target column,
# one column per hold-out split.
rf_errors = {'Validation MSE': mse_val, 'Test MSE': mse_test}
mse_df = pd.DataFrame(rf_errors)

mse_df

Unnamed: 0,Validation MSE,Test MSE
Minimum Temperature,2.223893,1.319571
Wind Speed,0.4927,0.61386
Wind Direction,1508.446667,1497.302
Maximum Temperature,0.602775,1.560848
Relative Humidity,60.6856,29.76901
Rainfall,4504.716095,7489.478218


In [159]:
# Now using XGBRegressor with default hyperparameters.

# Step 6: Model Selection and Training for each target variable.
# One XGBRegressor is fitted per target column on X_train. The seed is pinned
# to match the XGBRegressor(random_state=42) used in the grid-search cell, so
# the default and tuned models are configured consistently.
# Note: `models` is rebound here and no longer refers to the random forests.
models = {}
for target_column in y.columns:
    model = XGBRegressor(random_state=42)
    model.fit(X_train, y_train[target_column])
    models[target_column] = model

# Step 7: Evaluation on validation set for each target variable.
# xgb_mse_val maps target column name -> mean squared error on (X_val, y_val).
xgb_mse_val = {}
for target_column, model in models.items():
    y_val_pred = model.predict(X_val)
    xgb_mse_val[target_column] = mean_squared_error(y_val[target_column], y_val_pred)
    print(f"Mean Squared Error on Validation Set for {target_column}:", xgb_mse_val[target_column])

# Step 8: Evaluation on test set for each target variable.
# xgb_mse_test maps target column name -> mean squared error on (X_test, y_test).
xgb_mse_test = {}
for target_column, model in models.items():
    y_test_pred = model.predict(X_test)
    xgb_mse_test[target_column] = mean_squared_error(y_test[target_column], y_test_pred)
    print(f"Mean Squared Error on Test Set for {target_column}:", xgb_mse_test[target_column])


Mean Squared Error on Validation Set for Minimum Temperature: 4.1877104946155725
Mean Squared Error on Validation Set for Wind Speed: 0.6222467658474657
Mean Squared Error on Validation Set for Wind Direction: 1296.5279374202557
Mean Squared Error on Validation Set for Maximum Temperature: 0.9558027388543662
Mean Squared Error on Validation Set for Relative Humidity: 120.60419509424375
Mean Squared Error on Validation Set for Rainfall: 6450.722808614349
Mean Squared Error on Test Set for Minimum Temperature: 0.6784084032571112
Mean Squared Error on Test Set for Wind Speed: 0.8556377795686956
Mean Squared Error on Test Set for Wind Direction: 1820.1678827874828
Mean Squared Error on Test Set for Maximum Temperature: 1.7954854586656572
Mean Squared Error on Test Set for Relative Humidity: 31.59636360953009
Mean Squared Error on Test Set for Rainfall: 12665.59796085662


In [160]:
# Tabulate the default-XGBoost errors (rows: target columns, columns: splits).
# This rebinds mse_df, replacing the table built from the previous model.
mse_df = pd.DataFrame(
    data={
        'Validation MSE': xgb_mse_val,
        'Test MSE': xgb_mse_test,
    }
)

mse_df

Unnamed: 0,Validation MSE,Test MSE
Minimum Temperature,4.18771,0.678408
Wind Speed,0.622247,0.855638
Wind Direction,1296.527937,1820.167883
Maximum Temperature,0.955803,1.795485
Relative Humidity,120.604195,31.596364
Rainfall,6450.722809,12665.597961


In [161]:
# Hyperparameter tuning: an exhaustive grid search is run separately for every
# target column, and the refitted best estimator of each search is kept.

# Candidate values tried for each XGBRegressor hyperparameter.
params = {
    'n_estimators': [10, 30, 50, 25, 47, 80, 150],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
}

xgb_models = {}
for target_column in y.columns:
    # 5-fold cross-validation on the training split only; candidates are
    # ranked by negated MSE, and all available cores are used.
    grid_search = GridSearchCV(
        estimator=XGBRegressor(random_state=42),
        param_grid=params,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train[target_column])
    best_model = grid_search.best_estimator_
    xgb_models[target_column] = best_model
    print(f"Best Hyperparameters for {target_column}:", grid_search.best_params_)

# Validation-set error of each tuned model.
# Rebinds xgb_mse_val, so the default-model numbers are no longer available.
xgb_mse_val = {}
for target_column, model in xgb_models.items():
    y_val_pred = model.predict(X_val)
    val_error = mean_squared_error(y_val[target_column], y_val_pred)
    xgb_mse_val[target_column] = val_error
    print(f"Mean Squared Error on Validation Set for {target_column}:", xgb_mse_val[target_column])

# Test-set error of each tuned model (rebinds xgb_mse_test likewise).
xgb_mse_test = {}
for target_column, model in xgb_models.items():
    y_test_pred = model.predict(X_test)
    test_error = mean_squared_error(y_test[target_column], y_test_pred)
    xgb_mse_test[target_column] = test_error
    print(f"Mean Squared Error on Test Set for {target_column}:", xgb_mse_test[target_column])


Best Hyperparameters for Minimum Temperature: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 150}
Best Hyperparameters for Wind Speed: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 25}
Best Hyperparameters for Wind Direction: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 80}
Best Hyperparameters for Maximum Temperature: {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 25}
Best Hyperparameters for Relative Humidity: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Hyperparameters for Rainfall: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 47}
Mean Squared Error on Validation Set for Minimum Temperature: 2.8089201996160993
Mean Squared Error on Validation Set for Wind Speed: 0.479647004433174
Mean Squared Error on Validation Set for Wind Direction: 2501.799754013954
Mean Squared Error on Validation Set for Maximum Temperature: 0.578656883083377
Mean Squared Error on Validation Set for Relative Humidity: 120.08105322871577
Mean Squ

In [162]:
# Tabulate whatever xgb_mse_val / xgb_mse_test currently hold
# (rows: target columns, columns: hold-out splits).
validation_errors = pd.Series(xgb_mse_val)
test_errors = pd.Series(xgb_mse_test)
mse_df = pd.DataFrame({'Validation MSE': validation_errors, 'Test MSE': test_errors})

mse_df

Unnamed: 0,Validation MSE,Test MSE
Minimum Temperature,2.80892,0.497567
Wind Speed,0.479647,0.560375
Wind Direction,2501.799754,1926.75698
Maximum Temperature,0.578657,1.546685
Relative Humidity,120.081053,30.673239
Rainfall,7860.889953,9763.70389


In [163]:
# Sample data for prediction: a single (Year, Month) row. The month label is
# converted to its integer code with the already-fitted label_encoder.
month_code = label_encoder.transform(['August'])
sample_data = pd.DataFrame({'Year': [2024], 'Month': month_code})

# Predict every target with its XGBRegressor from xgb_models; each entry is
# the array returned by predict() for the one sample row.
predicted_values = {
    target_column: model.predict(sample_data)
    for target_column, model in xgb_models.items()
}

# Display the predicted values
for target_column, value in predicted_values.items():
    print(f"Predicted {target_column} for the month:", value)


Predicted Minimum Temperature for the month: [21.779049]
Predicted Wind Speed for the month: [4.7748785]
Predicted Wind Direction for the month: [281.1286]
Predicted Maximum Temperature for the month: [30.519278]
Predicted Relative Humidity for the month: [80.49191]
Predicted Rainfall for the month: [181.40904]


In [164]:
from joblib import dump

# Persist every model in xgb_models to its own file in the working directory,
# named "<target column>_xgb_model.sav".
for target_column in xgb_models:
    model_path = f'{target_column}_xgb_model.sav'
    dump(xgb_models[target_column], model_path)


In [165]:
# One-row feature frame (Year, Month) used as prediction input; the month
# label is encoded with the fitted label_encoder.
sample_year = [2024]
sample_month = label_encoder.transform(['August'])
sample_data = pd.DataFrame({'Year': sample_year, 'Month': sample_month})

In [166]:
from joblib import load

# Load one saved model per target column from "<target column>_xgb_model.sav",
# keyed by target column name.
loaded_models = {
    target_column: load(f'{target_column}_xgb_model.sav')
    for target_column in y.columns
}

# Predict the sample row with each reloaded model.
predicted_values = {}
for target_column, model in loaded_models.items():
    prediction = model.predict(sample_data)
    predicted_values[target_column] = prediction

# Display the predicted values
for target_column, value in predicted_values.items():
    print(f"Predicted {target_column} for the month:", value)


Predicted Minimum Temperature for the month: [21.779049]
Predicted Wind Speed for the month: [4.7748785]
Predicted Wind Direction for the month: [281.1286]
Predicted Maximum Temperature for the month: [30.519278]
Predicted Relative Humidity for the month: [80.49191]
Predicted Rainfall for the month: [181.40904]
