In [2]:
# Attach Google Drive to this Colab runtime so files under it can be read
# through the /content/drive mount point. force_remount=True asks Colab to
# redo the mount even if one is already registered.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [3]:
import pandas as pd

# Read the combined dataset from the mounted Drive folder.
dataset = pd.read_csv('/content/drive/My Drive/Total_Combine_Dataset1.csv')

# Quick sanity check: show the first rows so the loaded columns can be inspected.
print(dataset.head())

# GHI (Global Horizontal Irradiance) is the regression target.
target = dataset['GHI']

# DateTime and City are excluded from the model inputs, and GHI is removed so
# the target itself is not among the features.
# NOTE(review): every other column stays in as an input. If that includes
# irradiance components such as DHI/DNI, which are closely tied to GHI,
# confirm this is intended and not target leakage.
non_feature_columns = ['DateTime', 'City', 'GHI']
features = dataset.drop(non_feature_columns, axis='columns')


        DateTime       City  Temperature  DHI  DNI  Surface Albedo  \
0  1/1/2019 0:00  Bandarban     0.286885  0.0  0.0        0.166667   
1  1/1/2019 1:00  Bandarban     0.274590  0.0  0.0        0.166667   
2  1/1/2019 2:00  Bandarban     0.262295  0.0  0.0        0.166667   
3  1/1/2019 3:00  Bandarban     0.258197  0.0  0.0        0.166667   
4  1/1/2019 4:00  Bandarban     0.254098  0.0  0.0        0.166667   

   Cloud Type  GHI  
0           3  0.0  
1           3  0.0  
2           1  0.0  
3           3  0.0  
4           3  0.0  


# Split the data


In [4]:
from sklearn.model_selection import train_test_split

# Two-stage split producing 70% training / 15% validation / 15% testing.
# NOTE(review): train_test_split shuffles rows by default, so this is a random
# split rather than a chronological or per-city one; confirm that is intended.

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Stage 2: divide the held-out 30% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Report how many rows ended up in each partition.
print(f"Training size: {len(X_train)}, Validation size: {len(X_val)}, Testing size: {len(X_test)}")


Training size: 564862, Validation size: 121042, Testing size: 121043


# Train and Evaluate Models

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

# Candidate regressors keyed by display name. Insertion order is preserved, so
# downstream loops see the models in the order listed here.
# The tree-based models are kept small (10 estimators and/or shallow depth),
# presumably to keep fitting fast; confirm before treating this comparison as
# final. random_state=42 is fixed wherever the estimator accepts a seed.
models = dict([
    ("Linear Regression", LinearRegression()),
    (
        "Random Forest",
        RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42),
    ),
    (
        "Gradient Boosting",
        GradientBoostingRegressor(n_estimators=10, max_depth=3, random_state=42),
    ),
    ("Decision Tree", DecisionTreeRegressor(max_depth=5, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=10, max_depth=5, random_state=42)),
])


In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Validation metrics per model: {model name: {"RMSE": ..., "MAE": ..., "R²": ...}}
performance = {}

# Fit every candidate on the training split and score it on the validation split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # RMSE is the square root of the mean squared error.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    performance[name] = {"RMSE": rmse, "MAE": mae, "R²": r2}

# Print one summary line per model, in the order the models were evaluated.
for model_name in performance:
    metrics = performance[model_name]
    print(f"{model_name}: RMSE={metrics['RMSE']}, MAE={metrics['MAE']}, R²={metrics['R²']}")


Linear Regression: RMSE=0.052962378734509805, MAE=0.03417097890003041, R²=0.9658305576231774
Random Forest: RMSE=0.04553061324008795, MAE=0.022455196796623726, R²=0.9747471775249861
Gradient Boosting: RMSE=0.11589348311372645, MAE=0.09072121194121933, R²=0.8363858135289786
Decision Tree: RMSE=0.04754395052383291, MAE=0.023502739773477728, R²=0.9724644689428071
XGBoost: RMSE=0.020392797935696563, MAE=0.01226083375447935, R²=0.9949341006349843


# Identify the best model


In [7]:
# Pick the model with the highest validation R² (higher is better).
# best_model holds the model's display name (a key of `performance`), not the
# fitted estimator. On ties, the model that was evaluated first wins.
r2_by_model = {label: scores['R²'] for label, scores in performance.items()}
best_model = max(r2_by_model, key=r2_by_model.get)
print(f"Best model: {best_model}")


Best model: XGBoost
