In [1]:
import pandas as pd

# Replace 'your_dataset_file.csv' with the actual name of the file you downloaded
try:
    df = pd.read_csv('climate_change_dataset.csv')
    print("Dataset loaded successfully!")
    # Display the first 5 rows to get a glimpse of the data
    print(df.head().to_markdown(index=False, numalign="left", stralign="left"))
    # Display basic information about the dataset
    print("\nDataset Info:")
    df.info()
except FileNotFoundError:
    print("Error: Make sure 'your_dataset_file.csv' is in the correct directory.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!
| Year   | Month   | Avg_Temp (°C)   | Max_Temp (°C)   | Min_Temp (°C)   | Precipitation (mm)   | Humidity (%)      | Wind_Speed (m/s)   | Solar_Irradiance (W/m²)   | Cloud_Cover (%)   | CO2_Concentration (ppm)   | Latitude   | Longitude   | Altitude (m)   | Proximity_to_Water (km)   | Urbanization_Index   | Vegetation_Index   | ENSO_Index   | Particulate_Matter (µg/m³)   | Sea_Surface_Temp (°C)   |
|:-------|:--------|:----------------|:----------------|:----------------|:---------------------|:------------------|:-------------------|:--------------------------|:------------------|:--------------------------|:-----------|:------------|:---------------|:--------------------------|:---------------------|:-------------------|:-------------|:-----------------------------|:------------------------|
| nan    | 1       | -3.46052        | 33.0569         | nan             | 184.9                | 89.62081302618182 | 9.74289            | nan                       

In [3]:
import pandas as pd
import numpy as np

# Assuming 'df' is the DataFrame you loaded initially

print("Original number of missing values per column:")
print(df.isnull().sum().to_markdown(numalign="left", stralign="left"))

# Create a copy of the DataFrame
df_imputed = df.copy()

# Identify numerical columns for imputation
numerical_cols = df_imputed.select_dtypes(include=np.number).columns

# Impute missing values in numerical columns with the mean
for col in numerical_cols:
    if df_imputed[col].isnull().any(): # Check if the column actually has missing values
        mean_val = df_imputed[col].mean()
        df_imputed[col] = df_imputed[col].fillna(mean_val) # Avoids the Future Warning

# For columns that might be read as 'object' but contain numbers (like 'Unknown' from the snippet),
# you might need to convert them to numeric first and then impute.
# Let's try to coerce any object columns that should be numeric.
for col in df_imputed.select_dtypes(include='object').columns:
    # Attempt to convert to numeric, errors='coerce' will turn unparseable values into NaN
    df_imputed[col] = pd.to_numeric(df_imputed[col], errors='coerce')
    if df_imputed[col].isnull().any(): # Check if there are now NaNs after coercion
         # Decide on imputation strategy for these coerced columns - mean is simple
        if pd.api.types.is_numeric_dtype(df_imputed[col]):
            mean_val = df_imputed[col].mean()
            df_imputed[col] = df_imputed[col].fillna(mean_val) # Avoids the Future Warning
        # else: # Handle non-numeric object columns if necessary, maybe with mode or a placeholder string
            # df_imputed[col] = df_imputed[col].fillna('Missing')


print("\nMissing values per column after imputation:")
print(df_imputed.isnull().sum().to_markdown(numalign="left", stralign="left"))

# Display the first few rows of the imputed DataFrame to check
print("\nFirst 5 rows after imputation:")
print(df_imputed.head().to_markdown(index=False, numalign="left", stralign="left"))

Original number of missing values per column:
|                            | 0   |
|:---------------------------|:----|
| Year                       | 5   |
| Month                      | 5   |
| Avg_Temp (°C)              | 7   |
| Max_Temp (°C)              | 7   |
| Min_Temp (°C)              | 8   |
| Precipitation (mm)         | 4   |
| Humidity (%)               | 8   |
| Wind_Speed (m/s)           | 2   |
| Solar_Irradiance (W/m²)    | 5   |
| Cloud_Cover (%)            | 4   |
| CO2_Concentration (ppm)    | 6   |
| Latitude                   | 6   |
| Longitude                  | 3   |
| Altitude (m)               | 4   |
| Proximity_to_Water (km)    | 6   |
| Urbanization_Index         | 3   |
| Vegetation_Index           | 3   |
| ENSO_Index                 | 3   |
| Particulate_Matter (µg/m³) | 3   |
| Sea_Surface_Temp (°C)      | 7   |

Missing values per column after imputation:
|                            | 0   |
|:---------------------------|:----|
| Year               

In [22]:
import pandas as pd
import numpy as np

# Assuming 'df_imputed' is the DataFrame after handling missing values
# Let's create a small number of synthetic data points by adding random noise to existing ones
n_synthetic_samples = 85000 # You can adjust the number of synthetic samples to generate

# Select random rows from the imputed data to create synthetic data from
synthetic_data = df_imputed.sample(n=n_synthetic_samples, replace=True, random_state=42).copy()

# Add a small amount of random noise to the numerical columns
# The scale of the noise should be relatively small compared to the range of the data
noise_scale = 0.05 # You can adjust this value - smaller means less noise

for col in synthetic_data.select_dtypes(include=np.number).columns:
    # Calculate the range of the column to scale the noise
    col_range = synthetic_data[col].max() - synthetic_data[col].min()
    if col_range == 0: # Avoid division by zero if all values are the same
        col_range = 1
    
    # Generate random noise scaled by the column's range and the noise_scale
    noise = np.random.randn(n_synthetic_samples) * col_range * noise_scale
    
    # Add the noise to the column
    synthetic_data[col] = synthetic_data[col] + noise

# Combine the original imputed data with the synthetic data
df_combined = pd.concat([df_imputed, synthetic_data], ignore_index=True)

print(f"\nOriginal number of rows: {len(df_imputed)}")
print(f"Number of synthetic rows added: {n_synthetic_samples}")
print(f"Total number of rows in the combined dataset: {len(df_combined)}")

# Display the first 5 rows of the combined dataset to see the original and synthetic data
print("\nFirst 5 rows of the combined dataset:")
print(df_combined.head().to_markdown(index=False, numalign="left", stralign="left"))

# Display the last 5 rows to see some of the synthetic data
print("\nLast 5 rows of the combined dataset (likely synthetic):")
print(df_combined.tail().to_markdown(index=False, numalign="left", stralign="left"))


Original number of rows: 53
Number of synthetic rows added: 85000
Total number of rows in the combined dataset: 85053

First 5 rows of the combined dataset:
| Year    | Month   | Avg_Temp (°C)   | Max_Temp (°C)   | Min_Temp (°C)   | Precipitation (mm)   | Humidity (%)   | Wind_Speed (m/s)   | Solar_Irradiance (W/m²)   | Cloud_Cover (%)   | CO2_Concentration (ppm)   | Latitude   | Longitude   | Altitude (m)   | Proximity_to_Water (km)   | Urbanization_Index   | Vegetation_Index   | ENSO_Index   | Particulate_Matter (µg/m³)   | Sea_Surface_Temp (°C)   |
|:--------|:--------|:----------------|:----------------|:----------------|:---------------------|:---------------|:-------------------|:--------------------------|:------------------|:--------------------------|:-----------|:------------|:---------------|:--------------------------|:---------------------|:-------------------|:-------------|:-----------------------------|:------------------------|
| 2021.87 | 1       | -3.46052        | 

In [23]:
# --- Step 6: Re-split the Combined Data ---
print("--- Step 6: Re-splitting the Combined Data ---")

from sklearn.model_selection import train_test_split
import pandas as pd # Assuming pandas is already imported
# Assuming df_combined is available from the previous synthetic data generation step

# Define your features (X) and target (y) - Use the same ones as before
features = ['Year', 'Month', 'Avg_Temp (°C)', 'Max_Temp (°C)', 'Min_Temp (°C)',
            'Precipitation (mm)', 'Humidity (%)', 'Wind_Speed (m/s)',
            'Cloud_Cover (%)', 'CO2_Concentration (ppm)', 'Latitude', 'Longitude',
            'Altitude (m)', 'Proximity_to_Water (km)', 'Urbanization_Index',
            'Vegetation_Index', 'ENSO_Index', 'Particulate_Matter (µg/m³)',
            'Sea_Surface_Temp (°C)'] # Updated features based on your confirmation

target = 'Solar_Irradiance (W/m²)' # Your target variable

# Ensure the target column exists and features are in the combined DataFrame
if target not in df_combined.columns:
    print(f"Error: Target column '{target}' not found in the combined DataFrame. Please check the column name.")
else:
     # Drop any potential rows where the target is still missing (should be 0 after original imputation)
    df_cleaned_target_combined = df_combined.dropna(subset=[target])

    # Verify all selected features are in the DataFrame
    selected_features = [f for f in features if f in df_cleaned_target_combined.columns]
    missing_features = [f for f in features if f not in df_cleaned_target_combined.columns]

    if missing_features:
        print(f"Warning: The following feature columns were not found in the DataFrame and will be excluded: {missing_features}")

    if not selected_features:
        print("Error: No valid feature columns selected. Please check your feature list against the DataFrame columns.")
    else:
        X_combined = df_cleaned_target_combined[selected_features]
        y_combined = df_cleaned_target_combined[target]

        # Split the combined data into training and testing sets
        # We'll use 80% for training and 20% for testing, same random_state for consistency
        X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

        print("\nCombined data splitting complete.")
        print(f"Combined training features shape (X_train_combined): {X_train_combined.shape}")
        print(f"Combined testing features shape (X_test_combined): {X_test_combined.shape}")
        print(f"Combined training target shape (y_train_combined): {y_train_combined.shape}")
        print(f"Combined testing target shape (y_test_combined): {y_test_combined.shape}")

        # --- Step 7: Retrain the Model on the Combined Data ---
        print("\n--- Step 7: Retraining the Model on the Combined Data ---")

        from sklearn.ensemble import RandomForestRegressor
        # Assuming Random Forest Regressor model is your choice and necessary libraries are imported
        # Assuming X_train_combined and y_train_combined are available

        # Choose the Random Forest Regressor model with the same parameters
        model_combined = RandomForestRegressor(n_estimators=100, random_state=42)

        # Train the model using the combined training data
        print("\nTraining the Random Forest Regressor model on combined data...")
        model_combined.fit(X_train_combined, y_train_combined)
        print("Model training on combined data complete.")

        # --- Step 8: Re-evaluate the Model on the Combined Test Data ---
        print("\n--- Step 8: Re-evaluating the Model on the Combined Test Data ---")

        from sklearn.metrics import mean_absolute_error, r2_score
        import pandas as pd # Assuming pandas is already imported
        # Assuming model_combined, X_test_combined, and y_test_combined are available

        # Make predictions on the combined test data
        print("\nMaking predictions on the combined test data...")
        y_pred_combined = model_combined.predict(X_test_combined)
        print("Predictions made on combined data.")

        # Evaluate the model's performance on combined data
        mae_combined = mean_absolute_error(y_test_combined, y_pred_combined)
        r2_combined = r2_score(y_test_combined, y_pred_combined)

        print(f"\nModel Evaluation Results (Combined Data):")
        print(f"Mean Absolute Error (MAE): {mae_combined:.2f}")
        print(f"R-squared (R²): {r2_combined:.2f}")

--- Step 6: Re-splitting the Combined Data ---

Combined data splitting complete.
Combined training features shape (X_train_combined): (68042, 19)
Combined testing features shape (X_test_combined): (17011, 19)
Combined training target shape (y_train_combined): (68042,)
Combined testing target shape (y_test_combined): (17011,)

--- Step 7: Retraining the Model on the Combined Data ---

Training the Random Forest Regressor model on combined data...
Model training on combined data complete.

--- Step 8: Re-evaluating the Model on the Combined Test Data ---

Making predictions on the combined test data...
Predictions made on combined data.

Model Evaluation Results (Combined Data):
Mean Absolute Error (MAE): 9.95
R-squared (R²): 0.97


In [26]:
import joblib

# Assuming model_combined is your trained Random Forest Regressor model
# Define a filename for your model
model_filename = 'solar_potential_model.pkl'

# Save the trained model to the file
joblib.dump(model_combined, model_filename)

print(f"Trained model saved to {model_filename}")

Trained model saved to solar_potential_model.pkl


In [27]:
from sklearn.metrics import mean_squared_error
import numpy as np # Import numpy for square root
# Assuming y_test_combined and y_pred_combined are available from the previous evaluation step

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test_combined, y_pred_combined)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print(f"\nAdditional Model Evaluation Results:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


Additional Model Evaluation Results:
Mean Squared Error (MSE): 160.78
Root Mean Squared Error (RMSE): 12.68
