<a href="https://www.kaggle.com/code/atifmasih/histgradientboostingregressor-on-climate-dataset?scriptVersionId=188686319" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/climate-change-dataset2020-2024/climate_change_dataset.csv


In [2]:
# Load the dataset
df = pd.read_csv('/kaggle/input/climate-change-dataset2020-2024/climate_change_dataset.csv')
print(df.head())

     Year Month  Avg_Temp (°C)       Max_Temp (°C)       Min_Temp (°C)  \
0     NaN   1.0      -3.460516  33.056918907353285                 NaN   
1  2020.0   2.0            NaN  25.901916083791665  3.1739745975583844   
2  2020.0   3.0       7.869842   18.63424149980029   10.42485208665568   
3     NaN   4.0      -0.049863  13.030562376875533  -9.196433418151315   
4  2020.0   5.0      19.895067   35.88212887551623  20.067999652410148   

   Precipitation (mm)       Humidity (%)    Wind_Speed (m/s)  \
0  184.89969755743786  89.62081302618182   9.742885739415993   
1   2.957244192049635  95.17102000342224   10.64824600272703   
2                 NaN                NaN                 NaN   
3   102.4544605414934            Unknown   0.898697634212508   
4  185.72922332959365  73.99994640624561  13.128301152215258   

  Solar_Irradiance (W/m²)     Cloud_Cover (%)  CO2_Concentration (ppm)  \
0                     NaN  58.530797670209076                      NaN   
1       252.3136442294

In [3]:
from sklearn.impute import SimpleImputer

# Identify numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Apply imputation
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

print(df.isnull().sum())

Year                          5
Month                         5
Avg_Temp (°C)                 0
Max_Temp (°C)                 7
Min_Temp (°C)                 8
Precipitation (mm)            4
Humidity (%)                  8
Wind_Speed (m/s)              2
Solar_Irradiance (W/m²)       5
Cloud_Cover (%)               4
CO2_Concentration (ppm)       0
Latitude                      6
Longitude                     3
Altitude (m)                  4
Proximity_to_Water (km)       6
Urbanization_Index            0
Vegetation_Index              3
ENSO_Index                    3
Particulate_Matter (µg/m³)    3
Sea_Surface_Temp (°C)         7
dtype: int64


In [4]:
# Replace known irrelevant values with NaN
irrelevant_values = ['Unknown', 'NAN', 99999]
df.replace(irrelevant_values, np.nan, inplace=True)

# Debug: Check for any NaN values
print("\nNumber of NaNs after replacing irrelevant values:")
print(df.isna().sum())


Number of NaNs after replacing irrelevant values:
Year                          6
Month                         6
Avg_Temp (°C)                 0
Max_Temp (°C)                 8
Min_Temp (°C)                 9
Precipitation (mm)            5
Humidity (%)                  9
Wind_Speed (m/s)              3
Solar_Irradiance (W/m²)       6
Cloud_Cover (%)               5
CO2_Concentration (ppm)       0
Latitude                      7
Longitude                     4
Altitude (m)                  5
Proximity_to_Water (km)       7
Urbanization_Index            2
Vegetation_Index              4
ENSO_Index                    4
Particulate_Matter (µg/m³)    4
Sea_Surface_Temp (°C)         8
dtype: int64


In [5]:
# Feature Engineering
# Calculate temperature anomalies (difference from mean temperature)
df['Temp_Anomaly (°C)'] = df['Avg_Temp (°C)'] - df['Avg_Temp (°C)'].mean()

# Create lag features for temperature (previous month temperature)
df['Lag1_Avg_Temp (°C)'] = df['Avg_Temp (°C)'].shift(1)
df['Lag2_Avg_Temp (°C)'] = df['Avg_Temp (°C)'].shift(2)

# Handle NaN values created by shifting
df[['Lag1_Avg_Temp (°C)', 'Lag2_Avg_Temp (°C)']] = df[['Lag1_Avg_Temp (°C)', 'Lag2_Avg_Temp (°C)']]

# Debug: Summary after feature engineering
print("\nData summary after feature engineering:")
print(df.describe(include='all'))


Data summary after feature engineering:
          Year Month  Avg_Temp (°C)       Max_Temp (°C)       Min_Temp (°C)  \
count       47    47      53.000000                  45                  44   
unique       5    12            NaN                  45                  44   
top     2022.0   1.0            NaN  33.056918907353285  3.1739745975583844   
freq        12     5            NaN                   1                   1   
mean       NaN   NaN      13.018699                 NaN                 NaN   
std        NaN   NaN      10.751219                 NaN                 NaN   
min        NaN   NaN      -4.965473                 NaN                 NaN   
25%        NaN   NaN       4.705118                 NaN                 NaN   
50%        NaN   NaN      12.919545                 NaN                 NaN   
75%        NaN   NaN      20.751242                 NaN                 NaN   
max        NaN   NaN      34.282303                 NaN                 NaN   

        Pr

In [6]:
from sklearn.preprocessing import StandardScaler

# Normalize/scale the features
scaler = StandardScaler()

# Select features to scale (excluding non-numeric and target column)
features_to_scale = df.columns.difference(['Year', 'Month', 'Latitude', 'Longitude', 'Avg_Temp (°C)'])

# Apply scaling
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Debug: Print summary of data after scaling
print("\nData summary after scaling:")
print(df.describe(include='all'))



Data summary after scaling:
          Year Month  Avg_Temp (°C)  Max_Temp (°C)  Min_Temp (°C)  \
count       47    47      53.000000   4.500000e+01   4.400000e+01   
unique       5    12            NaN            NaN            NaN   
top     2022.0   1.0            NaN            NaN            NaN   
freq        12     5            NaN            NaN            NaN   
mean       NaN   NaN      13.018699  -5.538779e-16   1.892426e-17   
std        NaN   NaN      10.751219   1.011300e+00   1.011561e+00   
min        NaN   NaN      -4.965473  -2.138360e+00  -1.536787e-01   
25%        NaN   NaN       4.705118  -6.782717e-01  -1.532907e-01   
50%        NaN   NaN      12.919545  -5.734913e-02  -1.524079e-01   
75%        NaN   NaN      20.751242   8.474912e-01  -1.516876e-01   
max        NaN   NaN      34.282303   1.537009e+00   6.557436e+00   

        Precipitation (mm)  Humidity (%)  Wind_Speed (m/s)  \
count         4.800000e+01  4.400000e+01      5.000000e+01   
unique            

In [7]:
# Define features (X) and target (y)
X = df[features_to_scale]
y = df['Avg_Temp (°C)']

In [8]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
X_train.head()

Unnamed: 0,Altitude (m),CO2_Concentration (ppm),Cloud_Cover (%),ENSO_Index,Humidity (%),Lag1_Avg_Temp (°C),Lag2_Avg_Temp (°C),Max_Temp (°C),Min_Temp (°C),Particulate_Matter (µg/m³),Precipitation (mm),Proximity_to_Water (km),Sea_Surface_Temp (°C),Solar_Irradiance (W/m²),Temp_Anomaly (°C),Urbanization_Index,Vegetation_Index,Wind_Speed (m/s)
8,0.0,0.727217,1.444684,,1.415877,-0.733685,-1.28439,-1.818575,-0.153035,-0.143966,-0.139443,-0.213201,-0.849814,0.739331,1.996712,-0.250185,-0.30208,-0.142896
26,0.0,1.034142,-1.550447,-0.206334,1.3106,-0.93658,0.403448,,-0.151236,-0.145618,-0.139924,-0.213201,-0.076211,-0.373624,0.0,-0.250335,-0.897556,-0.142714
6,0.0,-0.925491,-1.601726,-0.206263,,-1.017513,0.665002,,-0.153672,-0.14473,-0.151812,-0.213201,1.374239,0.381216,-1.300573,-0.250381,1.694936,-0.142679
34,0.0,0.0,1.097847,-0.206288,-1.668891,-1.597217,0.01826,,-0.153578,6.928201,-0.144939,-0.213201,1.162042,-1.389453,0.913762,-0.249779,1.580049,-0.143252
4,0.0,-1.675928,-1.622846,-0.206303,0.322491,-1.226824,-0.466004,1.257132,-0.151715,-0.144808,-0.140405,4.690416,-0.680081,-0.058492,0.64571,-0.250492,,-0.14253


In [10]:
X_test.head()

Unnamed: 0,Altitude (m),CO2_Concentration (ppm),Cloud_Cover (%),ENSO_Index,Humidity (%),Lag1_Avg_Temp (°C),Lag2_Avg_Temp (°C),Max_Temp (°C),Min_Temp (°C),Particulate_Matter (µg/m³),Precipitation (mm),Proximity_to_Water (km),Sea_Surface_Temp (°C),Solar_Irradiance (W/m²),Temp_Anomaly (°C),Urbanization_Index,Vegetation_Index,Wind_Speed (m/s)
19,0.0,0.65407,0.872879,-0.206324,1.297453,-0.008793,-0.259479,1.537009,-0.152199,-0.142969,-0.143119,-0.213201,-1.402217,0.979077,-0.684386,-0.24982,0.449672,-0.142676
41,0.0,-0.152769,-1.621741,-0.206318,0.052939,-1.684974,-0.435921,0.662846,-0.151228,-0.145386,-0.141404,-0.213201,,1.626361,-0.593207,-0.249512,-0.207636,-0.142771
47,0.0,1.608722,0.597473,-0.206257,0.453742,1.835628,0.021316,-2.13836,-0.153347,-0.144479,-0.15058,4.690416,-1.018022,-0.967657,1.358406,-0.249605,0.00875,-0.142444
12,0.0,0.169123,0.806144,-0.206253,0.010982,1.730845,-0.129991,-0.780134,,-0.145697,-0.140452,-0.213201,-1.605419,-0.604434,-0.509968,-0.249556,-0.395056,-0.142892
43,0.0,1.24871,-1.207729,-0.206282,0.263822,1.484785,-0.575894,-0.801371,-0.153437,-0.145445,-0.149356,-0.213201,-0.564428,-0.090274,-0.164162,-0.249801,-0.900701,-0.1429


In [11]:
# Debug: Check for NaNs in the split datasets
print("\nCheck for NaNs in X_train:")
print(X_train.isna().sum())
print("Check for NaNs in y_train:")
print(y_train.isna().sum())

print("\nCheck for NaNs in X_test:")
print(X_test.isna().sum())
print("Check for NaNs in y_test:")
print(y_test.isna().sum())



Check for NaNs in X_train:
Altitude (m)                  5
CO2_Concentration (ppm)       0
Cloud_Cover (%)               4
ENSO_Index                    4
Humidity (%)                  8
Lag1_Avg_Temp (°C)            1
Lag2_Avg_Temp (°C)            2
Max_Temp (°C)                 8
Min_Temp (°C)                 7
Particulate_Matter (µg/m³)    4
Precipitation (mm)            4
Proximity_to_Water (km)       7
Sea_Surface_Temp (°C)         5
Solar_Irradiance (W/m²)       6
Temp_Anomaly (°C)             0
Urbanization_Index            2
Vegetation_Index              4
Wind_Speed (m/s)              3
dtype: int64
Check for NaNs in y_train:
0

Check for NaNs in X_test:
Altitude (m)                  0
CO2_Concentration (ppm)       0
Cloud_Cover (%)               1
ENSO_Index                    0
Humidity (%)                  1
Lag1_Avg_Temp (°C)            0
Lag2_Avg_Temp (°C)            0
Max_Temp (°C)                 0
Min_Temp (°C)                 2
Particulate_Matter (µg/m³)    0
Precipi

In [12]:
from sklearn.ensemble import HistGradientBoostingRegressor
# Define the HistGradientBoostingRegressor model
hgb_model = HistGradientBoostingRegressor(random_state=42)

In [13]:
# Train the model
hgb_model.fit(X_train, y_train)




In [14]:
# Make predictions
y_pred = hgb_model.predict(X_test)


In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error on test set: {mae}')
mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared Error on test set: {mse}')
r2 = r2_score(y_test, y_pred)
print(f'R-squared Error on test set: {r2}')

Mean Absolute Error on test set: 8.24628469866132
Mean squared Error on test set: 94.11311849450648
R-squared Error on test set: -0.9518600405052695


In [16]:
# Compare predicted and actual values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison.head())

       Actual  Predicted
19   5.730457  15.817966
41   6.701457  18.115134
47  27.484780  19.123222
12   7.587887  22.973691
43  11.270483  17.757682
