# Gradient Boost

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the dataset
file_path = r'..\Smart_City_index_headers.csv'
data = pd.read_csv(file_path)

# Prepare the data (drop non-numeric and target columns)
X = data.drop(columns=['Id', 'City', 'Country', 'SmartCity_Index', 'SmartCity_Index_relative_Edmonton'])
y = data['SmartCity_Index']

# Apply Standard Scaling to the features (scaling helps with kernel-based methods like RBF)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [3]:
# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit the Gradient Boosting Regressor model
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gbr.predict(X_test)



In [4]:
# Feature importance
feature_importance = pd.Series(gbr.feature_importances_, index=X.columns)
feature_importance_sorted = feature_importance.sort_values(ascending=False)
print("Gradient Boosting feature importance (sorted):\n", feature_importance_sorted)



Gradient Boosting feature importance (sorted):
 Smart_Living         0.673256
Smart_Environment    0.107818
Smart_People         0.069775
Smart_Economy        0.065915
Smart_Mobility       0.042582
Smart_Government     0.040653
dtype: float64


In [5]:
# Evaluate the model on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Output the results
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')


Mean Squared Error (MSE): 52320.73501414841
Root Mean Squared Error (RMSE): 228.73726197134653
R-squared (R²): 0.912871953315735


In [6]:
# Cross-validation (5-fold)
cv_scores = cross_val_score(gbr, X_scaled, y, cv=5, scoring='r2')
print(f"Average R² across 5 folds: {np.mean(cv_scores)}")

Average R² across 5 folds: 0.20106576630478318
