In [9]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import RobustScaler

# Load the dataset
file_path = r'..\Smart_City_index_headers.csv'
data = pd.read_csv(file_path)

# Select only numeric columns from the dataset
numeric_data = data.select_dtypes(include=['number'])

# Calculate the standard deviation for each numeric column
std_devs = numeric_data.std()
print("Standard Deviation for each numeric feature:")
print(std_devs)

# Calculate the variance for each numeric column
variances = numeric_data.var()
print("\nVariance for each numeric feature:")
print(variances)

Standard Deviation for each numeric feature:
Id                                     29.588849
Smart_Mobility                       1214.030137
Smart_Environment                    1724.032171
Smart_Government                     1153.375297
Smart_Economy                        1801.555148
Smart_People                         1449.096341
Smart_Living                         2286.397477
SmartCity_Index                       852.775180
SmartCity_Index_relative_Edmonton     852.783238
dtype: float64

Variance for each numeric feature:
Id                                   8.755000e+02
Smart_Mobility                       1.473869e+06
Smart_Environment                    2.972287e+06
Smart_Government                     1.330275e+06
Smart_Economy                        3.245601e+06
Smart_People                         2.099880e+06
Smart_Living                         5.227613e+06
SmartCity_Index                      7.272255e+05
SmartCity_Index_relative_Edmonton    7.272393e+05
dtype: float6

In [2]:
# Apply log transformation to high-variability features
features_to_transform = ['Smart_Living', 'Smart_Economy', 'Smart_Environment']
for feature in features_to_transform:
    data[feature] = np.log1p(data[feature])  # log1p ensures that zero values are also handled

# Prepare the data

In [3]:
# Prepare the data (drop non-numeric and target columns)
X = data.drop(columns=['Id', 'City', 'Country', 'SmartCity_Index', 'SmartCity_Index_relative_Edmonton'])
y = data['SmartCity_Index']

# Apply Standard Scaling to the features (scaling helps with kernel-based methods like RBF)
scaler = scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit Random Forest model

In [4]:
rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}


# Fit GridSearchCV to the training data
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           verbose=2,
                           scoring='r2',
                           error_score='raise')

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)  # Ensure fit() is called before accessing best_estimator_

# Step 7: Get the best parameters and evaluate the model
print("Best Hyperparameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Step 8: Predict on the test set and evaluate
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R² on the test set: {r2}")

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
R² on the test set: 0.8992303268587287


In [5]:
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model

In [6]:

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Output the results
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')

# Display feature importance
feature_importance = pd.Series(best_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Feature Importance (sorted):")
print(feature_importance)

Mean Squared Error (MSE): 60512.585401951445
Root Mean Squared Error (RMSE): 245.99305966216087
R-squared (R²): 0.8992303268587287
Feature Importance (sorted):
Smart_Living         0.396012
Smart_Environment    0.201108
Smart_Government     0.133822
Smart_People         0.113129
Smart_Economy        0.083616
Smart_Mobility       0.072314
dtype: float64


In [14]:
# Cross-validation (5-fold)
cv_scores = cross_val_score(best_model, X_scaled, y, cv=12, scoring='r2')
print(f"Average R² across 5 folds: {np.mean(cv_scores)}")


Average R² across 5 folds: -0.06936002799162856
