# Random Forest

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Read the Smart City index CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = 'Smart_City_index_headers.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Prepare the data

In [2]:

# Build the feature matrix X and the regression target y.
#
# Columns excluded from X:
#   - 'Id', 'City', 'Country': row identifiers / labels rather than predictors
#   - 'SmartCity_Index': the target itself
#   - 'SmartCity_Index_relative_Edmonton': presumably a rescaling of the target,
#     which would leak the answer into the features — TODO confirm against the
#     dataset description
# What remains are the six Smart_* score columns listed in the
# feature-importance output at the end of the notebook.
X = data.drop(columns=['Id', 'City', 'Country', 'SmartCity_Index', 'SmartCity_Index_relative_Edmonton'])
y = data['SmartCity_Index']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest model

In [3]:
# Train a 100-tree random forest on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained;
# the fixed random_state makes the fitted forest reproducible.
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = rf.predict(X_test)

# Evaluate the model

In [4]:

# Score the held-out predictions: R² first, then MSE and the RMSE derived from it
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report the metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')

# Pair each feature column with the importance the fitted forest assigned to it,
# then rank from most to least important
importance_by_column = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_importance = importance_by_column.sort_values(ascending=False)
print("Feature Importance (sorted):")
print(feature_importance)

Mean Squared Error (MSE): 64195.54439047616
Root Mean Squared Error (RMSE): 253.36839658978025
R-squared (R²): 0.8930972130444514
Feature Importance (sorted):
Smart_Living         0.680798
Smart_Environment    0.117973
Smart_People         0.085488
Smart_Economy        0.048494
Smart_Government     0.035911
Smart_Mobility       0.031337
dtype: float64
