### **Import all packages** ##

In [7]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Set style for better looking plots
plt.style.use('default')
%matplotlib inline

print("All packages imported successfully!")

All packages imported successfully!


### **Load objectives data**

In [6]:
df = pd.read_csv(r'C:\Users\ABI\OneDrive - NIVA\Documents\GitHub\SWMM_MOO\10_Analysis\1002_PCA\1002_All_objectives.csv', 
                 sep=';', 
                 index_col='sim')

print("Objectives Data Shape:", df.shape)
print("\nFirst 5 rows of objectives data:")
display(df.head())

# Display basic statistics
print("\nBasic statistics of objectives data:")
display(df.describe())

# Identify pollutant columns
pollutant_columns = ['CO2', 'TSS', 'TP', 'TN', 'Cu', 'Pb', 'Zn']
print("Data for sim42 (the outlier):")
print(df.loc['sim42', pollutant_columns])

# Create a clean copy of the entire DataFrame
df_clean = df.copy()

# Apply capping to handle outliers in pollutant columns
for col in pollutant_columns:
    upper_cap = df_clean[col].quantile(0.99)
    lower_cap = df_clean[col].quantile(0.01)
    df_clean[col] = df_clean[col].clip(lower=lower_cap, upper=upper_cap)

print("\nAfter capping - value for sim42 TSS:", df_clean.loc['sim42', 'TSS'])
print("After capping - value for sim42 CO2:", df_clean.loc['sim42', 'CO2'])

Objectives Data Shape: (150, 19)

First 5 rows of objectives data:


Unnamed: 0_level_0,PR2,PR5,PR10,PR20,TR,PHI,Temp,BGF,Inv,Maint,UNA,GA,CO2,TSS,TP,TN,Cu,Pb,Zn
sim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
sim1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sim2,0.29,0.24,0.25,0.28,0.59,0.55,0.002,0.45,2275334.205,122996.9785,105.342624,1614.908265,3148.518124,0.83,0.79,0.8,0.81,0.83,0.83
sim3,0.4,0.29,0.25,0.36,0.36,0.34,0.0012,0.22,1386333.5,76818.38861,82.742087,1069.562179,895.855142,0.71,0.71,0.71,0.71,0.71,0.71
sim4,0.22,0.22,0.25,0.23,0.44,0.41,0.0016,0.24,2046027.988,117080.8556,73.216244,1554.823203,1050.302508,0.45,0.4,0.41,0.42,0.45,0.45
sim5,0.46,0.33,0.27,0.33,0.4,0.36,0.0023,0.42,2113380.266,119223.7817,98.284197,1578.227187,3238.029588,0.96,0.92,0.94,0.94,0.96,0.96



Basic statistics of objectives data:


Unnamed: 0,PR2,PR5,PR10,PR20,TR,PHI,Temp,BGF,Inv,Maint,UNA,GA,CO2,TSS,TP,TN,Cu,Pb,Zn
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,0.338867,0.279,0.2832,0.3124,0.418667,0.389467,0.001907,0.337933,2087523.0,119473.435617,74.747369,1575.257126,2209.14305,0.6204,0.5596,0.5798,0.585067,0.620267,0.620933
std,0.147787,0.139128,0.136619,0.13848,0.127493,0.115937,0.000638,0.147936,976377.4,59170.85184,19.632477,744.20913,1476.004562,0.4665,0.488493,0.476312,0.47443,0.467257,0.466438
min,-0.02,-0.02,-0.01,-0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.56,-3.56,-3.56,-3.56,-3.56,-3.56
25%,0.24,0.2,0.2,0.22,0.33,0.32,0.0016,0.23,1388448.0,77056.584015,62.792727,1034.613167,1131.541054,0.4825,0.43,0.4325,0.44,0.48,0.4825
50%,0.33,0.27,0.28,0.32,0.43,0.39,0.002,0.315,2072482.0,118223.73065,73.26476,1554.07823,1856.002862,0.735,0.685,0.695,0.705,0.74,0.74
75%,0.44,0.36,0.37,0.4175,0.5075,0.475,0.0023,0.4375,2725578.0,158099.009875,88.749441,2064.411671,3099.275358,0.8975,0.84,0.8575,0.8675,0.89,0.9
max,0.81,0.72,0.71,0.69,0.72,0.64,0.0035,0.72,4054589.0,235240.6254,113.552311,3013.974665,5989.04749,1.0,1.0,1.0,1.0,1.0,1.0


Data for sim42 (the outlier):
CO2    368.124472
TSS     -3.560000
TP      -3.560000
TN      -3.560000
Cu      -3.560000
Pb      -3.560000
Zn      -3.560000
Name: sim42, dtype: float64

After capping - value for sim42 TSS: -0.5224
After capping - value for sim42 CO2: 368.1244718


### **Load and Prepare Input Parameters**

In [8]:
# Load your LID scenarios file
input_params_df = pd.read_csv(r'C:\Users\ABI\OneDrive - NIVA\Documents\GitHub\SWMM_MOO\01_Preprocessing\0103_Data_cleaned_random_generated_scenarios.csv', 
                             sep=';', 
                             index_col='sim')

print("Input parameters shape:", input_params_df.shape)
print("\nFirst 5 rows of input parameters:")
display(input_params_df.head())

# Extract only the columns we need (LID implementation areas)
# Exclude the '*_Aimp' columns and focus on the area columns
area_columns = [col for col in input_params_df.columns if not col.endswith('_Aimp') and col not in ['Type']]
print(f"\nLID Area columns to use: {area_columns}")

# Keep the 'Type' column and area columns, then convert categorical 'Type' to numerical
X_inputs = input_params_df[['Type'] + area_columns].copy()
X_inputs = pd.get_dummies(X_inputs, columns=['Type'], prefix='Type')

print("\nProcessed input features for regression:")
print("Shape:", X_inputs.shape)
display(X_inputs.head())

Input parameters shape: (150, 23)

First 5 rows of input parameters:


Unnamed: 0_level_0,Type,S1,S2,S3,S4,S5_Type,S5,S5_Aimp,S7_Type,S7,...,S8_Aimp,S9_Type,S9,S9_Aimp,S10_Type,S10,S10_Aimp,S11_Type,S11,S11_Aimp
sim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,GR20,0.0,0.0,0.0,0.0,GS,0.0,273.431835,GS,0.0,...,318.73243,TRE,0.0,322.042932,BC,0.0,306.671996,GS,0.0,214.960412
2,GR20,938.857898,0.0,505.979207,0.0,GS,35.546139,273.431835,BC,22.115649,...,318.73243,BC,35.424723,322.042932,BC,46.000799,306.671996,GS,15.047229,214.960412
3,GR5,0.0,0.0,0.0,921.110789,TRE,14.0,259.431835,GS,10.052568,...,318.73243,GS,22.543005,322.042932,TRE,21.0,285.671996,BC,25.795249,214.960412
4,GR5,0.0,0.0,505.979207,921.110789,GS,21.874547,273.431835,BC,6.031541,...,318.73243,GS,35.424723,322.042932,GS,39.867359,306.671996,BC,8.598416,214.960412
5,GR20,0.0,0.0,505.979207,921.110789,BC,32.81182,273.431835,GS,12.063081,...,283.73243,TRE,14.0,308.042932,BC,21.46704,306.671996,BC,25.795249,214.960412



LID Area columns to use: ['S1', 'S2', 'S3', 'S4', 'S5_Type', 'S5', 'S7_Type', 'S7', 'S8_Type', 'S8', 'S9_Type', 'S9', 'S10_Type', 'S10', 'S11_Type', 'S11']

Processed input features for regression:
Shape: (150, 18)


Unnamed: 0_level_0,S1,S2,S3,S4,S5_Type,S5,S7_Type,S7,S8_Type,S8,S9_Type,S9,S10_Type,S10,S11_Type,S11,Type_GR20,Type_GR5
sim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.0,0.0,0.0,0.0,GS,0.0,GS,0.0,GS,0.0,TRE,0.0,BC,0.0,GS,0.0,True,False
2,938.857898,0.0,505.979207,0.0,GS,35.546139,BC,22.115649,GS,15.936621,BC,35.424723,BC,46.000799,GS,15.047229,True,False
3,0.0,0.0,0.0,921.110789,TRE,14.0,GS,10.052568,GS,35.060567,GS,22.543005,TRE,21.0,BC,25.795249,False,True
4,0.0,0.0,505.979207,921.110789,GS,21.874547,BC,6.031541,GS,15.936621,GS,35.424723,GS,39.867359,BC,8.598416,False,True
5,0.0,0.0,505.979207,921.110789,BC,32.81182,GS,12.063081,TRE,35.0,TRE,14.0,BC,21.46704,BC,25.795249,True,False


### **Build Regression Models for the selected Objectives**