### **Import Packages & Load Data**

In [1]:
# Import Packages & Load Data
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load your data
# The CSV location can be overridden without editing the notebook by setting
# the SWMM_MOO_OBJECTIVES_CSV environment variable. When it is unset (or
# empty) the original local path below is used, so existing runs are unchanged.
DEFAULT_DATA_PATH = r'C:\Users\ABI\OneDrive - NIVA\Documents\GitHub\SWMM_MOO\10_Analysis\1002_PCA\1002_All_objectives.csv'
DATA_PATH = Path(os.environ.get('SWMM_MOO_OBJECTIVES_CSV') or DEFAULT_DATA_PATH)

# Fail early with a message that says how to point the notebook at the file.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Objectives CSV not found at '{DATA_PATH}'. "
        "Set the SWMM_MOO_OBJECTIVES_CSV environment variable to its location."
    )

# The file is ';'-separated; the 'sim' column becomes the row index.
df = pd.read_csv(DATA_PATH, sep=';', index_col='sim')
print("Data Shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

Data Shape: (150, 19)

First 5 rows:


Unnamed: 0_level_0,PR2,PR5,PR10,PR20,TR,PHI,Temp,BGF,Inv,Maint,UNA,GA,CO2,TSS,TP,TN,Cu,Pb,Zn
sim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
sim1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sim2,0.29,0.24,0.25,0.28,0.59,0.55,0.002,0.45,2275334.205,122996.9785,105.342624,1614.908265,3148.518124,0.83,0.79,0.8,0.81,0.83,0.83
sim3,0.4,0.29,0.25,0.36,0.36,0.34,0.0012,0.22,1386333.5,76818.38861,82.742087,1069.562179,895.855142,0.71,0.71,0.71,0.71,0.71,0.71
sim4,0.22,0.22,0.25,0.23,0.44,0.41,0.0016,0.24,2046027.988,117080.8556,73.216244,1554.823203,1050.302508,0.45,0.4,0.41,0.42,0.45,0.45
sim5,0.46,0.33,0.27,0.33,0.4,0.36,0.0023,0.42,2113380.266,119223.7817,98.284197,1578.227187,3238.029588,0.96,0.92,0.94,0.94,0.96,0.96


### **Data Cleaning & Inspection**

In [3]:
pollutant_columns = ['CO2', 'TSS', 'TP', 'TN', 'Cu', 'Pb', 'Zn']

# Print the raw values of run 'sim42' for the listed columns before capping.
print("Data for sim42 (the outlier):")
print(df.loc['sim42', pollutant_columns])

# Cap every listed column at its own 1st and 99th percentiles. The capping is
# applied to a copy, so `df` keeps the raw values.
df_clean = df.copy()
for col in pollutant_columns:
    # One quantile call returns both limits, in the order requested.
    lower_cap, upper_cap = df_clean[col].quantile([0.01, 0.99])
    df_clean[col] = df_clean[col].clip(lower=lower_cap, upper=upper_cap)

# Show two of sim42's values after capping.
print("\nCapped value for sim42 TSS:", df_clean.loc['sim42', 'TSS'])
print("Capped value for sim42 CO2:", df_clean.loc['sim42', 'CO2'])

Data for sim42 (the outlier):
CO2    368.124472
TSS     -3.560000
TP      -3.560000
TN      -3.560000
Cu      -3.560000
Pb      -3.560000
Zn      -3.560000
Name: sim42, dtype: float64

Capped value for sim42 TSS: -0.5224
Capped value for sim42 CO2: 368.1244718


### **Perform PCA on Selected Objectives**

In [6]:
# A correlation analysis was first carried out in Excel. Where two or more objectives were highly correlated, only one of them was kept.
# Seven objectives that are not highly correlated with one another were then selected.

In [7]:
selected_objectives = ['PR10', 'TR', 'BGF', 'Inv', 'UNA', 'CO2', 'TSS']
print(f"\nPerforming PCA on selected objectives: {selected_objectives}")

# Standardise the selected columns of the capped data before PCA.
X_selected = df_clean[selected_objectives].copy()
scaler = StandardScaler().fit(X_selected)
X_scaled = scaler.transform(X_selected)

# Fit the PCA on the standardised data, then project the same data onto the
# fitted components (fit and transform are kept as two separate calls).
pca = PCA().fit(X_scaled)
X_pca = pca.transform(X_scaled)

# Loadings table: `components_` is transposed so that each row is an
# objective and each column is a principal component.
pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
loadings_df = pd.DataFrame(
    data=pca.components_.T,
    index=selected_objectives,
    columns=pc_labels,
)

explained_variance = pd.Series(pca.explained_variance_ratio_, index=pc_labels)
print("\nExplained Variance Ratio for each PC:")
print(explained_variance)
print("\nCumulative Explained Variance:")
print(pca.explained_variance_ratio_.cumsum())

print("\n" + "="*60)
print("PCA LOADINGS TABLE")
print("="*60)
loadings_df


Performing PCA on selected objectives: ['PR10', 'TR', 'BGF', 'Inv', 'UNA', 'CO2', 'TSS']

Explained Variance Ratio for each PC:
PC1    0.427281
PC2    0.163781
PC3    0.140876
PC4    0.127334
PC5    0.096525
PC6    0.043071
PC7    0.001131
dtype: float64

Cumulative Explained Variance:
[0.4272813  0.59106263 0.73193893 0.8592726  0.95579779 0.99886915
 1.        ]

PCA LOADINGS TABLE


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7
PR10,0.183428,-0.056438,0.666632,0.705539,-0.141652,0.027348,-0.013122
TR,0.109408,0.42635,0.647723,-0.617349,-0.04568,0.05622,0.01853
BGF,0.563333,-0.171989,-0.066815,-0.074543,0.021906,-0.08566,0.797018
Inv,0.506898,-0.18236,-0.09146,-0.079,0.082709,0.759779,-0.333299
UNA,0.274073,0.562679,-0.123918,0.237442,0.708722,-0.157687,-0.096901
CO2,0.505515,-0.267677,-0.004284,-0.161077,-0.135285,-0.620514,-0.493457
TSS,0.222336,0.60328,-0.328411,0.164234,-0.670774,0.039824,-0.01642
