In [1]:
import numpy as np
import pandas as pd

# Load data
df = pd.read_excel("Data_PLS.xlsx")

# Drop empty Excel artifact columns
# df = df.loc[:, ~df.columns.str.contains("^Unnamed")]

# Sanity check
print(df.shape)
print(df.columns.tolist())


(392, 28)
['OAC1', 'OAC2', 'OAC3', 'OAC4', 'DIR1', 'DIR2', 'DIR3', 'DIR4', 'RAI1', 'RAI2', 'RAI3', 'RAI4', 'PVO1', 'PVO2', 'PVO3', 'PVO4', 'IP1', 'IP2', 'IP3', 'IP4', 'EST1', 'EST2', 'EST3', 'EST4', 'AIGR1', 'AIGR2', 'AIGR3', 'AIGR4']


In [2]:
constructs = {
    "OAC": ["OAC1", "OAC2", "OAC3","OAC4"],
    "DIR": ["DIR1", "DIR2", "DIR3","DIR4"],
    "RAI": ["RAI1", "RAI2", "RAI3", "RAI4"],
    "PVO": ["PVO1", "PVO2", "PVO3","PVO4"],
    "EST": ["EST1", "EST2", "EST3","EST4"],
    "AIGR": ["AIGR1", "AIGR2", "AIGR3","AIGR4"],
}

all_indicators = [ind for inds in constructs.values() for ind in inds]


In [3]:
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [4]:
models = {
    "LinearRegression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", LinearRegression())
    ]),

    "RidgeRegression": Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=1.0))
    ]),

    "RandomForest": RandomForestRegressor(
        n_estimators=300,
        random_state=42
    ),

    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300,
        random_state=42
    )
}


In [5]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)



In [6]:
target_construct = "AIGR"  

In [7]:


results = []

valid_columns = set(df.columns)

for construct, indicators in constructs.items():

    # ðŸ”´ Skip all other constructs
    if construct != target_construct:
        continue

    # Inputs = indicators of all OTHER constructs
    input_features = [
        ind for ind in all_indicators
        if ind not in indicators and ind in valid_columns
    ]

    if len(input_features) == 0:
        raise ValueError(f"No input features found for construct {construct}")

    X = df[input_features]

    for target in indicators:

        if target not in valid_columns:
            continue

        y = df[target]

        for model_name, model in models.items():

            y_pred = cross_val_predict(model, X, y, cv=kf)

            # --- METRICS ---
            rmse = np.sqrt(mean_squared_error(y, y_pred))
            mae = mean_absolute_error(y, y_pred)
            r2 = r2_score(y, y_pred)

            # QÂ²predict (mean baseline)
            q2 = 1 - np.sum((y - y_pred) ** 2) / np.sum((y - y.mean()) ** 2)

            results.append({
                "Construct": construct,
                "Indicator": target,
                "Model": model_name,
                "RMSE": rmse,
                "MAE": mae,
                "R2": r2,
                "Q2": q2
            })


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 

In [8]:
# Convert results list to DataFrame
results_df = pd.DataFrame(results)

# Sanity check
print("Number of rows:", results_df.shape[0])
print(results_df.head())

# Save to CSV
results_df.to_csv(
    "Results/PLSpredict_Aligned_ML_Validation.csv",
    index=False
)

print("CSV file saved successfully.")


Number of rows: 16
  Construct Indicator             Model      RMSE       MAE        R2  \
0      AIGR     AIGR1  LinearRegression  0.753696  0.615593  0.353063   
1      AIGR     AIGR1   RidgeRegression  0.753267  0.615181  0.353799   
2      AIGR     AIGR1      RandomForest  0.794028  0.648376  0.281972   
3      AIGR     AIGR1  GradientBoosting  0.827010  0.673345  0.221082   
4      AIGR     AIGR2  LinearRegression  0.739346  0.600341  0.370176   

         Q2  
0  0.353063  
1  0.353799  
2  0.281972  
3  0.221082  
4  0.370176  
CSV file saved successfully.


In [9]:
import pandas as pd

ml_df = pd.read_csv("Results/PLSpredict_Aligned_ML_Validation.csv")

print(ml_df.shape)   # should be (56, 7)
ml_df.head()


(16, 7)


Unnamed: 0,Construct,Indicator,Model,RMSE,MAE,R2,Q2
0,AIGR,AIGR1,LinearRegression,0.753696,0.615593,0.353063,0.353063
1,AIGR,AIGR1,RidgeRegression,0.753267,0.615181,0.353799,0.353799
2,AIGR,AIGR1,RandomForest,0.794028,0.648376,0.281972,0.281972
3,AIGR,AIGR1,GradientBoosting,0.82701,0.673345,0.221082,0.221082
4,AIGR,AIGR2,LinearRegression,0.739346,0.600341,0.370176,0.370176


In [14]:
#select best model for each item
best_ml = (
    ml_df
    .sort_values("RMSE")
    .groupby(["Construct", "Indicator"], as_index=False)
    .first()
)

best_ml


# Save to CSV
best_ml.to_csv(
    "Results/best_models_per_indicator.csv",
    index=False
)



In [11]:
import os
from joblib import dump

# Create folder if it does not exist
MODEL_DIR = "Models"
os.makedirs(MODEL_DIR, exist_ok=True)

saved_models = {}

for _, row in best_ml.iterrows():

    construct = row["Construct"]
    indicator = row["Indicator"]
    model_name = row["Model"]

    # Skip missing ML cases (e.g., PBUG1, PBUG2)
    if pd.isna(model_name):
        print(f"Skipping {indicator} (no valid ML model)")
        continue

    # Define input features (same logic as training)
    input_features = [
        ind for ind in all_indicators
        if ind not in constructs[construct] and ind in df.columns
    ]

    X = df[input_features]
    y = df[indicator]

    # Get model template
    model = models[model_name]

    # Refit on FULL data
    model.fit(X, y)

    # File path (inside Models/)
    filename = os.path.join(
        MODEL_DIR,
        f"ML_BestModel_{construct}_{indicator}_{model_name}.joblib"
    )

    # Save model
    dump(model, filename)

    # Store metadata for later use
    saved_models[indicator] = {
        "construct": construct,
        "model_name": model_name,
        "features": input_features,
        "file": filename
    }

    print(f"Saved: {filename}")


Saved: Models/ML_BestModel_AIGR_AIGR1_RidgeRegression.joblib
Saved: Models/ML_BestModel_AIGR_AIGR2_RidgeRegression.joblib
Saved: Models/ML_BestModel_AIGR_AIGR3_RidgeRegression.joblib
Saved: Models/ML_BestModel_AIGR_AIGR4_RidgeRegression.joblib


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [15]:
# PLSpredict table (AIGR construct)
plspredict = pd.DataFrame({
    "Construct": ["AIGR", "AIGR", "AIGR", "AIGR"],
    "Indicator": ["AIGR1", "AIGR2", "AIGR3", "AIGR4"],
    "Q2_predict": [0.395, 0.388, 0.401, 0.376],
    "PLS_SEM_RMSE": [0.731, 0.731, 0.758, 0.739],
    "PLS_SEM_MAE": [0.592, 0.592, 0.609, 0.608]
})


In [17]:
#merge with ml
comparison = plspredict.merge(
    best_ml,
    on=["Construct", "Indicator"],
    how="left"
)

comparison = comparison.rename(columns={
    "Model": "Best_ML_Model",
    "RMSE": "ML_RMSE",
    "MAE": "ML_MAE",
    "Q2": "ML_Q2",
    "R2": "ML_R2"
})

comparison


Unnamed: 0,Construct,Indicator,Q2_predict,PLS_SEM_RMSE,PLS_SEM_MAE,Best_ML_Model,ML_RMSE,ML_MAE,ML_R2,ML_Q2
0,AIGR,AIGR1,0.395,0.731,0.592,RidgeRegression,0.753267,0.615181,0.353799,0.353799
1,AIGR,AIGR2,0.388,0.731,0.592,RidgeRegression,0.738954,0.600093,0.370844,0.370844
2,AIGR,AIGR3,0.401,0.758,0.609,RidgeRegression,0.770809,0.628146,0.377696,0.377696
3,AIGR,AIGR4,0.376,0.739,0.608,RidgeRegression,0.748547,0.617413,0.355451,0.355451


In [19]:
comparison["RMSE_Delta"] = comparison["PLS_SEM_RMSE"] - comparison["ML_RMSE"]
comparison["Q2_Delta"] = comparison["ML_Q2"] - comparison["Q2_predict"]

comparison


Unnamed: 0,Construct,Indicator,Q2_predict,PLS_SEM_RMSE,PLS_SEM_MAE,Best_ML_Model,ML_RMSE,ML_MAE,ML_R2,ML_Q2,RMSE_Delta,Q2_Delta
0,AIGR,AIGR1,0.395,0.731,0.592,RidgeRegression,0.753267,0.615181,0.353799,0.353799,-0.022267,-0.041201
1,AIGR,AIGR2,0.388,0.731,0.592,RidgeRegression,0.738954,0.600093,0.370844,0.370844,-0.007954,-0.017156
2,AIGR,AIGR3,0.401,0.758,0.609,RidgeRegression,0.770809,0.628146,0.377696,0.377696,-0.012809,-0.023304
3,AIGR,AIGR4,0.376,0.739,0.608,RidgeRegression,0.748547,0.617413,0.355451,0.355451,-0.009547,-0.020549


In [20]:
comparison.to_csv(
    "Results/PLSpredict_vs_ML_BestModel_Comparison.csv",
    index=False
)


In [29]:
def compute_ai_governance_readiness(
    df,
    items=("AIGR1", "AIGR2", "AIGR3", "AIGR4"),
    method="mean",
    new_col="AI_Governance_Readiness_score",
    pct_col="AI_Governance_Readiness_Pct",
    scale_min=1,
    scale_max=5
):
    """
    Compute the AI Governance Vision (AIGV) score and its percentage version.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the VUTAI items.
    items : tuple or list
        Column names corresponding to VUTAI indicators.
    method : str
        Aggregation method: 'mean' or 'sum'.
    new_col : str
        Name of the column for the raw vision score.
    pct_col : str
        Name of the column for the percentage vision score.
    scale_min : int or float
        Minimum value of the Likert scale.
    scale_max : int or float
        Maximum value of the Likert scale.

    Returns
    -------
    pandas.DataFrame
        DataFrame with both vision score columns added.
    """

    # Safety checks
    missing = [col for col in items if col not in df.columns]
    if missing:
        raise ValueError(f"Missing AIGR items in DataFrame: {missing}")

    if method == "mean":
        score = df[list(items)].mean(axis=1)
    elif method == "sum":
        score = df[list(items)].sum(axis=1)
    else:
        raise ValueError("method must be either 'mean' or 'sum'")

    # Raw score
    df[new_col] = score

    # Percentage transformation (scale-aware)
    df[pct_col] = ((score - scale_min) / (scale_max - scale_min)) * 100

    return df


In [30]:
readiness = compute_ai_governance_readiness(df)

readiness[[
    "AIGR1", "AIGR2", "AIGR3","AIGR4",
    "AI_Governance_Readiness_score",
    "AI_Governance_Readiness_Pct"
]]


Unnamed: 0,AIGR1,AIGR2,AIGR3,AIGR4,AI_Governance_Readiness_score,AI_Governance_Readiness_Pct
0,4,4,3,4,3.75,68.75
1,4,4,3,4,3.75,68.75
2,5,5,5,5,5.00,100.00
3,4,4,5,4,4.25,81.25
4,3,4,3,3,3.25,56.25
...,...,...,...,...,...,...
387,5,5,5,5,5.00,100.00
388,4,5,5,5,4.75,93.75
389,4,4,4,4,4.00,75.00
390,4,4,3,4,3.75,68.75


In [31]:
# Save to CSV
readiness.to_csv(
    "Results/readiness.csv",
    index=False
)

In [32]:
# University-level readiness (mean across all respondents)
university_readiness = readiness[[
    "AI_Governance_Readiness_score",
    "AI_Governance_Readiness_Pct"
]].mean()

university_readiness

AI_Governance_Readiness_score     3.817602
AI_Governance_Readiness_Pct      70.440051
dtype: float64