In [4]:
# =============================================================================
# Section 1: Import Libraries
# =============================================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from joblib import dump
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Set plotting style for better visuals
sns.set_style("whitegrid")
print("Libraries imported successfully.")


# =============================================================================
# Section 2: Data Loading, Cleaning, and Merging for Structural Data
# =============================================================================
print("\n--- Section 2: Loading, Cleaning & Merging Structural Data ---")

def clean_dataframe(df):
    """A reusable function to perform initial cleaning on a dataframe."""
    df.rename(columns={'Unnamed: 0': 'Project_Name'}, inplace=True)
    df.drop(columns=['Structural aspect'], inplace=True, errors='ignore')
    df['Project_Name'] = df['Project_Name'].str.strip().str.lower()
    
    # Updated feature columns for structural data
    feature_cols = [
        'Gross floor area (Cement floor finish)', 'Gross floor area (unglazed tiles)',
        'Grade 40', 'Grade 60', 'Volume of structural concrete (cu.m.) columns CLASS A 28 DAYS',
        'Volume of structural concrete (cu.m.) suspended slab CLASS A 28 DAYS',
        'Volume of structural concrete (cu.m.) beams/girders CLASS A 28 DAYS',
        'Area of formworks (sq.m.)'
    ]
    
    for col in feature_cols:
        if col in df.columns:
            # Handle potential non-numeric entries before converting
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Impute missing values with the median after cleaning
    for col in df.columns:
        if df[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                median_val = df[col].median()
                df[col].fillna(median_val, inplace=True)
    return df

def extract_budget(text):
    """Extracts the budget value from the 'Year/Budget' string."""
    if isinstance(text, str):
        # Updated regex to be more robust
        matches = re.findall(r'[\d,]+\.?\d*', text)
        if matches:
            # Take the last match as it's typically the budget
            return float(matches[-1].replace(',', ''))
    return None

try:
    # Load the new STRUCTURAL datasets
    df_quantity = pd.read_csv('Stuctural Quantity Cost.csv')
    print("Structural Quantity data loaded.")
    df_unit_cost = pd.read_csv('Stuctural Unit Cost.csv')
    print("Structural Unit Cost data loaded.")
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure both 'Stuctural' CSV files are in the correct directory.")
    exit()

# --- Clean and Merge Dataframes ---
df_quantity_cleaned = clean_dataframe(df_quantity.copy())
df_unit_cost_cleaned = clean_dataframe(df_unit_cost.copy())

# Rename the second column which contains the budget string
df_quantity_cleaned.rename(columns={df_quantity_cleaned.columns[1]: 'Year/Budget'}, inplace=True)

df_quantity_cleaned['Budget'] = df_quantity_cleaned['Year/Budget'].apply(extract_budget)
df_quantity_cleaned.drop(columns=['Year/Budget'], inplace=True)

# Drop non-numeric columns from unit cost that are not needed for merging
cols_to_drop_unit_cost = [col for col in df_unit_cost_cleaned.columns if col not in df_quantity_cleaned.columns and col != 'Project_Name']
df_unit_cost_cleaned.drop(columns=cols_to_drop_unit_cost, inplace=True, errors='ignore')


df_merged = pd.merge(
    df_quantity_cleaned,
    df_unit_cost_cleaned,
    on='Project_Name',
    suffixes=('_qty', '_cost')
)
df_merged = df_merged.dropna(subset=['Budget'])
df_merged = df_merged[df_merged['Budget'] > 100000].copy()
print(f"Dataframes merged. Working with {len(df_merged)} common projects.")


# =============================================================================
# Section 3: **UPDATED** - Granular Feature Engineering & Visualization for Structural Data
# =============================================================================
print("\n--- Section 3: Engineering Granular Features & Analysis ---")

# --- Step 1: Multiply Quantity by Unit Cost for Granular Features ---
individual_cost_features = []
base_feature_cols = [
    'Gross floor area (Cement floor finish)', 'Gross floor area (unglazed tiles)',
    'Grade 40', 'Grade 60', 'Volume of structural concrete (cu.m.) columns CLASS A 28 DAYS',
    'Volume of structural concrete (cu.m.) suspended slab CLASS A 28 DAYS',
    'Volume of structural concrete (cu.m.) beams/girders CLASS A 28 DAYS',
    'Area of formworks (sq.m.)'
]

name_mapper = {
    'Gross floor area (Cement floor finish)': 'Floor_Finish_Est_Cost',
    'Gross floor area (unglazed tiles)': 'Unglazed_Tiles_Est_Cost',
    'Grade 40': 'Grade_40_Steel_Est_Cost',
    'Grade 60': 'Grade_60_Steel_Est_Cost',
    'Volume of structural concrete (cu.m.) columns CLASS A 28 DAYS': 'Concrete_Columns_Est_Cost',
    'Volume of structural concrete (cu.m.) suspended slab CLASS A 28 DAYS': 'Concrete_Slab_Est_Cost',
    'Volume of structural concrete (cu.m.) beams/girders CLASS A 28 DAYS': 'Concrete_Beams_Est_Cost',
    'Area of formworks (sq.m.)': 'Formworks_Est_Cost'
}

for col in base_feature_cols:
    qty_col = col + '_qty'
    cost_col = col + '_cost'
    new_cost_feature = name_mapper[col]
    
    if qty_col in df_merged and cost_col in df_merged:
        df_merged[new_cost_feature] = df_merged[qty_col] * df_merged[cost_col]
        individual_cost_features.append(new_cost_feature)

print("Created new granular features for each structural component.")

# --- Step 2: Combine Highly Correlated Features ---
df_merged['Total_Concrete_Est_Cost'] = (df_merged['Concrete_Columns_Est_Cost'] +
                                      df_merged['Concrete_Slab_Est_Cost'] +
                                      df_merged['Concrete_Beams_Est_Cost'])
features_to_remove = ['Concrete_Columns_Est_Cost', 'Concrete_Slab_Est_Cost', 'Concrete_Beams_Est_Cost']
final_granular_features = [f for f in individual_cost_features if f not in features_to_remove]
final_granular_features.append('Total_Concrete_Est_Cost')
print("Combined concrete features into 'Total_Concrete_Est_Cost'.")

# --- Step 3: Create Contextual and Target Features ---
df_merged['Num_Storeys'] = df_merged['Project_Name'].str.extract(r'(\d+)\s*sty').astype(float)
df_merged['Num_Classrooms'] = df_merged['Project_Name'].str.extract(r'(\d+)\s*cl').astype(float)
df_merged['Num_Storeys'].fillna(df_merged['Num_Storeys'].median(), inplace=True)
df_merged['Num_Classrooms'].fillna(df_merged['Num_Classrooms'].median(), inplace=True)
df_merged['Budget_log'] = np.log1p(df_merged['Budget']) # Log-transform target

# --- Step 4: Visualization and Analysis ---
print("\n--- Generating and Saving Visualizations for Analysis ---")

# Create the directory for images if it doesn't exist
output_dir = 'visualization images'
os.makedirs(output_dir, exist_ok=True)

# 4.1: Justifying Log-Transformation (Section 1.1 of Report)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.histplot(df_merged['Budget'], kde=True, ax=axes[0], bins=30)
axes[0].set_title('Distribution of Original Budget (Right-Skewed)')
axes[0].set_xlabel('Budget (PHP)')
axes[0].ticklabel_format(style='plain', axis='x')
sns.histplot(df_merged['Budget_log'], kde=True, ax=axes[1], color='green', bins=30)
axes[1].set_title('Distribution of Log-Transformed Budget (Normalized)')
axes[1].set_xlabel('Log(1 + Budget)')
plt.suptitle('Effect of Log-Transformation on Target Variable', size=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig(os.path.join(output_dir, 'log_transformation_effect.png'), bbox_inches='tight')
plt.close(fig) # Close the figure to free memory
print(f"Saved: {os.path.join(output_dir, 'log_transformation_effect.png')}")

# 4.2: Correlation Matrix of Final Engineered Features
plt.figure(figsize=(14, 12))
heatmap_cols = final_granular_features + ['Num_Storeys', 'Num_Classrooms', 'Budget']
correlation_matrix = df_merged[heatmap_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='viridis', fmt='.2f')
plt.title('Correlation Matrix of Final Engineered Features', size=16)
plt.savefig(os.path.join(output_dir, 'structural_features_correlation_matrix.png'), bbox_inches='tight')
plt.close() # Close the current figure
print(f"Saved: {os.path.join(output_dir, 'structural_features_correlation_matrix.png')}")

# 4.3: Justifying the Need for Standardization (Section 1.3 of Report)
final_feature_columns = final_granular_features + ['Num_Storeys', 'Num_Classrooms']
X_for_viz = df_merged[final_feature_columns].copy()
X_for_viz.fillna(X_for_viz.median(), inplace=True)

plt.figure(figsize=(10, 8))
sns.boxplot(data=X_for_viz, orient='h')
plt.title('Feature Scales Before Standardization')
plt.xlabel('Original Feature Values (Varying Scales)')
plt.xscale('log') # Use log scale to handle the wide range of values
plt.savefig(os.path.join(output_dir, 'feature_scales_before_standardization.png'), bbox_inches='tight')
plt.close() # Close the current figure
print(f"Saved: {os.path.join(output_dir, 'feature_scales_before_standardization.png')}")


# =============================================================================
# Section 4: **UPDATED** - Data Preparation for the ANN Model
# =============================================================================
print("\n--- Section 4: Preparing Data for the ANN Model ---")

# --- Define the features (X) and the target (y) ---
# The feature set now consists of the individual estimated costs for each structural component
final_feature_columns = final_granular_features + ['Num_Storeys', 'Num_Classrooms']
X = df_merged[final_feature_columns]
y = df_merged[['Budget_log']]

# Fill any potential NaN values in the final feature set just in case
X.fillna(X.median(), inplace=True)


print(f"\nTraining model with {X.shape[1]} input features.")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Feature and Target Scaling ---
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

# --- Convert to PyTorch Tensors ---
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_scaled, dtype=torch.float32)

print("Data has been split, scaled, and converted to tensors.")


# =============================================================================
# Section 5: Build and Train the Artificial Neural Network (Architecture Unchanged)
# =============================================================================
print("\n--- Section 5: Building and Training the ANN ---")

class RegressionNet(nn.Module):
    def __init__(self, input_features):
        super(RegressionNet, self).__init__()
        self.layer1 = nn.Linear(input_features, 128)
        self.dropout1 = nn.Dropout(0.3)
        self.layer2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.2)
        self.layer3 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = self.dropout1(x)
        x = F.relu(self.layer2(x))
        x = self.dropout2(x)
        x = F.relu(self.layer3(x))
        x = torch.sigmoid(self.output_layer(x))
        return x

# --- Model Initialization and Training Setup ---
# The input_size is now automatically determined by the number of structural features.
input_size = X_train_tensor.shape[1]
model = RegressionNet(input_features=input_size)
print("Model Architecture:")
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
batch_size = 16
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# --- Training Loop ---
epochs = 200
train_losses = []
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for features, targets in train_loader:
        outputs = model(features)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_loss)
    if (epoch + 1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.6f}')

print("Neural Network training complete.")

# --- Plot and Save Training Loss ---
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss')
plt.title('Structural Model Training Loss Over Epochs', size=16)
plt.xlabel('Epoch', size=12)
plt.ylabel('Mean Squared Error Loss', size=12)
plt.legend()
plt.savefig(os.path.join(output_dir, 'model_training_loss_curve.png'), bbox_inches='tight')
plt.close()
print(f"Saved: {os.path.join(output_dir, 'model_training_loss_curve.png')}")


# =============================================================================
# Section 6: Model Evaluation
# =============================================================================
print("\n--- Section 6: Evaluating the ANN Model (with Structural Features) ---")

model.eval()
with torch.no_grad():
    scaled_predictions = model(X_test_tensor).numpy()

# --- Inverse Transform Predictions to Original Scale ---
log_predictions = scaler_y.inverse_transform(scaled_predictions)
final_predictions = np.expm1(log_predictions).flatten()
y_test_actual = np.expm1(y_test.values).flatten()

# --- Calculate and Display Performance Metrics ---
r2_ann = r2_score(y_test_actual, final_predictions)
mae_ann = mean_absolute_error(y_test_actual, final_predictions)
rmse_ann = np.sqrt(mean_squared_error(y_test_actual, final_predictions))

print("\n--- Final Model Performance ---")
print(f"R-squared (R²): {r2_ann:.4f}")
print(f"Mean Absolute Error (MAE): ₱{mae_ann:,.2f}")
print(f"Root Mean Squared Error (RMSE): ₱{rmse_ann:,.2f}")

# --- Visualization: Actual vs. Predicted Budget ---
plt.figure(figsize=(10, 7))
plt.scatter(y_test_actual, final_predictions, alpha=0.6, edgecolors='w', label='Predictions')
plt.plot([y_test_actual.min(), y_test_actual.max()], [y_test_actual.min(), y_test_actual.max()],
         'r--', lw=2, label='Perfect Fit')
plt.title('Actual vs. Predicted Project Budget (Structural Model)', size=16)
plt.xlabel('Actual Budget (PHP)', size=12)
plt.ylabel('Predicted Budget (PHP)', size=12)
plt.ticklabel_format(style='plain', axis='both')
plt.legend()
plt.savefig(os.path.join(output_dir, 'actual_vs_predicted_budget.png'), bbox_inches='tight')
plt.close()
print(f"Saved: {os.path.join(output_dir, 'actual_vs_predicted_budget.png')}")


# =============================================================================
# Section 7: Save Final Assets
# =============================================================================
print("\n--- Section 7: Saving Final Model and Scalers ---")

# Save the model and scalers with new names to distinguish them from the architectural model
torch.save(model.state_dict(), 'ann_structural_model.pth')
print("ANN model saved as 'ann_structural_model.pth'")
dump(scaler_X, 'scaler_X_structural.joblib')
dump(scaler_y, 'scaler_y_structural.joblib')
print("Feature and target scalers saved.")

print("\nProcess finished successfully.")

Libraries imported successfully.

--- Section 2: Loading, Cleaning & Merging Structural Data ---
Structural Quantity data loaded.
Structural Unit Cost data loaded.
Dataframes merged. Working with 131 common projects.

--- Section 3: Engineering Granular Features & Analysis ---
Created new granular features for each structural component.
Combined concrete features into 'Total_Concrete_Est_Cost'.

--- Generating and Saving Visualizations for Analysis ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

Saved: visualization images\log_transformation_effect.png
Saved: visualization images\structural_features_correlation_matrix.png
Saved: visualization images\feature_scales_before_standardization.png

--- Section 4: Preparing Data for the ANN Model ---

Training model with 8 input features.
Data has been split, scaled, and converted to tensors.

--- Section 5: Building and Training the ANN ---
Model Architecture:
RegressionNet(
  (layer1): Linear(in_features=8, out_features=128, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
  (layer2): Linear(in_features=128, out_features=64, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (layer3): Linear(in_features=64, out_features=32, bias=True)
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
)
Epoch [20/200], Loss: 0.007470


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace=True)


Epoch [40/200], Loss: 0.005100
Epoch [60/200], Loss: 0.004460
Epoch [80/200], Loss: 0.004456
Epoch [100/200], Loss: 0.003500
Epoch [120/200], Loss: 0.003560
Epoch [140/200], Loss: 0.003123
Epoch [160/200], Loss: 0.002921
Epoch [180/200], Loss: 0.002554
Epoch [200/200], Loss: 0.003863
Neural Network training complete.
Saved: visualization images\model_training_loss_curve.png

--- Section 6: Evaluating the ANN Model (with Structural Features) ---

--- Final Model Performance ---
R-squared (R²): 0.7759
Mean Absolute Error (MAE): ₱3,366,374.26
Root Mean Squared Error (RMSE): ₱6,136,331.24
Saved: visualization images\actual_vs_predicted_budget.png

--- Section 7: Saving Final Model and Scalers ---
ANN model saved as 'ann_structural_model.pth'
Feature and target scalers saved.

Process finished successfully.
