# Phase 1: Establishing the PIML Baseline
## Ali4Concrete Nexus Framework

**Objective:**
To prepare the global baseline data (the UCI concrete dataset), on which the foundation model is built, for Transfer Learning. This notebook focuses on **Data Hygiene** and **Physics-Based Feature Engineering** so that the model is steered toward physically meaningful (cause-effect) relationships rather than purely statistical correlations.

**The Physics Constraints:**
We are testing the data against **Abrams' Law**:
$$f_c = \frac{A}{B^{w/c}}$$
Where $A$ and $B$ are empirical constants (with $B > 1$), so we expect strength to decrease strictly monotonically as $w/c$ increases.

In [8]:
import pandas as pd
import numpy as np

# Display configuration: lift pandas' column truncation so every column is
# shown when a DataFrame is rendered (only max_columns is changed here; the
# row display limit is left at its default)
pd.set_option('display.max_columns', None)

print("Libraries loaded successfully!")

Libraries loaded successfully!


In [9]:
def load_and_inspect_data(filepath: str) -> pd.DataFrame:
    """
    Loads the concrete dataset and prints an initial inspection report.

    The report consists of the first five rows, the column dtypes with
    non-null counts, and the descriptive statistics of every column.
    Tables are rendered through ``display``, which is not defined in this
    file (presumably the built-in provided by the IPython/Jupyter runtime).

    Args:
        filepath (str): The path to the CSV file.

    Returns:
        pd.DataFrame: The loaded raw dataframe, exactly as read from disk.
    """
    df = pd.read_csv(filepath)

    # Visual inspection of the first 5 rows (always look at the data first)
    print("--- First 5 Rows ---")
    display(df.head())

    # Structural inspection (data types & non-null counts).
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    print("\n--- Dataset Info ---")
    df.info()

    # Statistical inspection (e.g. spot physically impossible negative values)
    print("\n--- Statistical Summary ---")
    display(df.describe())

    return df

# Load the raw CSV (relative path, resolved against the working directory)
# and print the inspection report
df_raw = load_and_inspect_data('concrete_data.csv')

--- First 5 Rows ---


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3



--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB
None

--- Statistical Summary ---


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [10]:
# --- Step 2 & 3: Data Cleaning and Feature Engineering ---

def preprocess_concrete_data(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Performs data cleaning (renaming, de-duplication) and
    feature engineering (physics-based ratios).

    Three columns are appended to the cleaned data: 'w_c_ratio'
    (water / cement), 'binder' (cement + slag + fly_ash) and 'w_b_ratio'
    (water / binder). A warning is printed when a ratio's denominator is
    zero, because such rows end up with inf/NaN ratios.

    Args:
        df_raw (pd.DataFrame): The raw dataframe loaded from CSV. It is not
            modified; all work happens on renamed copies.

    Returns:
        pd.DataFrame: The cleaned and engineered dataframe.

    Raises:
        KeyError: If, after renaming, a column needed for the ratios
            ('water', 'cement', 'slag', 'fly_ash') is missing.
    """
    # 1. Standardize Column Names
    # Mapping raw CSV names to Pythonic snake_case conventions
    column_mapping = {
        'Cement': 'cement',
        'Blast Furnace Slag': 'slag',
        'Fly Ash': 'fly_ash',
        'Water': 'water',
        'Superplasticizer': 'superplasticizer',
        'Coarse Aggregate': 'coarse_agg',
        'Fine Aggregate': 'fine_agg',
        'Age': 'age',
        'Strength': 'strength',
    }
    df_clean = df_raw.rename(columns=column_mapping)

    # 2. Remove Duplicates
    # Rows identical in every column are treated as repeated entries
    initial_count = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    dropped_count = initial_count - len(df_clean)

    print("Data Cleaning Report:")
    print("- Columns Renamed: Yes")
    print(f"- Duplicates Dropped: {dropped_count} samples")

    # 3. Feature Engineering (Physics-Informed)
    df_eng = df_clean.copy()

    # A. Water-to-Cement Ratio (w/c) - the quantity Abrams' Law is stated in
    df_eng['w_c_ratio'] = df_eng['water'] / df_eng['cement']

    # B. Water-to-Binder Ratio (w/b) - accounts for SCMs (Slag/Fly Ash)
    # Binder = Cement + Slag + Fly Ash
    df_eng['binder'] = df_eng['cement'] + df_eng['slag'] + df_eng['fly_ash']
    df_eng['w_b_ratio'] = df_eng['water'] / df_eng['binder']

    # Sanity check: a zero denominator produces inf (or NaN for 0/0) without
    # raising, so flag it explicitly instead of passing it on silently.
    for label, denominator in (('w/c', 'cement'), ('w/b', 'binder')):
        if (df_eng[denominator] == 0).any():
            print(f"Warning: Division by zero detected in {label} ratio!")

    print("Feature Engineering Report:")
    print("- New Physics Features: ['w_c_ratio', 'w_b_ratio']")
    print(f"- Final Dataset Shape: {df_eng.shape}")

    return df_eng

# Execute the pipeline
# Note: df_raw is reloaded from disk to ensure a fresh start; this call also
# re-prints the full inspection report from load_and_inspect_data.
df_raw = load_and_inspect_data('concrete_data.csv')
df = preprocess_concrete_data(df_raw)

# Preview the engineered ratios next to their inputs and the target
display(df[['cement', 'slag', 'water', 'w_c_ratio', 'w_b_ratio', 'strength']].head())

--- First 5 Rows ---


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3



--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB
None

--- Statistical Summary ---


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


Data Cleaning Report:
- Columns Renamed: Yes
- Duplicates Dropped: 25 samples
Feature Engineering Report:
- New Physics Features: ['w_c_ratio', 'w_b_ratio']
- Final Dataset Shape: (1005, 12)


Unnamed: 0,cement,slag,water,w_c_ratio,w_b_ratio,strength
0,540.0,0.0,162.0,0.3,0.3,79.99
1,540.0,0.0,162.0,0.3,0.3,61.89
2,332.5,142.5,228.0,0.685714,0.48,40.27
3,332.5,142.5,228.0,0.685714,0.48,41.05
4,198.6,132.4,192.0,0.966767,0.58006,44.3


In [11]:
# --- Step 3: Feature Engineering (Physics-Informed) ---

def add_engineering_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates new physics-based features (ratios) to help the model learn
    Abrams' Law and other concrete behaviors.

    Adds (or overwrites) 'w_c_ratio' (water / cement), 'binder'
    (cement + slag + fly_ash) and 'w_b_ratio' (water / binder) on a copy of
    the input, and prints a warning for each ratio whose denominator is zero.

    Args:
        df (pd.DataFrame): Data with 'water', 'cement', 'slag' and 'fly_ash'
            columns. It is not modified.

    Returns:
        pd.DataFrame: A copy of ``df`` carrying the three feature columns.

    Raises:
        KeyError: If one of the required input columns is missing.
    """
    df_eng = df.copy()

    # 1. Water-to-Cement Ratio (Traditional Abrams' Law)
    df_eng['w_c_ratio'] = df_eng['water'] / df_eng['cement']

    # 2. Water-to-Binder Ratio (Modern HPC Approach)
    # Binder = Cement + Slag + Fly Ash
    df_eng['binder'] = df_eng['cement'] + df_eng['slag'] + df_eng['fly_ash']
    df_eng['w_b_ratio'] = df_eng['water'] / df_eng['binder']

    # Sanitize the new features. Checking only np.isinf on w/c misses two
    # cases: 0/0 yields NaN rather than inf, and w/b was never inspected.
    # A zero denominator is therefore tested directly, for both ratios; the
    # isinf test is kept so every case flagged before is still flagged.
    checks = (
        ('w/c', 'cement', 'w_c_ratio'),
        ('w/b', 'binder', 'w_b_ratio'),
    )
    for label, denominator, ratio in checks:
        suspicious = (df_eng[denominator] == 0) | np.isinf(df_eng[ratio])
        if suspicious.any():
            print(f"Warning: Division by zero detected in {label} ratio!")

    print("Feature Engineering Complete.")
    print(f"New Columns Added: {['w_c_ratio', 'w_b_ratio']}")

    return df_eng

# Apply the function
# NOTE(review): df already carries w_c_ratio / binder / w_b_ratio from
# preprocess_concrete_data, so this call recomputes the same three columns.
df = add_engineering_features(df)

# Check the first few rows to see the new columns
display(df[['cement', 'slag', 'fly_ash', 'water', 'w_c_ratio', 'w_b_ratio']].head())

Feature Engineering Complete.
New Columns Added: ['w_c_ratio', 'w_b_ratio']


Unnamed: 0,cement,slag,fly_ash,water,w_c_ratio,w_b_ratio
0,540.0,0.0,0.0,162.0,0.3,0.3
1,540.0,0.0,0.0,162.0,0.3,0.3
2,332.5,142.5,0.0,228.0,0.685714,0.48
3,332.5,142.5,0.0,228.0,0.685714,0.48
4,198.6,132.4,0.0,192.0,0.966767,0.58006


In [12]:
# --- Step 4: Exploratory Data Analysis (EDA) ---

def perform_eda(df: pd.DataFrame) -> pd.Series:
    """
    Performs basic statistical analysis to understand relationships
    between mix components and strength.

    Two reports are printed: the mean strength per age, and the correlation
    of every numeric column with 'strength'. Tables are rendered through
    ``display``, which is not defined in this file (presumably the built-in
    provided by the IPython/Jupyter runtime).

    Args:
        df (pd.DataFrame): Data containing at least the 'age' and 'strength'
            columns.

    Returns:
        pd.Series: Correlation of each numeric column with 'strength',
        sorted from most positive to most negative.

    Raises:
        KeyError: If 'age' or 'strength' is missing.
    """
    print("--- 1. Strength Development over Time (Age) ---")
    # Average strength per age. For a single mix, strength is expected to grow
    # with age (hydration); these means pool every mix tested at that age, so
    # the averaged trend itself is not guaranteed to be monotonic.
    age_strength = df.groupby('age')['strength'].mean()
    display(age_strength.sort_index())

    print("\n--- 2. Correlation Matrix (The 'Relationships' Map) ---")
    # Correlation measures linear relationships (-1 to +1)
    # +1: Strong positive, -1: Strong negative
    # Restrict to numeric/bool columns first: recent pandas versions raise on
    # non-numeric columns instead of silently skipping them in corr().
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric_df.corr()

    # Keep only the 'strength' column, most influential positive factors first
    strength_corr = corr_matrix['strength'].sort_values(ascending=False)
    display(strength_corr)

    return strength_corr

# Execute EDA; keep the sorted strength correlations for later reference
correlations = perform_eda(df)

--- 1. Strength Development over Time (Age) ---


age
1       9.455000
3      18.378140
7      25.182049
14     28.750968
28     36.429570
56     50.715233
90     40.480370
91     68.675882
100    47.668846
120    39.646667
180    41.730385
270    51.272308
360    40.696667
365    43.557857
Name: strength, dtype: float64


--- 2. Correlation Matrix (The 'Relationships' Map) ---


strength            1.000000
binder              0.598103
cement              0.488283
superplasticizer    0.344209
age                 0.337367
slag                0.103374
fly_ash            -0.080648
coarse_agg         -0.144717
fine_agg           -0.186448
water              -0.269624
w_c_ratio          -0.489401
w_b_ratio          -0.610843
Name: strength, dtype: float64

In [13]:
# --- Step 5: Exporting Clean Data ---

def export_data(df: pd.DataFrame, filename: str):
    """
    Writes the processed dataframe to ``filename`` as CSV (index omitted).

    On success a confirmation and the resulting file size in KB are printed.
    Any exception raised along the way is caught and reported with print, so
    this function never raises and always returns None.
    """
    import os

    try:
        df.to_csv(filename, index=False)
        print(f"✅ Success! Data exported to '{filename}'")
        print("Ready for Phase 2 (Machine Learning Modeling).")

        # Verify file creation before reporting its size
        if not os.path.exists(filename):
            return
        size_kb = os.path.getsize(filename) / 1024
        print(f"File Size: {size_kb:.2f} KB")
    except Exception as err:
        print(f"❌ Error exporting data: {err}")

# Save the engineered dataset (relative path, written to the working directory)
export_data(df, 'uci_concrete_clean_physics_engineered.csv')

✅ Success! Data exported to 'uci_concrete_clean_physics_engineered.csv'
Ready for Phase 2 (Machine Learning Modeling).
File Size: 88.09 KB
