In [1]:
# Importing necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor


In [2]:
# Data loading
# The processed dataset is looked up under <parent of the current working directory>/data/processed,
# so this cell depends on the directory the kernel was started from.
dir_path = os.path.join(os.path.dirname(os.getcwd()), "data", "processed")
data_path = os.path.join(dir_path, "Concrete_processed_data.xlsx")
df = pd.read_excel(data_path)
# Preview the first 10 rows
df.head(10)

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)",water_cement_ratio,water_binder_ratio,agg_binder_ratio,total_binder_content,total_mix,cement_ratio,log_age,age_group,slag_age_interaction,cement_age_interaction,sp_flag,slag_flag
0,540.0,0.0,0.0,162.0,2.5,28,79.986111,0.3,0.3,3.177778,540.0,2418.0,1.0,3.367296,Standard,0.0,1818.339748,1,0
1,540.0,0.0,0.0,162.0,2.5,28,61.887366,0.3,0.3,3.205556,540.0,2433.0,1.0,3.367296,Standard,0.0,1818.339748,1,0
2,332.5,142.5,0.0,228.0,0.0,270,40.269535,0.685714,0.48,3.212632,475.0,2229.0,0.7,5.602119,Long-term,798.301932,1862.704508,0,1
3,332.5,142.5,0.0,228.0,0.0,365,41.05278,0.685714,0.48,3.212632,475.0,2229.0,0.7,5.902633,Long-term,841.12525,1962.625583,0,1
4,198.6,132.4,0.0,192.0,0.0,360,44.296075,0.966767,0.58006,5.449849,331.0,2326.9,0.6,5.888878,Long-term,779.687442,1169.531163,0,1
5,266.0,114.0,0.0,228.0,0.0,90,47.029847,0.857143,0.6,4.215789,380.0,2210.0,0.7,4.51086,Mature,514.237984,1199.888629,0,1
6,380.0,95.0,0.0,228.0,0.0,365,43.698299,0.6,0.48,3.212632,475.0,2229.0,0.8,5.902633,Long-term,560.750167,2243.000667,0,1
7,380.0,95.0,0.0,228.0,0.0,28,36.44777,0.6,0.48,3.212632,475.0,2229.0,0.8,3.367296,Standard,319.893104,1279.572415,0,1
8,266.0,114.0,0.0,228.0,0.0,28,45.854291,0.857143,0.6,4.215789,380.0,2210.0,0.7,3.367296,Standard,383.871725,895.700691,0,1
9,475.0,0.0,0.0,228.0,0.0,28,39.28979,0.48,0.48,3.212632,475.0,2229.0,1.0,3.367296,Standard,0.0,1599.465519,0,0


In [3]:
# Inspect the dtype pandas inferred for each column
df.dtypes

Cement (component 1)(kg in a m^3 mixture)                float64
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    float64
Fly Ash (component 3)(kg in a m^3 mixture)               float64
Water  (component 4)(kg in a m^3 mixture)                float64
Superplasticizer (component 5)(kg in a m^3 mixture)      float64
Age (day)                                                  int64
Concrete compressive strength(MPa, megapascals)          float64
water_cement_ratio                                       float64
water_binder_ratio                                       float64
agg_binder_ratio                                         float64
total_binder_content                                     float64
total_mix                                                float64
cement_ratio                                             float64
log_age                                                  float64
age_group                                                    str
slag_age_interaction     

In [4]:
# (rows, columns) of the loaded frame
df.shape

(1005, 19)

In [None]:
# Defining X and y
# `str.strip()` returns a new Index rather than modifying the frame, so the result
# has to be assigned back for the whitespace-trimmed column names to take effect.
# Re-running this cell is safe: stripping already-stripped names is a no-op.
df.columns = df.columns.str.strip()
target = 'Concrete compressive strength(MPa, megapascals)'
X = df.drop(columns=[target])
y = df[target]

print(X.shape)
print(y.shape)

In [None]:
# Train/test split: hold out 20% of the rows for testing, shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Report the shape of each split: features first, then targets
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Summary statistics of the test target, to compare against the training target
y_test.describe()

In [None]:
# Summary statistics of the training target, to compare against the test target
y_train.describe()

### Why Check Mean and Std of y_train and y_test

When splitting a dataset into training and test sets, the **target variable distribution** should be roughly the same in both sets.

- **Mean similarity:** Ensures that the average target value is consistent between train and test.
  - If one set has mostly high values and the other low, the model might see biased patterns during training.

- **Standard deviation similarity:** Ensures that the variability of the target is similar in both sets.
  - This helps the model generalize well, rather than overfitting to the narrower range of values in the training set.

**Summary:** Close mean and std in `y_train` and `y_test` confirms that the split preserves the overall target distribution, giving a fair evaluation during testing.

In [None]:
# Detecting target outliers on the training data only, to prevent data leakage
def detecting_outlier_iqr(y_train, factor=1.5):
    """Flag values that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``y_train``.

    Parameters
    ----------
    y_train : pandas.Series
        Numeric values to screen. The fences are computed from this same series.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. Larger values
        flag fewer points.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``y_train``; True marks an outlier.
    """
    q1 = y_train.quantile(0.25)
    q3 = y_train.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Strict comparisons: a value lying exactly on a fence is not flagged.
    return (y_train < lower_bound) | (y_train > upper_bound)

# Build the outlier mask for the training target and print how many values were flagged
outliers = detecting_outlier_iqr(y_train)
print(outliers.sum())

In [None]:
# Visualizing the outliers
# Box plot of the training target distribution
plt.figure(figsize=(10,10))
sns.boxplot(data= y_train)
plt.title("Boxplot of outliers")
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Flag the outlying targets, then keep only the rows that were NOT flagged.
# `~` inverts the boolean mask, so `keep_rows` is True where the data is clean.
outliers_mask = detecting_outlier_iqr(y_train)
keep_rows = ~outliers_mask

# Apply the same row selection to features and target so they stay aligned,
# and renumber both from 0.
X_train_clean = X_train.loc[keep_rows].reset_index(drop=True)
y_train_clean = y_train.loc[keep_rows].reset_index(drop=True)

In [None]:
# Shapes after dropping the flagged rows; both were filtered with the same mask,
# so the row counts should match
print(X_train_clean.shape)
print(y_train_clean.shape)

### Why Remove Target Outliers Only

When cleaning data for regression, focus on **target (`y`) outliers**, not the feature (`X`) outliers.

- **Impact of target outliers:** Extreme values in the target can disproportionately influence the regression line, causing poor predictions on normal samples.
- **Impact of feature outliers:** Many models, especially tree-based ones, are robust to extreme feature values, so removing them is often unnecessary.

**Summary:** Removing only target outliers prevents the model from being skewed by extreme responses, while preserving the natural variability of the features.

**Note:** Always remove outliers **only from the training set** to avoid leaking information from the test set.

## SKLEARN ML PIPELINE

### 1.  LINEAR REGRESSION MODEL

In [None]:
# Identifying column types
# The target was already dropped when X was defined, so X_train holds features only:
# every column that is not listed as categorical is treated as numeric.
categorical_features = ['age_group']
numerical_features = [col for col in X_train.columns
                      if col not in categorical_features]

# Combining the two into a column transformer:
# numeric columns are standardized, the categorical column is one-hot encoded.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Creating the linear regression pipeline.
lr_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ]
)

print(lr_pipeline)

In [None]:
# Fitting the pipeline to the outlier-filtered training data.
# X_train_clean / y_train_clean are the training rows left after the IQR filter on
# the target; fitting on the raw X_train / y_train would leave that cleaning unused.
lr_pipeline.fit(X_train_clean, y_train_clean)

In [None]:
# Making predictions on the test set (the test rows are not outlier-filtered).
y_pred_lr = lr_pipeline.predict(X_test)

In [None]:
# Test-set evaluation of the linear regression predictions
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Banner followed by one "<metric>: <value>" line per metric
print("LINEAR REGRESSION MODEL")
print("=========================")
for label, value in (("RMSE", rmse_lr), ("MAE", mae_lr), ("R2", r2_lr)):
    print(f"{label}: {value}")

### 2. RANDOM FOREST MODEL

In [None]:
# Building the random forest pipeline
# Pipeline.fit fits its steps in place, so reusing the very same `preprocessor`
# object here would refit the transformer instance that lr_pipeline also holds.
# clone() gives this pipeline its own unfitted copy with identical parameters.
rf_pipeline = Pipeline(
    steps = [
        ('preprocessor', clone(preprocessor)),
        ('model', RandomForestRegressor(n_estimators = 100, random_state = 42))
    ]
)

In [None]:
# Fit on the outlier-filtered training rows produced by the IQR filter on the target;
# fitting on the raw X_train / y_train would leave that cleaning unused.
rf_pipeline.fit(X_train_clean, y_train_clean)

In [None]:
# Make predictions on the test set with the fitted random forest pipeline
y_pred_rf = rf_pipeline.predict(X_test)

In [None]:
# Test-set evaluation of the random forest predictions
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Banner followed by one "<metric>: <value>" line per metric
print("RANDOM FOREST REGRESSION MODEL")
print("=========================")
for label, value in (("RMSE", rmse_rf), ("MAE", mae_rf), ("R2", r2_rf)):
    print(f"{label}: {value}")

In [None]:
# Side-by-side table of the test-set metrics for both models.
# The R² header was stored as mis-decoded text ("RÂ²"); it is written correctly here.
print("Model Comparison")
print("="*45)
print(f"{'Model':<20} {'RMSE':>8} {'MAE':>8} {'R²':>8}")
print("-"*45)
# One row per model, all formatted with the same column widths
for model_name, rmse, mae, r2 in (
    ("Linear Regression", rmse_lr, mae_lr, r2_lr),
    ("Random Forest", rmse_rf, mae_rf, r2_rf),
):
    print(f"{model_name:<20} {rmse:>8.2f} {mae:>8.2f} {r2:>8.4f}")