In [1]:
# 1. SETUP: IMPORT LIBRARIES
# ==============================================================================
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib  # Library for saving and loading models
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
# 2. DATA LOADING
# ==============================================================================
# Location of the dataset CSV that the loading cell reads.
# A bare filename is resolved against the current working directory, so either
# keep the CSV next to this notebook or put the full path here instead.
file_path = "concrete_data.csv"  # <-- CHANGE THIS TO YOUR FILENAME

In [3]:
# Read the dataset into `df`.
# If the file is missing, print guidance and then re-raise the original error.
# The previous exit() call is a poor fit here: inside a notebook it shuts the
# kernel down, and in a plain script exit() with no argument ends the process
# with a success status. Re-raising stops execution at this cell with a
# traceback instead.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please make sure the CSV file is in the correct directory and the filename is correct.")
    raise  # propagate so that no later cell runs without `df`
else:
    print(f"Successfully loaded data from '{file_path}'.")

Successfully loaded data from 'concrete_data.csv'.


In [4]:
# 3. DATA CLEANING
# ==============================================================================
# Section banner so the cleaning messages are easy to find in the output.
cleaning_banner = "\n--- Data Cleaning and Preparation ---"
print(cleaning_banner)


--- Data Cleaning and Preparation ---


In [5]:
# --- Check for Missing Values ---
# Total number of null cells across every column of the frame.
missing_cell_count = df.isnull().sum().sum()
has_missing = missing_cell_count > 0
# This cell only reports; it does not modify `df`.
print(
    "Missing values found. Consider a strategy to handle them (e.g., filling with mean)."
    if has_missing
    else "No missing values found."
)

No missing values found.


In [6]:
# --- Check for and Remove Duplicates ---
# Count rows that are exact repeats of an earlier row.
duplicates = df.duplicated().sum()
if duplicates > 0:
    # Rebind `df` to a de-duplicated frame rather than mutating it in place
    # (inplace=True is discouraged in pandas and prevents chaining), and only
    # report the removal once it has actually happened.
    df = df.drop_duplicates()
    print(f"Found and removed {duplicates} duplicate rows.")
else:
    print("No duplicate rows found.")

Found and removed 25 duplicate rows.


In [7]:
# 4. PREPARE DATA FOR MODELING
# ==============================================================================
# --- Separate the predictors (X) from the target (y) ---
target_column = 'Strength'
X = df.drop(columns=[target_column])  # every column except the target
y = df[target_column]                 # the column the model learns to predict

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [11]:
# Standardise the features. The scaler learns its statistics from the training
# split only, and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Report how many rows landed in each split.
n_train_rows, n_test_rows = len(X_train), len(X_test)
print(f"\nData successfully split: {n_train_rows} training samples and {n_test_rows} testing samples.")


Data successfully split: 804 training samples and 201 testing samples.


In [13]:
print("\n--- Training XGBoost Model ---")
# Hyperparameters collected in one mapping and unpacked into the regressor.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 150,    # number of trees
    'learning_rate': 0.1,   # step size shrinkage
    'max_depth': 5,         # maximum depth of a tree
    'random_state': 42,
    'n_jobs': -1,           # use all available CPU cores
}
xgb_model = xgb.XGBRegressor(**xgb_params)
# Fit on the scaled training features against the training targets.
xgb_model.fit(X_train_scaled, y_train)
print("Model training complete.")


--- Training XGBoost Model ---
Model training complete.


In [27]:
print("\n--- XGBoost Model Performance Evaluation ---")
# Score the fitted model on the held-out, scaled test split.
y_pred = xgb_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Despite its name, this is simply R² rescaled to a percentage.
accuracy_score = 100 * r2

report_lines = [
    f"Model Accuracy (R-squared as %): {accuracy_score:.2f}%",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error (MAE): {mae:.4f} MPa",
    f"Root Mean Squared Error (RMSE): {rmse:.4f} MPa",
]
print("\n".join(report_lines))


--- XGBoost Model Performance Evaluation ---
Model Accuracy (R-squared as %): 93.53%
R-squared (R²): 0.9353
Mean Absolute Error (MAE): 2.8730 MPa
Root Mean Squared Error (RMSE): 4.3927 MPa


In [15]:
# Persist the fitted model and the fitted scaler, each to its own file.
for fitted_object, target_file in (
    (xgb_model, 'xgboost_concrete_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(fitted_object, target_file)
print("\n--- Model and scaler have been saved to files ('xgboost_concrete_model.pkl', 'scaler.pkl') ---")


--- Model and scaler have been saved to files ('xgboost_concrete_model.pkl', 'scaler.pkl') ---


In [26]:
print("\n--- Example of Loading Model for New Predictions ---")

# Load the saved model and scaler.
# NOTE: joblib files are pickles, and unpickling can execute arbitrary code.
# Only load files you created yourself or fully trust.
loaded_model = joblib.load('xgboost_concrete_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
print("Model and scaler loaded successfully.")

# Create a hypothetical new concrete mixture to predict its strength.
# The values are positional and must follow the training column order:
# (Cement, Slag, Ash, Water, Superplasticizer, Coarse Agg, Fine Agg, Age)
new_concrete_mix = np.array([[350, 150, 0, 180, 5.5, 950, 750, 90]])

# If the scaler recorded column names when it was fitted (it does when fitted
# on a DataFrame), label the new sample with those names. This gives the scaler
# input of the same kind it was fitted on, fails immediately if the number of
# values does not match the number of features, and lets the reader check which
# value was assigned to which feature.
feature_names = getattr(loaded_scaler, 'feature_names_in_', None)
if feature_names is not None:
    new_mix_input = pd.DataFrame(new_concrete_mix, columns=feature_names)
    print("\nNew mix as interpreted by the scaler:")
    print(new_mix_input.to_string(index=False))
else:
    # The scaler was fitted on unlabelled data, so pass the raw array through.
    new_mix_input = new_concrete_mix

# IMPORTANT: You MUST scale the new data using the SAME scaler that was fit on the training data
new_mix_scaled = loaded_scaler.transform(new_mix_input)

# Make the prediction
predicted_strength = loaded_model.predict(new_mix_scaled)

print("\nPrediction for the new concrete mix:")
print(f"  -> Predicted Strength: {predicted_strength[0]:.2f} MPa")


--- Example of Loading Model for New Predictions ---
Model and scaler loaded successfully.

Prediction for the new concrete mix:
  -> Predicted Strength: 53.51 MPa
