In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv(r"C:\Users\Aman Singh\Downloads\archive (2)\concrete_data.csv")
data.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
data.shape

(1030, 9)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   cement                         1030 non-null   float64
 1   blast_furnace_slag             1030 non-null   float64
 2   fly_ash                        1030 non-null   float64
 3   water                          1030 non-null   float64
 4   superplasticizer               1030 non-null   float64
 5   coarse_aggregate               1030 non-null   float64
 6   fine_aggregate                 1030 non-null   float64
 7   age                            1030 non-null   int64  
 8   concrete_compressive_strength  1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [5]:
data.describe()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


## Prepocessing

In [6]:
data

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [13]:
def preprocess_input(df):
    df = df.copy()
    
    y = df['concrete_compressive_strength'].copy()
    X = df.drop('concrete_compressive_strength', axis=1).copy()
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Scale X with a standard scaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [14]:
X_train, X_test, y_train, y_test = preprocess_input(data)

In [18]:
print(X_train.mean())

cement               -1.244959e-16
blast_furnace_slag   -1.321181e-16
fly_ash              -3.372245e-17
water                 6.572028e-16
superplasticizer     -1.624529e-16
coarse_aggregate      1.417421e-15
fine_aggregate       -3.336443e-16
age                   2.617724e-17
dtype: float64


In [19]:
print(X_train.var())

cement                1.001389
blast_furnace_slag    1.001389
fly_ash               1.001389
water                 1.001389
superplasticizer      1.001389
coarse_aggregate      1.001389
fine_aggregate        1.001389
age                   1.001389
dtype: float64


## Model Selection

In [21]:
models = {
    "                     Linear Regression" : LinearRegression(),
    "                 L2 (Ridge) Regression" : Ridge(),
    "Support Vector Machine (Linear Kernel)" : LinearSVR(),
    "   Support Vector Machine (RBF Kernel)" : SVR(),
    "                         Decision Tree" : DecisionTreeRegressor(),
    "                        Neural Network" : MLPRegressor(),
    "                         Random Forest" : RandomForestRegressor(),
    "                     Gradient Boosting" : GradientBoostingRegressor(),
    "                              AdaBoost" : AdaBoostRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " model trained.")

                     Linear Regressionmodel trained.
                 L2 (Ridge) Regressionmodel trained.
Support Vector Machine (Linear Kernel)model trained.
   Support Vector Machine (RBF Kernel)model trained.
                         Decision Treemodel trained.




                        Neural Networkmodel trained.
                         Random Forestmodel trained.
                     Gradient Boostingmodel trained.
                              AdaBoostmodel trained.


In [23]:
for name, model in models.items():
    print(name + " R^2 : {:.5f}".format(model.score(X_test,y_test)))

                     Linear Regression R^2 : 0.59438
                 L2 (Ridge) Regression R^2 : 0.59508
Support Vector Machine (Linear Kernel) R^2 : 0.55744
   Support Vector Machine (RBF Kernel) R^2 : 0.60928
                         Decision Tree R^2 : 0.81917
                        Neural Network R^2 : 0.45350
                         Random Forest R^2 : 0.88725
                     Gradient Boosting R^2 : 0.89107
                              AdaBoost R^2 : 0.77817


## Model Optimization

In [24]:
best_model = GradientBoostingRegressor()
best_model.fit(X_train, y_train)

print("Model R^2 (Before Optimization) : {:.5f}".format(best_model.score(X_test,y_test)))

Model R^2 (Before Optimization) : 0.89128


In [26]:
params = {
    'learning_rate' : [0.01, 0.1, 1.0],
    'n_estimators' : [100, 150, 200],
    'max_depth' : [3, 4, 5]
}

clf = GridSearchCV(best_model, params)
clf.fit(X_train, y_train)

clf.best_params_

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}

In [27]:
print("Model R^2 (After Optimization) : {:.5f}".format(clf.score(X_test,y_test)))

Model R^2 (After Optimization) : 0.92183
