In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error

import warnings as wr
wr.filterwarnings('ignore')

In [2]:
df = pd.read_csv('concrete_data.csv')
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
df.corr()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
Cement,1.0,-0.275216,-0.397467,-0.081587,0.092386,-0.109349,-0.222718,0.081946,0.497832
Blast Furnace Slag,-0.275216,1.0,-0.32358,0.107252,0.04327,-0.283999,-0.281603,-0.044246,0.134829
Fly Ash,-0.397467,-0.32358,1.0,-0.256984,0.377503,-0.009961,0.079108,-0.154371,-0.105755
Water,-0.081587,0.107252,-0.256984,1.0,-0.657533,-0.182294,-0.450661,0.277618,-0.289633
Superplasticizer,0.092386,0.04327,0.377503,-0.657533,1.0,-0.265999,0.222691,-0.1927,0.366079
Coarse Aggregate,-0.109349,-0.283999,-0.009961,-0.182294,-0.265999,1.0,-0.178481,-0.003016,-0.164935
Fine Aggregate,-0.222718,-0.281603,0.079108,-0.450661,0.222691,-0.178481,1.0,-0.156095,-0.167241
Age,0.081946,-0.044246,-0.154371,0.277618,-0.1927,-0.003016,-0.156095,1.0,0.328873
Strength,0.497832,0.134829,-0.105755,-0.289633,0.366079,-0.164935,-0.167241,0.328873,1.0


In [4]:
x_train, x_test, y_train, y_test = train_test_split(df[['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age']], df['Strength'], test_size=0.3, random_state=138)

## Simple Linear Regression

In [5]:
linear_reg_model = LinearRegression()
linear_reg_model.fit(x_train, y_train)
y_pred_lr = linear_reg_model.predict(x_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
accuracy_lr = linear_reg_model.score(x_test, y_test)
print(f"mse with linear regression: {mse_lr}\nacccuracy: {round(accuracy_lr*100, 2)}%")

mse with linear regression: 121.70082059087312
acccuracy: 58.56%


## Polynomial Regression
Performing Cross Validation to check the <strong><i>Optimal Degree</i></strong> for the PolynomialFeature

In [6]:
poly_mses = []
poly_degs = range(2,6)

for deg in poly_degs:
    poly_reg = LinearRegression()
    poly = PolynomialFeatures(degree=deg)
    x_train_poly = poly.fit_transform(x_train)
    x_test_poly = poly.fit_transform(x_test)
    poly_reg.fit(x_train_poly, y_train)
    y_pred_poly = poly_reg.predict(x_test_poly)
    print(poly_reg.score(x_test_poly, y_test))
    poly_mse = mean_squared_error(y_test, y_pred_poly)
    poly_mses.append(poly_mse)

poly_mses

0.7806916615339661
0.8995281199227331
-19.466564691675078
-42911834.114884384


[64.41260912389235, 29.50939296073279, 6011.193376479391, 12603548416.705826]

<strong> Degree 3 is optimal for this dataset</strong><br/>
### Polynomial Regression with Degree 3

In [7]:
poly_reg = LinearRegression()
poly_features = PolynomialFeatures(degree=3)
x_train_poly = poly_features.fit_transform(x_train)
x_test_poly = poly_features.fit_transform(x_test)
poly_reg.fit(x_train_poly, y_train)
y_pred_poly = poly_reg.predict(x_test_poly)

mse_poly = mean_squared_error(y_test, y_pred_poly)
accuracy_poly = poly_reg.score(x_test_poly, y_test)
print(f"mse with polynomial regression: {mse_poly}\naccuracy: {round(accuracy_poly*100, 2)}%")

mse with polynomial regression: 29.50939296073279
accuracy: 89.95%


## Linear Regression with Regularization (Lasso & Ridge)
### Lasso Regression

In [8]:
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(x_train, y_train)
y_pred_lasso = lasso_reg.predict(x_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
accuracy_lasso = lasso_reg.score(x_test, y_test)
print(f"mse with lasso regression: {mse_lasso}\naccuracy: {round(accuracy_lasso*100, 2)}%")

mse with lasso regression: 121.75262768287445
accuracy: 58.55%


### Ridge Regression

In [9]:
ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(x_train, y_train)
y_pred_ridge = ridge_reg.predict(x_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
accuracy_ridge = ridge_reg.score(x_test, y_test)
print(f"mse with ridge regression: {mse_ridge}\naccuracy: {round(accuracy_ridge*100, 2)}%")

mse with ridge regression: 121.70083374857936
accuracy: 58.56%


## Polyomial Regression with Regularization (Lasso & Ridge)
### Polynomial Lasso Regression (Degree 3)

In [10]:
lasso_2 = Lasso(alpha=0.1)
lasso_2.fit(x_train_poly, y_train)
y_pred_lasso_2 = lasso_2.predict(x_test_poly)
mse_lasso_2 = mean_squared_error(y_test, y_pred_lasso_2)
accuracy_lasso_2 = lasso_2.score(x_test_poly, y_test)
print(f"mse with polynomial lasso regression: {mse_lasso_2}\naccuracy: {round(accuracy_lasso_2*100, 2)}%")

mse with polynomial lasso regression: 37.69210719380275
accuracy: 87.17%


### Polynomial Ridge Regression (Degree 3)

In [11]:
ridge_2 = Ridge(alpha=0.1)
ridge_2.fit(x_train_poly, y_train)
y_pred_ridge_2 = ridge_2.predict(x_test_poly)
mse_ridge_2 = mean_squared_error(y_test, y_pred_ridge_2)
accuracy_ridge_2 = ridge_2.score(x_test_poly, y_test)
print(f"mse with polynomial ridge regression: {mse_ridge_2}\naccuracy: {round(accuracy_ridge_2*100, 2)}%")

mse with polynomial ridge regression: 29.540878700828472
accuracy: 89.94%


## Polynomial Rgression with Regularization and Scaling
### Polynomial Lasso Regression with Normalization (Degree 3)

In [12]:
norm_scaler = MinMaxScaler()
x_train_norm, x_test_norm = norm_scaler.fit_transform(x_train), norm_scaler.fit_transform(x_test)
xtrain_norm_poly, xtest_norm_poly = poly.fit_transform(x_train_norm), poly.fit_transform(x_test_norm)

lasso_3 = Lasso(alpha=0.1)
lasso_3.fit(xtrain_norm_poly, y_train)
y_pred_lasso_3 = lasso_3.predict(xtest_norm_poly)
mse_lasso_3 = mean_squared_error(y_test, y_pred_lasso_3)
accuracy_lasso_3 = lasso_3.score(xtest_norm_poly, y_test)
print(f"mse with polynomial lasso regression and normalization: {mse_lasso_3}\naccuracy: {round(accuracy_lasso_3*100, 2)}%")

mse with polynomial lasso regression and normalization: 99.9671987979908
accuracy: 65.96%


### Polynomial Lasso Regression with Standardization (Degree 3)

In [13]:
std_scaler = StandardScaler()
x_train_std, x_test_std = std_scaler.fit_transform(x_train), std_scaler.fit_transform(x_test)
xtrain_std_poly, xtest_std_poly = poly.fit_transform(x_train_std), poly.fit_transform(x_test_std)

lasso_4 = Lasso(alpha=0.1)
lasso_4.fit(xtrain_std_poly, y_train)
y_pred_lasso_4 = lasso_4.predict(xtest_std_poly)
mse_lasso_4 = mean_squared_error(y_test, y_pred_lasso_4)
accuracy_lasso_4 = lasso_4.score(xtest_std_poly, y_test)
print(f"mse with polynomial lasso regression and standardization: {mse_lasso_4}\naccuracy: {round(accuracy_lasso_4*100, 2)}%")

mse with polynomial lasso regression and standardization: 33.02069473374429
accuracy: 88.76%


### Polynomial Ridge Regression with Normalization (Degree 3)

In [14]:
ridge_3 = Ridge(alpha=0.1)
ridge_3.fit(xtrain_norm_poly, y_train)
y_pred_ridge_3 = ridge_3.predict(xtest_norm_poly)
mse_ridge_3 = mean_squared_error(y_test, y_pred_ridge_3)
accuracy_ridge_3 = ridge_3.score(xtest_norm_poly, y_test)
print(f"mse with polynomial ridge regression and normalization: {mse_ridge_3}\naccuracy: {round(accuracy_ridge_3*100, 2)}%")

mse with polynomial ridge regression and normalization: 42.39674687263751
accuracy: 85.57%


### Polynomial Ridge Regression with Standardization (Degree 3)

In [15]:
ridge_4 = Ridge(alpha=0.1)
ridge_4.fit(xtrain_std_poly, y_train)
y_pred_ridge_4 = ridge_4.predict(xtest_std_poly)
mse_ridge_4 = mean_squared_error(y_test, y_pred_ridge_4)
accuracy_ridge_4 = ridge_4.score(xtest_std_poly, y_test)
print(f"mse with polynomial ridge regression and standardization: {mse_ridge_4}\naccuracy: {round(accuracy_ridge_4*100, 2)}%")

mse with polynomial ridge regression and standardization: 632.5689474572594
accuracy: -115.37%


<strong>Polynomial Regression (Degree 3)</strong> is the optimal model for this Dataset with an MSE of <strong>29.51</strong> and <strong>89.95%</strong> accuracy

In [18]:
df_result = pd.DataFrame({'Strength': y_test, 'Predicted Strength': y_pred_poly})
df_result['Error(%)'] = round((abs(df_result['Strength']-df_test['Predicted Strength'])/df_test['Strength'])*100, 2)

29.50939296073279