In [1]:
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('concrete_data.csv')
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
x = df[['Cement','Blast Furnace Slag','Fly Ash','Water','Superplasticizer','Coarse Aggregate','Coarse Aggregate','Fine Aggregate','Age']]

In [4]:
x.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Coarse Aggregate.1,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,978.4,825.5,360


In [5]:
y = df[['Strength']]
y.head()

Unnamed: 0,Strength
0,79.99
1,61.89
2,40.27
3,41.05
4,44.3


In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=0)

In [7]:
xtrain.shape

(772, 9)

In [8]:
xtest.shape

(258, 9)

# OLS

In [9]:
model = LinearRegression()
model.fit(xtrain, ytrain)

In [10]:
y_pred = model.predict(xtest)

In [11]:
y_pred

array([[39.54173004],
       [14.34766636],
       [61.33148723],
       [54.02870482],
       [24.44257981],
       [53.63094459],
       [45.98973566],
       [27.31187439],
       [53.23177832],
       [37.08298577],
       [16.75983422],
       [39.4324516 ],
       [29.69334674],
       [35.87961914],
       [47.63884285],
       [56.70862412],
       [35.79871479],
       [29.36308725],
       [48.69052196],
       [35.33194214],
       [53.80404465],
       [32.10561488],
       [33.046423  ],
       [48.23970516],
       [23.61819651],
       [23.53591425],
       [69.42292047],
       [26.76815501],
       [56.70862412],
       [48.23338891],
       [18.9707685 ],
       [36.43996199],
       [17.42660579],
       [19.52438719],
       [24.61010586],
       [15.2843709 ],
       [53.18012778],
       [28.15463908],
       [27.19220514],
       [26.32272738],
       [52.53006928],
       [30.42115481],
       [25.78127338],
       [35.57455811],
       [53.778552  ],
       [53

In [12]:
mse = mean_squared_error(ytest, y_pred)
print('MSE:', mse)

MSE: 95.88863777582235


In [13]:
model.score(xtest, ytest) #testing score

0.6234704748356753

In [15]:
model.score(xtrain, ytrain) # training score

0.6108763464892633

# Polynomial

In [16]:
from sklearn.preprocessing import PolynomialFeatures

# Degree 2

In [17]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(x)

In [18]:
pd.DataFrame(X_poly).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,1.0,540.0,0.0,0.0,162.0,2.5,1040.0,1040.0,676.0,28.0,...,1081600.0,1081600.0,703040.0,29120.0,1081600.0,703040.0,29120.0,456976.0,18928.0,784.0
1,1.0,540.0,0.0,0.0,162.0,2.5,1055.0,1055.0,676.0,28.0,...,1113025.0,1113025.0,713180.0,29540.0,1113025.0,713180.0,29540.0,456976.0,18928.0,784.0
2,1.0,332.5,142.5,0.0,228.0,0.0,932.0,932.0,594.0,270.0,...,868624.0,868624.0,553608.0,251640.0,868624.0,553608.0,251640.0,352836.0,160380.0,72900.0
3,1.0,332.5,142.5,0.0,228.0,0.0,932.0,932.0,594.0,365.0,...,868624.0,868624.0,553608.0,340180.0,868624.0,553608.0,340180.0,352836.0,216810.0,133225.0
4,1.0,198.6,132.4,0.0,192.0,0.0,978.4,978.4,825.5,360.0,...,957266.56,957266.56,807669.2,352224.0,957266.56,807669.2,352224.0,681450.25,297180.0,129600.0


In [19]:
xtrain, xtest, ytrain, ytest = train_test_split(X_poly, y, test_size=0.25, random_state=0)

In [20]:
xtrain.shape

(772, 55)

In [21]:
model2 = LinearRegression()
model2.fit(xtrain, ytrain)

In [22]:
y_pred2 = model2.predict(xtest)

In [23]:
y_pred2

array([[35.49812176],
       [11.24812176],
       [76.24812176],
       [53.87312176],
       [16.99812176],
       [57.74812176],
       [50.74812176],
       [25.24812176],
       [61.24812176],
       [49.24812176],
       [16.49812176],
       [51.99812176],
       [32.74812176],
       [24.74812176],
       [56.62312176],
       [67.74812176],
       [30.49812176],
       [36.49812176],
       [51.24812176],
       [33.49812176],
       [59.49812176],
       [38.74812176],
       [29.49812176],
       [50.24812176],
       [21.99812176],
       [23.99812176],
       [52.12312176],
       [25.24812176],
       [67.74812176],
       [61.99812176],
       [17.49812176],
       [44.74812176],
       [19.49812176],
       [29.24812176],
       [23.74812176],
       [ 9.99812176],
       [56.24812176],
       [22.74812176],
       [23.24812176],
       [22.74812176],
       [47.62312176],
       [38.99812176],
       [30.24812176],
       [34.74812176],
       [56.24812176],
       [43

In [27]:
mse = mean_squared_error(ytest, y_pred2)
print('MSE:', mse)

MSE: 61.5436186179508


In [28]:
model2.score(xtest, ytest) #testing score

0.7583343550120365

In [29]:
model2.score(xtrain, ytrain) # training score

0.8183031716441485

# Degree 3

In [30]:
poly = PolynomialFeatures(degree=3)
X_poly_deg3 = poly.fit_transform(x)

In [31]:
X_poly_deg3.shape

(1030, 220)

In [32]:
xtrain, xtest, ytrain, ytest = train_test_split(X_poly_deg3, y, test_size=0.25, random_state=0)

In [33]:
model2 = LinearRegression()
model2.fit(xtrain, ytrain)

In [34]:
y_pred3 = model2.predict(xtest)

In [35]:
y_pred3

array([[27.49973965],
       [ 9.66504041],
       [77.35197039],
       [57.1737338 ],
       [10.94053178],
       [41.29410365],
       [64.00790872],
       [20.66680623],
       [74.59501765],
       [48.2731725 ],
       [16.06983044],
       [43.25319123],
       [38.47608094],
       [18.3861814 ],
       [65.51188518],
       [61.68401736],
       [32.54564208],
       [37.38906401],
       [56.10617903],
       [38.20007134],
       [54.3781857 ],
       [25.17674452],
       [30.62887975],
       [54.49062003],
       [21.96703059],
       [27.40204118],
       [69.12491056],
       [22.29261454],
       [61.68401736],
       [70.55567193],
       [22.39857106],
       [49.04148858],
       [22.22752844],
       [33.96840682],
       [22.94584953],
       [ 7.61611016],
       [38.13671117],
       [25.20705732],
       [26.29098315],
       [29.47528105],
       [52.33070676],
       [41.38251249],
       [26.25384644],
       [41.36661283],
       [63.0812947 ],
       [38

In [36]:
mse = mean_squared_error(ytest, y_pred3)
print('MSE:', mse)

MSE: 52.250992022143706


In [37]:
model2.score(xtest, ytest) # testing

0.7948240618303647

In [38]:
model2.score(xtrain, ytrain) # training

0.9286334838236247

Degree 3 give best testing performance.

# Degree 4

In [39]:
poly = PolynomialFeatures(degree=4)
X_poly_deg4 = poly.fit_transform(x)

In [40]:
X_poly_deg4.shape

(1030, 715)

In [41]:
xtrain, xtest, ytrain, ytest = train_test_split(X_poly_deg4, y, test_size=0.25, random_state=0)

In [42]:
model2 = LinearRegression()
model2.fit(xtrain, ytrain)

In [43]:
y_pred4 = model2.predict(xtest)

In [44]:
y_pred4

array([[ 3.17448859e+01],
       [ 1.01003111e+01],
       [ 8.02135644e+01],
       [ 2.21221027e+02],
       [ 1.21048403e+01],
       [ 4.40667982e+01],
       [ 4.25329608e+02],
       [ 2.22515016e+01],
       [ 1.27789867e+02],
       [ 5.23184562e+01],
       [ 1.81772018e+01],
       [ 3.68900075e+01],
       [ 3.77015262e+01],
       [ 1.67398288e+01],
       [ 9.51433997e+01],
       [ 5.16999622e+01],
       [ 3.37382031e+01],
       [ 3.74627233e+01],
       [ 5.73107219e+01],
       [ 4.30975976e+01],
       [ 3.92790151e+01],
       [ 2.38594203e+01],
       [ 3.14837845e+01],
       [ 5.52715793e+01],
       [ 1.86892128e+01],
       [ 2.68744426e+01],
       [ 8.06033406e+01],
       [ 2.27357078e+01],
       [ 5.16999622e+01],
       [ 7.93375030e+01],
       [ 2.05412097e+01],
       [ 5.15564609e+01],
       [ 1.99323793e+01],
       [ 4.21407867e+01],
       [ 2.12190819e+01],
       [ 8.66816380e+00],
       [ 4.04069438e+01],
       [ 2.55470433e+01],
       [ 2.7

In [45]:
mse = mean_squared_error(ytest, y_pred4)
print('MSE:', mse)

MSE: 9010.427614213324


In [46]:
model2.score(xtest, ytest) # testing

-34.38158544956101

In [47]:
model2.score(xtrain, ytrain) # training

0.9845789399412525

# Regularization

In [48]:
x = df[['Cement','Blast Furnace Slag','Fly Ash','Water','Superplasticizer','Coarse Aggregate','Coarse Aggregate','Fine Aggregate','Age']]

In [49]:
y = df[['Strength']]

In [50]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=0)

In [51]:
xtrain.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Coarse Aggregate.1,Fine Aggregate,Age
79,531.3,0.0,0.0,141.8,28.2,852.1,852.1,893.7,3
193,233.8,0.0,94.6,197.9,4.6,947.0,947.0,852.2,100
367,214.9,53.8,121.9,155.6,9.6,1014.3,1014.3,780.6,56
181,389.9,189.0,0.0,145.9,22.0,944.7,944.7,755.8,91
711,305.3,203.5,0.0,203.5,0.0,965.4,965.4,631.0,90


# L1 Regularization with LR - Lasso

In [52]:
lasso_model = Lasso(alpha=0.1) # Tune alpha parameter
lasso_model.fit(xtrain, ytrain)

# Performance

In [54]:
lasso_predictions = lasso_model.predict(xtest)
lasso_mse = mean_squared_error(ytest, lasso_predictions)
print('Lasso MSE:', lasso_mse)

Lasso MSE: 95.85112533879942


In [55]:
lasso_model.score(xtest, ytest)

0.6236177763348688

# L2 Regularization with LR - Ridge

In [56]:
ridge_model = Ridge(alpha=0.1) # Tune alpha parameter
ridge_model.fit(xtrain, ytrain)

In [57]:
ridge_predictions = ridge_model.predict(xtest)
ridge_mse = mean_squared_error(ytest, ridge_predictions)

print('Ridge MSE:', ridge_mse)

Ridge MSE: 95.88862773914813


In [58]:
ridge_model.score(xtest, ytest)

0.623470514247062