In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
#Importing the dataset
df = pd.read_csv('https://raw.githubusercontent.com/Premalatha-success/NIIT-Batch/main/concrete.csv')

In [3]:
#Displaying data
df.head()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29


In [4]:
#Checking for inconsistent data types
df.dtypes

cement          float64
slag            float64
ash             float64
water           float64
superplastic    float64
coarseagg       float64
fineagg         float64
age               int64
strength        float64
dtype: object

In [5]:
#Checking general info about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-null   float64
 7   age           1030 non-null   int64  
 8   strength      1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [6]:
#checking for duplicates
duplicate = df.duplicated()
duplicate.sum()

25

In [7]:
#Dropping duplicate data
df.drop_duplicates(inplace=True)

In [8]:
#Checking for null values in the dataset
df.isnull().sum()

cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

In [9]:
#Calculating median to replace zero values with
slag_med =df["slag"].median()
slag_med

20.0

In [10]:
df["slag"].replace("0",slag_med,inplace=True)

In [11]:
age_med =df["age"].median() 
age_med

28.0

In [12]:
df["age"].replace("0",age_med,inplace=True)

In [13]:
sup_med =df["superplastic"].median() 
sup_med

6.1

In [14]:
df["superplastic"].replace("0",sup_med,inplace=True)

In [15]:
df.head()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29


In [16]:
df.isnull().sum()

cement          0
slag            0
ash             0
water           0
superplastic    0
coarseagg       0
fineagg         0
age             0
strength        0
dtype: int64

In [17]:
#Linear Regression model. Selecting independent and dependent variables
X = df.drop(['strength'], axis = 1)
Y = df['strength']

In [18]:
#Splitinto training and testing 
X_train, X_test, Y_train,Y_test = train_test_split(X, Y, test_size=0.30, random_state=0)

In [19]:
#Linear Regression Model
model_1 = LinearRegression()
model_1.fit(X_train, Y_train)

LinearRegression()

In [20]:
model_1.score(X_train, Y_train)

0.6027622225737224

In [21]:
model_1.score(X_test, Y_test)

0.5955172905549901

In [22]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

In [23]:
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train1 = poly.fit_transform(X_train)
X_test1 = poly.fit_transform(X_test)
poly_clf = linear_model.LinearRegression()
poly_clf.fit(X_train1, Y_train)
print(poly_clf.score(X_train1, Y_train))

0.7648934341294654


In [24]:
print(poly_clf.score(X_test1, Y_test))

0.6805256594384602


In [25]:
#Decision tree model
from sklearn.tree import DecisionTreeRegressor

In [26]:
model_2 = DecisionTreeRegressor(max_depth = 5)
model_2.fit(X_train, Y_train)

DecisionTreeRegressor(max_depth=5)

In [27]:
model_2.score(X_train, Y_train) 

0.7934570189958494

In [28]:
model_2.score(X_test, Y_test)

0.774669593455146

In [29]:
#KNR Model
from sklearn.neighbors import KNeighborsRegressor

In [30]:
model_3 = KNeighborsRegressor(leaf_size=15)
model_3.fit(X_train, Y_train)

KNeighborsRegressor(leaf_size=15)

In [31]:
model_3.score(X_train, Y_train)

0.7949950550864587

In [32]:
model_3.score(X_test, Y_test)

0.5941652259104868

In [33]:
#Bagging Regressor Model
from sklearn.ensemble import BaggingRegressor

In [34]:
model_4 = BaggingRegressor(n_estimators=30)
model_4.fit(X_train, Y_train)

BaggingRegressor(n_estimators=30)

In [35]:
model_4.score(X_train, Y_train)

0.9807141354480216

In [36]:
model_4.score(X_test, Y_test)

0.8951674384074112

In [37]:
#AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor

In [38]:
model_5 = AdaBoostRegressor()
model_5.fit(X_train, Y_train)

AdaBoostRegressor()

In [39]:
model_5.score(X_train, Y_train)

0.8042361477972436

In [40]:
model_5.score(X_test, Y_test)

0.7180672952749587

In [41]:
#Gradient Boost Regressor
from sklearn.ensemble import GradientBoostingRegressor

In [42]:
model_6 = GradientBoostingRegressor()
model_6.fit(X_train, Y_train)

GradientBoostingRegressor()

In [43]:
model_6.score(X_train, Y_train)

0.9484678597953874

In [44]:
model_6.score(X_test, Y_test)

0.8879258508023735

In [45]:
#Random Forest Model
from sklearn.ensemble import RandomForestRegressor

In [46]:
model_7 = RandomForestRegressor()
model_7.fit(X_train, Y_train)

RandomForestRegressor()

In [47]:
model_7.score(X_train, Y_train)

0.9829080698060285

In [48]:
model_7.score(X_test, Y_test)

0.8975722980936706

In [49]:
#SVR Regressor
from sklearn.svm import SVR

In [50]:
model_8 = SVR()
model_8.fit(X_train, Y_train)

SVR()

In [51]:
model_8.score(X_train, Y_train)

0.21608277831105271

In [52]:
model_8.score(X_test, Y_test)

0.22883948950566801