## **cement strenght prediction model** ##


In [1]:
# import manipulation lybrary
import pandas as pd
import numpy as np

# import visualization lybrary
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning lybrary
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.svm import SVR


In [3]:
# data ingestion
def data_ingestion():

    data = pd.read_csv(r'C:\Cement_Strenght_Prediction_Model\data\raw\Concrete_Compressive_Strength.csv')
    return data


In [4]:
# data exploration

from collections import OrderedDict

def data_exploration(data):

    # segregate categorical and numerical columns 
    numerical_col = data.select_dtypes(exclude='object').columns
    categorical_col = data.select_dtypes(include='object').columns

    # numerical stats
    numerical_stats = []

    Q1 = data[numerical_col].quantile(0.25)
    Q3 = data[numerical_col].quantile(0.75)
    IQR = Q3 - Q1
    LW = Q1 - 1.5*IQR
    UW = Q3 + 1.5*IQR
    Outlier_Count = (data[numerical_col] < LW) | (data[numerical_col] > UW)
    Outlier_Percentage = (Outlier_Count.sum()/len(data))*100

    for i in numerical_col:
        num_stats = OrderedDict({
            "Featues":i,
            "count":data[i].count(),
            "Maximum": data[i].max(),
            "Minimum": data[i].min(),
            "Mean":data[i].mean(),
        "Median":data[i].median(),
        "Q1":Q1,
        "Q3":Q3,
        "IQR":IQR,
        "Lower_Whisker":LW,
        "Upper_Whisker":UW,
        "Outlier_Count": Outlier_Count.sum(),
        "Outlier_Percentage":Outlier_Percentage,
        "Skewness":data[i].skew(),
        "Kurtosis":data[i].kurtosis(),
        "Standard Deviation":data[i].std()

        })


        numerical_stats.append(num_stats)
        numerical_stats_report = pd.DataFrame(numerical_stats)
    return numerical_stats_report

numerical_stats_report = data_exploration(data)
numerical_stats_report
    





Unnamed: 0,Featues,count,Maximum,Minimum,Mean,Median,Q1,Q3,IQR,Lower_Whisker,Upper_Whisker,Outlier_Count,Outlier_Percentage,Skewness,Kurtosis,Standard Deviation
0,cement,1030,540.0,102.0,281.167864,272.9,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.509481,-0.520652,104.506364
1,slag,1030,359.4,0.0,73.895825,22.0,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.800717,-0.508175,86.279342
2,ash,1030,200.1,0.0,54.18835,0.0,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.537354,-1.328746,63.997004
3,water,1030,247.0,121.8,181.567282,185.0,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.074628,0.122082,21.354219
4,superplastic,1030,32.2,0.0,6.20466,6.4,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.907203,1.411269,5.973841
5,coarseagg,1030,1145.0,801.0,972.918932,968.0,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,-0.04022,-0.599016,77.753954
6,fineagg,1030,992.6,594.0,773.580485,779.5,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,-0.25301,-0.102177,80.17598
7,age,1030,365.0,1.0,45.662136,28.0,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,3.269177,12.168989,63.169912
8,strength,1030,82.6,2.33,35.817961,34.445,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.416977,-0.313725,16.705742
9,water_cement_ratio,1030,1.882334,0.266892,0.748263,0.675346,cement 192.375000 slag ...,cement 350.000000 slag ...,cement 157.625000 slag ...,cement -44.062500 slag ...,cement 586.437500 slag...,cement 0 slag ...,cement 0.000000 slag ...,0.958065,0.734109,0.314003


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   cement                    1030 non-null   float64
 1   slag                      1030 non-null   float64
 2   ash                       1030 non-null   float64
 3   water                     1030 non-null   float64
 4   superplastic              1030 non-null   float64
 5   coarseagg                 1030 non-null   float64
 6   fineagg                   1030 non-null   float64
 7   age                       1030 non-null   int64  
 8   strength                  1030 non-null   float64
 9   water_cement_ratio        1030 non-null   float64
 10  total_binder              1030 non-null   float64
 11  aggregate_to_cement       1030 non-null   float64
 12  cement_water_interaction  1030 non-null   float64
 13  age_strength_proxy        1030 non-null   float64
dtypes: float

In [6]:
data['age'] = data['age'].astype('float64')

In [11]:
def data_preprocessing(data):
    X = data.drop(columns='strength')
    y = data['strength']

    # use train and test 
    X_train, X_test , y_train, y_test = train_test_split(X,y,
                                                         random_state=1,
                                                         test_size=0.3)
    
    # use scalling technique
    rs = RobustScaler().fit(X_train,X_test)
    X_train = rs.fit_transform(X_train)
    X_test = rs.transform(X_test)

    return X_train, X_test,y_train,y_test

X_train, X_test,y_train,y_test = data_preprocessing(data)
X_train.shape




(721, 13)

In [12]:
def model_building(data):
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random Forest": RandomForestRegressor(),
        "Gradient Boosting": GradientBoostingRegressor(),
        "Bagging Regressor": BaggingRegressor(),
        "AdaBoost Regressor": AdaBoostRegressor(),
        "Support Vector Regressor": SVR(),
        "K-Neighbors Regressor": KNeighborsRegressor()
    }
    return models 

models = model_building(data)


In [22]:
from sklearn.metrics import r2_score as r2_scorer  

def model_evaluation(models, X_train, X_test, y_train, y_test):
    r2_scores_dict = {}  
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        r2_scores_dict[model_name] = r2_scorer(y_test, y_pred)
        
        print(f"Model: {model_name}")  
        print(f"R2 score: {r2_scores_dict[model_name]}")
        print("-" * 30)
    
    return r2_scores_dict

r2_scores = model_evaluation(models, X_train, X_test, y_train, y_test)
print(r2_scores)
  


Model: Linear Regression
R2 score: 0.8173690536956675
------------------------------
Model: Decision Tree
R2 score: 0.862975539780072
------------------------------
Model: Random Forest
R2 score: 0.9103531360452232
------------------------------
Model: Gradient Boosting
R2 score: 0.9094160777120259
------------------------------
Model: Bagging Regressor
R2 score: 0.8951466972779261
------------------------------
Model: AdaBoost Regressor
R2 score: 0.8129169556445169
------------------------------
Model: Support Vector Regressor
R2 score: 0.7671012945149633
------------------------------
Model: K-Neighbors Regressor
R2 score: 0.8077377472879232
------------------------------
{'Linear Regression': 0.8173690536956675, 'Decision Tree': 0.862975539780072, 'Random Forest': 0.9103531360452232, 'Gradient Boosting': 0.9094160777120259, 'Bagging Regressor': 0.8951466972779261, 'AdaBoost Regressor': 0.8129169556445169, 'Support Vector Regressor': 0.7671012945149633, 'K-Neighbors Regressor': 0.807