In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import pickle
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import joblib
import copy

In [2]:
# Load the clustered dataset from the working directory; each row carries a 'cluster' label
# alongside the features and the compressive-strength target.
data=pd.read_csv('cluster_data.csv')

In [4]:
# Discard the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Rebinding the result instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# column is gone. The redundant axis=1 is dropped because columns= already names the axis.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Preview the first rows to sanity-check the columns that remain after the drop.
data.head()

Unnamed: 0,Cement,Blast Furnace Slag _component_2,Fly Ash _component_3,Water_component_4,Superplasticizer_component_5,Coarse Aggregate_component_6,Fine Aggregate_component_7,Age_day,Concrete_compressive _strength,cluster
0,6.293419,0.0,0.0,5.09375,1.252763,6.947937,6.517671,3.367296,79.99,0
1,6.293419,0.0,0.0,5.09375,1.252763,6.962243,6.517671,3.367296,61.89,0
2,5.809643,4.966335,0.0,5.433722,0.0,6.838405,6.388561,5.602119,40.27,1
3,5.809643,4.966335,0.0,5.433722,0.0,6.838405,6.388561,5.902633,41.05,1
4,5.296315,4.893352,0.0,5.26269,0.0,6.88694,6.7172,5.888878,44.3,1


In [6]:
# List the column labels; note that some contain embedded spaces and must be quoted exactly.
data.columns

Index(['Cement', 'Blast Furnace Slag _component_2', 'Fly Ash _component_3',
       'Water_component_4', 'Superplasticizer_component_5',
       'Coarse Aggregate_component_6', 'Fine Aggregate_component_7', 'Age_day',
       'Concrete_compressive _strength', 'cluster'],
      dtype='object')

In [7]:
# Target column for the whole dataset.
# NOTE(review): `y` is rebound per cluster inside the model-comparison loop before it is
# ever used, so this assignment looks redundant — confirm and remove if so.
y=data['Concrete_compressive _strength']

In [8]:
# Candidate regressors compared on every cluster, keyed by the label used in the printed report.
# Insertion order is the evaluation order. The three randomised models share seed 42.
models = dict(
    RandomForest=RandomForestRegressor(random_state=42),
    LinearRegression=LinearRegression(),
    XGBoost=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    decision_tree=DecisionTreeRegressor(random_state=42),
)

In [9]:
# Per-cluster bake-off: fit every candidate in `models` on a 75/25 split of the cluster's
# rows, report each hold-out R^2, and remember a snapshot of the best scorer per cluster.
best_models = {}

for cluster in data['cluster'].unique():
    cluster_data = data.loc[data['cluster'] == cluster]
    y = cluster_data['Concrete_compressive _strength']
    X = cluster_data.drop(columns=['Concrete_compressive _strength', 'cluster'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.25
    )

    print(f"\nCluster {cluster}:")
    best_model_name, best_model, best_r2_score = None, None, float("-inf")

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        r2 = r2_score(y_test, predictions)
        print(f"{model_name} R^2 score: {r2}")

        # Only a strictly better score replaces the incumbent (first model wins ties).
        if not r2 > best_r2_score:
            continue
        best_r2_score, best_model_name = r2, model_name
        # The shared instances in `models` are refit for the next cluster, so keep a
        # deep copy of the fitted state rather than a reference.
        best_model = copy.deepcopy(model)

    print(f"Best model for Cluster {cluster} is {best_model_name} with R^2 score: {best_r2_score}")
    best_models[cluster] = best_model



Cluster 0:
RandomForest R^2 score: 0.9240629259933576
LinearRegression R^2 score: 0.8596346480620781
XGBoost R^2 score: 0.94932017982938
decision_tree R^2 score: 0.8862245153883558
Best model for Cluster 0 is XGBoost with R^2 score: 0.94932017982938

Cluster 1:
RandomForest R^2 score: 0.9238834311715596
LinearRegression R^2 score: 0.8735043910892509
XGBoost R^2 score: 0.9480180479748053
decision_tree R^2 score: 0.8929592235156807
Best model for Cluster 1 is XGBoost with R^2 score: 0.9480180479748053

Cluster 3:
RandomForest R^2 score: 0.8982871602097449
LinearRegression R^2 score: 0.896560786489033
XGBoost R^2 score: 0.9401221814383163
decision_tree R^2 score: 0.8671342013214467
Best model for Cluster 3 is XGBoost with R^2 score: 0.9401221814383163

Cluster 2:
RandomForest R^2 score: 0.8305178215711204
LinearRegression R^2 score: 0.7639946503530364
XGBoost R^2 score: 0.884895937653696
decision_tree R^2 score: 0.6001820222668665
Best model for Cluster 2 is XGBoost with R^2 score: 0.884

In [21]:
# Search space for the XGBoost grid searches: 10 x 9 x 9 = 810 combinations.
# Value order is kept exactly as authored, since it determines candidate order.
# NOTE(review): the recorded fit logs in this notebook report 990 candidates, so they
# appear to come from a different revision of this grid — confirm before trusting them.
xgb_param_grid = dict(
    n_estimators=[20, 30, 50, 100, 200, 300, 400, 500, 150, 800],
    max_depth=[3, 4, 5, 6, 7, 8, 9, 2, 12],
    learning_rate=[0.01, 0.1, 0.2, 0.3, 0.4, 0.05, 0.6, 0.0001, 0.003],
)

In [11]:
# Search space for the random-forest grid search: 5 x 5 x 4 x 4 = 400 combinations.
# max_depth=None leaves tree depth unbounded.
rf_param_grid = dict(
    n_estimators=[50, 100, 200, 300, 400],
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 4, 5, 6],
    min_samples_leaf=[1, 2, 3, 4],
)

In [41]:
# Rebuild the train/test split for cluster 0 (75/25, seed 42) for the grid search below.
cluster_data = data.loc[data['cluster'] == 0]
y = cluster_data['Concrete_compressive _strength']
X = cluster_data.drop(columns=['Concrete_compressive _strength', 'cluster'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [45]:
# Exhaustive 8-fold cross-validated search over xgb_param_grid on the cluster 0
# training split, scored by R^2 and run on all available cores.
xgb_model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')
grid_search_xgb_0 = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='r2',
    cv=8,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb_0.fit(X_train, y_train)

Fitting 8 folds for each of 990 candidates, totalling 7920 fits


In [46]:
# Predict on the cluster 0 hold-out split with the fitted search object.
pred=grid_search_xgb_0.predict(X_test)

In [47]:
# Hold-out R^2 of the tuned cluster 0 model.
r2_score(y_true=y_test, y_pred=pred)

0.923167702431819

In [48]:
# Keep only the refit best estimator from the search, under the same name.
# getattr falls back to the object itself, so re-running this cell after the name already
# holds the estimator is a no-op instead of raising AttributeError.
grid_search_xgb_0 = getattr(grid_search_xgb_0, 'best_estimator_', grid_search_xgb_0)

In [49]:
# Display the selected estimator and its hyper-parameters.
grid_search_xgb_0

In [51]:
# Persist the cluster 0 estimator (unwrapped from the grid search in the preceding cell)
# to the working directory.
joblib.dump(grid_search_xgb_0, 'XGB_model_cluster_0.pkl')

['XGB_model_cluster_0.pkl']

In [65]:
# Disabled random-forest grid search. The code is wrapped in a string literal, so it never
# executes; only the string itself is evaluated.
# NOTE(review): the "Fitting 5 folds ..." output recorded for this cell presumably dates
# from before the code was disabled — confirm, then either restore or delete this cell.
'''rf_model = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)'''

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


In [66]:
#pred=grid_search_rf.predict(X_test)

In [67]:
#r2_score(y_test,pred)

0.834757088625376

In [53]:
# Rebuild the train/test split for cluster 1 (75/25, seed 42) for the grid search below.
cluster_data = data.loc[data['cluster'] == 1]
y = cluster_data['Concrete_compressive _strength']
X = cluster_data.drop(columns=['Concrete_compressive _strength', 'cluster'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [54]:
# Exhaustive 8-fold cross-validated search over xgb_param_grid on the cluster 1
# training split, scored by R^2 and run on all available cores.
xgb_model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')
grid_search_xgb_1 = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='r2',
    cv=8,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb_1.fit(X_train, y_train)

Fitting 8 folds for each of 990 candidates, totalling 7920 fits


In [55]:
# Predict on the cluster 1 hold-out split with the fitted search object.
pred=grid_search_xgb_1.predict(X_test)

In [56]:
# Hold-out R^2 of the tuned cluster 1 model.
r2_score(y_true=y_test, y_pred=pred)

0.9664025378194531

In [57]:
# Keep only the refit best estimator from the search, under the same name.
# getattr falls back to the object itself, so re-running this cell after the name already
# holds the estimator is a no-op instead of raising AttributeError.
grid_search_xgb_1 = getattr(grid_search_xgb_1, 'best_estimator_', grid_search_xgb_1)

In [58]:
# Display the selected estimator and its hyper-parameters.
grid_search_xgb_1

In [59]:
# Persist the cluster 1 estimator (unwrapped from the grid search in the preceding cell)
# to the working directory.
joblib.dump(grid_search_xgb_1, 'XGB_model_cluster_1.pkl')

['XGB_model_cluster_1.pkl']

In [112]:
# Rebuild the train/test split for cluster 2 (75/25, seed 42) for the grid search below.
cluster_data = data.loc[data['cluster'] == 2]
y = cluster_data['Concrete_compressive _strength']
X = cluster_data.drop(columns=['Concrete_compressive _strength', 'cluster'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [113]:
# Exhaustive 6-fold cross-validated search over xgb_param_grid on the cluster 2
# training split, scored by R^2 and run on all available cores.
xgb_model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')
grid_search_xgb_2 = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='r2',
    cv=6,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb_2.fit(X_train, y_train)

Fitting 6 folds for each of 990 candidates, totalling 5940 fits


In [114]:
# Predict on the cluster 2 hold-out split with the fitted search object.
pred=grid_search_xgb_2.predict(X_test)

In [115]:
# Hold-out R^2 of the tuned cluster 2 model.
r2_score(y_true=y_test, y_pred=pred)

0.8892699996415018

In [85]:
# Keep only the refit best estimator from the search, under the same name.
# getattr falls back to the object itself, so re-running this cell after the name already
# holds the estimator is a no-op instead of raising AttributeError.
# NOTE(review): this cell's recorded execution count (85) is lower than that of the
# cluster 2 search cell (113), so the saved model may stem from an earlier search run —
# re-run the cluster 2 cells in order before relying on the pickle.
grid_search_xgb_2 = getattr(grid_search_xgb_2, 'best_estimator_', grid_search_xgb_2)

In [86]:
# Display the selected estimator and its hyper-parameters.
grid_search_xgb_2

In [87]:
# Persist the cluster 2 estimator (unwrapped from the grid search in the preceding cell)
# to the working directory.
joblib.dump(grid_search_xgb_2, 'XGB_model_cluster_2.pkl')

['XGB_model_cluster_2.pkl']

In [92]:
# Rebuild the train/test split for cluster 3 (75/25, seed 42) for the grid search below.
cluster_data = data.loc[data['cluster'] == 3]
y = cluster_data['Concrete_compressive _strength']
X = cluster_data.drop(columns=['Concrete_compressive _strength', 'cluster'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [93]:
# Exhaustive 6-fold cross-validated search over xgb_param_grid on the cluster 3
# training split, scored by R^2 and run on all available cores.
xgb_model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')
grid_search_xgb_3 = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='r2',
    cv=6,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb_3.fit(X_train, y_train)

Fitting 6 folds for each of 990 candidates, totalling 5940 fits


In [96]:
# Predict on the cluster 3 hold-out split with the fitted search object.
pred=grid_search_xgb_3.predict(X_test)

In [97]:
# Hold-out R^2 of the tuned cluster 3 model.
r2_score(y_true=y_test, y_pred=pred)

0.9430886119384522

In [98]:
# Keep only the refit best estimator from the search, under the same name.
# getattr falls back to the object itself, so re-running this cell after the name already
# holds the estimator is a no-op instead of raising AttributeError.
grid_search_xgb_3 = getattr(grid_search_xgb_3, 'best_estimator_', grid_search_xgb_3)

In [99]:
# Display the selected estimator and its hyper-parameters.
grid_search_xgb_3

In [100]:
# Persist the cluster 3 estimator (unwrapped from the grid search in the preceding cell)
# to the working directory.
joblib.dump(grid_search_xgb_3, 'XGB_model_cluster_3.pkl')

['XGB_model_cluster_3.pkl']