In [29]:
import numpy as np
import pandas as pd

In [16]:
import joblib

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Read the hold-out diamonds set and separate the target from the predictors.
test_data = pd.read_csv('../data/diamonds_test_data.csv')

# `pop` removes the column from the frame and returns it in a single step,
# so `test_data` keeps only the predictor columns afterwards.
price = test_data.pop('price')

test_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.71,Very Good,I,VS1,59.0,57.0,5.85,5.88,3.46
1,1.5,Premium,F,SI1,62.9,59.0,7.34,7.29,4.6
2,1.27,Ideal,J,SI2,61.2,56.0,6.99,6.95,4.27
3,0.52,Very Good,E,VVS2,61.2,58.0,5.17,5.19,3.17
4,0.32,Ideal,I,SI1,62.4,57.0,4.37,4.35,2.72


STEP 1 : Changing categorical values into numerical

In [24]:
# Ordinal codes for the three categorical diamond attributes.
# They are defined once here rather than rebuilt inside the function, because
# the function is called once per row.
_CUT_CODES = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
# Colour codes run from 8 ('D') down to 2 ('J').
_COLOR_CODES = {'D': 8, 'E': 7, 'F': 6, 'G': 5, 'H': 4, 'I': 3, 'J': 2}
_CLARITY_CODES = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4,
                  'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

_ORDINAL_CODES = {
    'cut': _CUT_CODES,
    'color': _COLOR_CODES,
    'clarity': _CLARITY_CODES,
}


def transform_diamond_features(row):
    """
    Transform categorical features (cut, color, clarity) into numerical values for a single diamond row.

    A value that is not a known category (including a value that has already
    been encoded) is left unchanged, so applying the function twice gives the
    same result as applying it once.

    Parameters:
    row (pd.Series): A single row from the diamonds dataset; must contain the
        'cut', 'color' and 'clarity' entries.

    Returns:
    pd.Series: A copy of the row with the three features replaced by their
        numerical codes. The input row itself is not modified.
    """
    # Work on a copy so the caller's row stays untouched
    transformed_row = row.copy()

    for feature, codes in _ORDINAL_CODES.items():
        original_value = row[feature]
        # Fall back to the original value when it is not a known category
        transformed_row[feature] = codes.get(original_value, original_value)

    return transformed_row

In [25]:
# Encode every diamond in place, one row at a time.
for row_position in range(test_data.shape[0]) :
    encoded_row = transform_diamond_features(test_data.iloc[row_position])
    test_data.iloc[row_position] = encoded_row

test_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.71,3,3,5,59.0,57.0,5.85,5.88,3.46
1,1.5,4,6,3,62.9,59.0,7.34,7.29,4.6
2,1.27,5,2,2,61.2,56.0,6.99,6.95,4.27
3,0.52,3,7,6,61.2,58.0,5.17,5.19,3.17
4,0.32,5,3,3,62.4,57.0,4.37,4.35,2.72


STEP 2 : Determining price classes

In [38]:
from sklearn.preprocessing import StandardScaler

# Scaler object persisted alongside the cluster classifiers.
clusters_scaler = joblib.load('../models/clusters/clusters_scaler.joblib')

# Scaling is done on a copy so the encoded, unscaled frame stays available.
test_data_copy = test_data.copy()

def scale_for_clusters(input_row):
    """
    Scale one row with `clusters_scaler`.

    Parameters:
    input_row (pd.Series): A single row of predictors.

    Returns:
    np.ndarray: The scaled values as a 1D array, ready to be assigned back to a row.
    """
    # The scaler expects a 2D (samples x features) array, so present the row as one sample
    as_single_sample = input_row.values.reshape(1, -1)
    return clusters_scaler.transform(as_single_sample).flatten()

In [39]:
# Scale the whole frame with a single `transform` call instead of one call and
# one `.iloc` row assignment per diamond. This assumes the scaler treats every
# row as an independent sample (true for the StandardScaler imported above), so
# the values match the row-by-row result.
# Casting to float first also gives the result a plain float dtype in every
# column, including the encoded cut/color/clarity columns.
scaled_values = clusters_scaler.transform(test_data_copy.to_numpy(dtype=float))

test_data_copy = pd.DataFrame(
    scaled_values,
    index=test_data_copy.index,
    columns=test_data_copy.columns,
)

test_data_copy.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-0.186732,-0.810303,-1.409522,0.985015,-1.933303,-0.207915,0.10365,0.128008,-0.116785
1,1.473296,0.087356,0.352879,-0.810303,0.808672,0.688044,1.432308,1.385185,1.529446
2,0.989996,0.985015,-1.996989,-1.707961,-0.386548,-0.655894,1.120207,1.082036,1.052906
3,-0.585979,-0.810303,0.940346,1.882674,-0.386548,0.240065,-0.502718,-0.487207,-0.535563
4,-1.006239,0.985015,-1.409522,-0.810303,0.457136,-0.207915,-1.216091,-1.236163,-1.18539


In [40]:
from sklearn.ensemble import VotingClassifier

# Load the four cluster classifiers in voting order; the list feeds the soft
# voting function and each model is also kept under its own name.
models = [
    joblib.load(model_path)
    for model_path in (
        '../models/clusters/model_lgbm.joblib',
        '../models/clusters/model_logreg.joblib',
        '../models/clusters/model_rf.joblib',
        '../models/clusters/model_xgb.joblib',
    )
]

lgbm_class_model, logreg_class_model, rf_class_model, xgb_class_model = models

# Function to make a soft voting prediction
def soft_voting_classifier(models, input_row):
    """
    Computes a soft voting prediction from multiple models.

    Every model's class probabilities for the row are averaged with equal
    weight and the position of the highest average probability is returned.

    Args:
        models (list): A non-empty list of trained models exposing `predict_proba`.
        input_row (pd.Series, pd.DataFrame or np.ndarray): A single row of
            predictors. Any shape is accepted; it is reshaped to (1, n_features).

    Returns:
        int: Index of the winning column of the averaged `predict_proba` output.
            NOTE(review): this equals the class label only if the models' classes
            are 0..k-1 in that order - confirm against the trained classifiers.

    Raises:
        ValueError: If `models` is empty.
    """
    models = list(models)
    if len(models) == 0:
        raise ValueError('soft_voting_classifier needs at least one model')

    # np.asarray handles Series, DataFrame and plain arrays alike, so the
    # function really accepts everything the docstring promises.
    input_row_2d = np.asarray(input_row).reshape(1, -1)

    # Collect probabilities from all models
    probabilities = [model.predict_proba(input_row_2d) for model in models]

    # Average the probabilities across models
    avg_probabilities = np.mean(probabilities, axis = 0)

    # Position of the highest average probability for the single input row
    predicted_class = np.argmax(avg_probabilities, axis = 1)

    return predicted_class[0]

In [42]:
# Soft-voting result for every scaled row, in row order.
cluster = [
    soft_voting_classifier(models, test_data_copy.iloc[row_position])
    for row_position in range(len(test_data_copy))
]

In [43]:
# Attach the soft-voting results to the encoded (unscaled) frame; the scaled
# copy was only needed as classifier input.
test_data['cluster'] = cluster
test_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,cluster
0,0.71,3,3,5,59.0,57.0,5.85,5.88,3.46,1
1,1.5,4,6,3,62.9,59.0,7.34,7.29,4.6,3
2,1.27,5,2,2,61.2,56.0,6.99,6.95,4.27,3
3,0.52,3,7,6,61.2,58.0,5.17,5.19,3.17,1
4,0.32,5,3,3,62.4,57.0,4.37,4.35,2.72,0


STEP 3 : Predicting price

In [66]:
# One regressor and one scaler per cluster, loaded as (model, scaler) pairs.
c0_model, c0_scaler = (
    joblib.load('../models/cluster 0/cat_0_best_model.joblib'),
    joblib.load('../models/cluster 0/scaler_0.joblib'),
)

c1_model, c1_scaler = (
    joblib.load('../models/cluster 1/xgb_1_best_model.joblib'),
    joblib.load('../models/cluster 1/scaler_1.joblib'),
)

c2_model, c2_scaler = (
    joblib.load('../models/cluster 2/cat_2_best_model.joblib'),
    joblib.load('../models/cluster 2/scaler_2.joblib'),
)

c3_model, c3_scaler = (
    joblib.load('../models/cluster 3/lgbm_3_best_model.joblib'),
    joblib.load('../models/cluster 3/scaler_3.joblib'),
)

c4_model, c4_scaler = (
    joblib.load('../models/cluster 4/xgb_4_best_model.joblib'),
    joblib.load('../models/cluster 4/scaler_4.joblib'),
)

In [76]:
# Predictions are collected here, in the order the cluster functions are called.
preds = []

def _predict_and_record(x, scaler, model) -> None :
    """
    Scale one row, predict its price and append the prediction to `preds`.

    Shared implementation behind cluster0..cluster4, which differ only in the
    scaler/model pair they use.

    Parameters:
    x (pd.Series): A single row of predictors.
    scaler: Object with a `transform` method accepting a (1, n_features) array.
    model: Object with a `predict` method accepting the scaled array.
    """
    # The row is presented as a single sample (2D array with one row)
    x_scaled = scaler.transform(x.values.reshape(1, -1))
    # The raw `predict` output is stored as returned by the model
    preds.append(model.predict(x_scaled))

def cluster0(x : pd.Series) -> None :
    """Predict the price of one row with the cluster 0 scaler/model and append it to `preds`."""
    _predict_and_record(x, c0_scaler, c0_model)

def cluster1(x : pd.Series) -> None :
    """Predict the price of one row with the cluster 1 scaler/model and append it to `preds`."""
    _predict_and_record(x, c1_scaler, c1_model)

def cluster2(x : pd.Series) -> None :
    """Predict the price of one row with the cluster 2 scaler/model and append it to `preds`."""
    _predict_and_record(x, c2_scaler, c2_model)

def cluster3(x : pd.Series) -> None :
    """Predict the price of one row with the cluster 3 scaler/model and append it to `preds`."""
    _predict_and_record(x, c3_scaler, c3_model)

def cluster4(x : pd.Series) -> None :
    """Predict the price of one row with the cluster 4 scaler/model and append it to `preds`."""
    _predict_and_record(x, c4_scaler, c4_model)

In [77]:
# Start from an empty list so that re-running this cell does not append a
# second set of predictions to `preds` (the list object itself is kept because
# the cluster functions append to it).
preds.clear()

# Regression routine to call for each predicted cluster label.
cluster_predictors = {
    0 : cluster0,
    1 : cluster1,
    2 : cluster2,
    3 : cluster3,
    4 : cluster4,
}

for i in range(len(test_data)) :

    # Only the first 9 columns are passed on as predictors; `cluster` and any
    # column added after it are left out.
    features = test_data.iloc[i, :9]
    label = test_data['cluster'].iloc[i]

    # Fail loudly instead of silently skipping the row, which would leave
    # `preds` shorter than the frame.
    if label not in cluster_predictors :
        raise ValueError(f'Row {i} has unexpected cluster label {label!r}')

    cluster_predictors[label](features)

In [79]:
# Round the collected predictions to two decimal places before storing them.
rounded_preds = np.round(preds, 2)

# Put the true price back next to the prediction for comparison.
test_data['price'] = price
test_data['predicted price'] = rounded_preds
test_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,cluster,price,predicted price
0,0.71,3,3,5,59.0,57.0,5.85,5.88,3.46,1,2328,1986.54
1,1.5,4,6,3,62.9,59.0,7.34,7.29,4.6,3,11878,9881.59
2,1.27,5,2,2,61.2,56.0,6.99,6.95,4.27,3,5021,5631.45
3,0.52,3,7,6,61.2,58.0,5.17,5.19,3.17,1,2254,1974.99
4,0.32,5,3,3,62.4,57.0,4.37,4.35,2.72,0,554,755.99


In [82]:
from sklearn.metrics import root_mean_squared_error

# Overall RMSE, followed by the RMSE relative to the mean true price.
actual_price = test_data['price']
test_data_rmse = root_mean_squared_error(actual_price, test_data['predicted price'])
print(test_data_rmse)
print(test_data_rmse / actual_price.mean())

1226.3185572899401
0.30845689254768566


In [86]:
# RMSE and RMSE-to-mean ratio for rows assigned to cluster 0 (subset selected once).
cluster_0_rows = test_data.query("cluster == 0")
test_data_rmse_c0 = root_mean_squared_error(cluster_0_rows['price'], cluster_0_rows['predicted price'])
print(test_data_rmse_c0)
print(test_data_rmse_c0 / cluster_0_rows['price'].mean())

212.72001192038385
0.2856038357143427


In [88]:
# RMSE and RMSE-to-mean ratio for rows assigned to cluster 1 (subset selected once).
cluster_1_rows = test_data.query("cluster == 1")
test_data_rmse_c1 = root_mean_squared_error(cluster_1_rows['price'], cluster_1_rows['predicted price'])
print(test_data_rmse_c1)
print(test_data_rmse_c1 / cluster_1_rows['price'].mean())

599.1047886309484
0.35401482109184623


In [89]:
# RMSE and RMSE-to-mean ratio for rows assigned to cluster 2 (subset selected once).
cluster_2_rows = test_data.query("cluster == 2")
test_data_rmse_c2 = root_mean_squared_error(cluster_2_rows['price'], cluster_2_rows['predicted price'])
print(test_data_rmse_c2)
print(test_data_rmse_c2 / cluster_2_rows['price'].mean())

744.4991903412986
0.198876679518859


In [90]:
# RMSE and RMSE-to-mean ratio for rows assigned to cluster 3 (subset selected once).
cluster_3_rows = test_data.query("cluster == 3")
test_data_rmse_c3 = root_mean_squared_error(cluster_3_rows['price'], cluster_3_rows['predicted price'])
print(test_data_rmse_c3)
print(test_data_rmse_c3 / cluster_3_rows['price'].mean())

1969.4539303709962
0.26258275931404623


In [91]:
# RMSE and RMSE-to-mean ratio for rows assigned to cluster 4 (subset selected once).
cluster_4_rows = test_data.query("cluster == 4")
test_data_rmse_c4 = root_mean_squared_error(cluster_4_rows['price'], cluster_4_rows['predicted price'])
print(test_data_rmse_c4)
print(test_data_rmse_c4 / cluster_4_rows['price'].mean())

2543.433392670693
0.17682772060593557


In [103]:
# RMSE-to-mean-price ratio of every predicted cluster, computed from the data
# so the shares below stay correct if a model or the test set changes
# (the cluster ids were previously hard-coded from the printed ratios).
squared_error = (test_data['price'] - test_data['predicted price']) ** 2
cluster_rmse = np.sqrt(squared_error.groupby(test_data['cluster']).mean())
cluster_ratio = cluster_rmse / test_data.groupby('cluster')['price'].mean()

# For every row, the ratio of the cluster it was assigned to.
row_ratio = test_data['cluster'].map(cluster_ratio)

# Percentage of rows that fall into clusters below each ratio threshold.
a = np.round((row_ratio < 0.3).sum() / len(test_data) * 100, 2)
b = np.round((row_ratio < 0.2).sum() / len(test_data) * 100, 2)

print(f'Models based on clusters returned better RMSE to mean ratio in {a}% cases than in non-cluster approach (RMSE to mean ratio < 0.3)')
print(f'Models based on clusters returned significantly better RMSE to mean ratio in {b}% cases than in non-cluster approach (RMSE to mean ratio < 0.2)')

Models based on clusters returned better RMSE to mean ratio in 77.5% cases than in non-cluster approach (RMSE to mean ratio < 0.3)
Models based on clusters returned significantly better RMSE to mean ratio in 26.54% cases than in non-cluster approach (RMSE to mean ratio < 0.2)


# Summary :

1. The held-out test dataset performed worse than the test data evaluated right after training.
2. The test data used after training was distributed among the clusters from the beginning, whereas the held-out test dataset first had to be classified into clusters, so part of its entries were misclassified, which had an impact on the regression models' performance.
3. However, in the long run the models in some clusters performed better, so using and developing this approach makes sense.