In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../data/diamonds_raw.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.shape

(50000, 10)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    50000 non-null  float64
 1   cut      50000 non-null  object 
 2   color    50000 non-null  object 
 3   clarity  50000 non-null  object 
 4   depth    50000 non-null  float64
 5   table    50000 non-null  float64
 6   price    50000 non-null  int64  
 7   x        50000 non-null  float64
 8   y        50000 non-null  float64
 9   z        50000 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.8+ MB


In [5]:
categoricals = ['cut', 'color', 'clarity']
categoricals

['cut', 'color', 'clarity']

In [6]:
for category in categoricals :

    print(df[category].value_counts(), '\n')

cut
Ideal        19938
Premium      12806
Very Good    11204
Good          4557
Fair          1495
Name: count, dtype: int64 

color
G    10452
E     9085
F     8864
H     7711
D     6224
I     5058
J     2606
Name: count, dtype: int64 

clarity
SI1     12115
VS2     11404
SI2      8519
VS1      7579
VVS2     4694
VVS1     3369
IF       1632
I1        688
Name: count, dtype: int64 



In [7]:
numericals = df.drop(columns = categoricals).columns.tolist()
numericals

['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

In [8]:
pd.DataFrame(data = df, columns = numericals).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
carat,50000.0,0.799444,0.475173,0.2,0.4,0.7,1.04,5.01
depth,50000.0,61.753006,1.431088,43.0,61.0,61.8,62.5,79.0
table,50000.0,57.45783,2.232092,43.0,56.0,57.0,59.0,95.0
price,50000.0,3944.80544,3997.938105,326.0,951.0,2410.0,5351.0,18823.0
x,50000.0,5.734403,1.123077,0.0,4.71,5.7,6.54,10.74
y,50000.0,5.737956,1.145579,0.0,4.72,5.71,6.54,58.9
z,50000.0,3.541056,0.707065,0.0,2.91,3.53,4.04,31.8


In [9]:
import json

with open('../data/metadata.json', 'r') as file:
    metadata = json.load(file)

print("Variables' description: \n")
print(json.dumps(metadata, indent = 4))

Variables' description: 

{
    "metadata": {
        "carat": "diamond weight in carat",
        "cut": "diamond cutting qualityuring",
        "color": "diamond color from J (worst) to D (best)",
        "clarity": "measure of diamond clarity (from left to right is worst to best: I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF)",
        "depth": "percentage depth that is equal to z / mean(x,y)",
        "table": "width of the widest point at the top of the diamond",
        "price": "diamond price",
        "x": "diamond length in mm",
        "y": "diamond width in mm",
        "z": "diamond depth in mm"
    }
}


Categorical features (cut, color, clarity) are ordinal, so can be mapped with numerical features instead converting into dummy variables.

In [10]:
#Mapping cut classes with numerics

cut_class = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal' ]
cut_numeric = list(range(1, 6))

color_class = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
color_numeric = list(range(8, 1, -1))

clarity_class = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
clarity_numeric = list(range(1, 9))

In [11]:
maps = ['cut_map', 'color_map', 'clarity_map']
classes = [cut_class, color_class, clarity_class]
numerics = [cut_numeric, color_numeric, clarity_numeric]

for map_name, clas, numeric in zip(maps, classes, numerics):
    globals()[map_name] = dict(zip(clas, numeric))

print("cut_map:", cut_map)
print("color_map:", color_map)
print("clarity_map:", clarity_map)

cut_map: {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
color_map: {'D': 8, 'E': 7, 'F': 6, 'G': 5, 'H': 4, 'I': 3, 'J': 2}
clarity_map: {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}


In [12]:
mappings = {
    'cut_map': {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5},
    'color_map': {'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1},
    'clarity_map': {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}
}

with open('../data/mappings.json', 'w') as f:
    json.dump(mappings, f, indent = 4)

In [13]:
new_df = df.copy()

new_df['cut'] = new_df['cut'].map(cut_map)
new_df['color'] = new_df['color'].map(color_map)
new_df['clarity'] = new_df['cut'].replace(clarity_map) #why .map() is generating NaN here??

new_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,5,7,5,61.5,55.0,326,3.95,3.98,2.43
1,0.21,4,7,4,59.8,61.0,326,3.89,3.84,2.31
2,0.23,2,7,2,56.9,65.0,327,4.05,4.07,2.31
3,0.29,4,3,4,62.4,58.0,334,4.2,4.23,2.63
4,0.31,2,2,2,63.3,58.0,335,4.34,4.35,2.75


In [14]:
new_df.corr()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
carat,1.0,-0.135135,-0.29153,-0.135135,0.027734,0.183639,0.921804,0.975037,0.950035,0.9527
cut,-0.135135,1.0,0.019548,1.0,-0.223898,-0.432154,-0.053537,-0.125738,-0.121335,-0.14983
color,-0.29153,0.019548,1.0,0.019548,-0.047426,-0.027513,-0.172629,-0.270529,-0.263395,-0.268388
clarity,-0.135135,1.0,0.019548,1.0,-0.223898,-0.432154,-0.053537,-0.125738,-0.121335,-0.14983
depth,0.027734,-0.223898,-0.047426,-0.223898,1.0,-0.293012,-0.012731,-0.025563,-0.029809,0.094337
table,0.183639,-0.432154,-0.027513,-0.432154,-0.293012,1.0,0.129848,0.197198,0.185248,0.153161
price,0.921804,-0.053537,-0.172629,-0.053537,-0.012731,0.129848,1.0,0.884919,0.864393,0.860963
x,0.975037,-0.125738,-0.270529,-0.125738,-0.025563,0.197198,0.884919,1.0,0.972977,0.970122
y,0.950035,-0.121335,-0.263395,-0.121335,-0.029809,0.185248,0.864393,0.972977,1.0,0.95003
z,0.9527,-0.14983,-0.268388,-0.14983,0.094337,0.153161,0.860963,0.970122,0.95003,1.0


In [16]:
#new_df.to_csv('../data/diamonds_preprocessed.csv', index = False)

In [15]:
df = new_df.sample(frac = 1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
28377,0.31,5,8,5,62.0,55.0,734,4.36,4.38,2.71
36301,0.35,5,7,5,61.2,56.0,1063,4.55,4.53,2.78
4604,1.06,4,2,4,61.3,60.0,3740,6.59,6.52,4.02
3753,0.31,2,8,2,63.4,58.0,571,4.32,4.35,2.75
6515,0.85,5,5,5,61.6,54.0,4164,6.08,6.12,3.76


In [16]:
y = df['price']
X = df.drop(columns = ['price'], axis = 1)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [33]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

index = [
    'Linear',
    'Elastic Net',
    'Random Forest',
    'XGBoost',
    'LightGBM',
    'CatBoost',
    'SVM'
]

rmse = []
mae = []

In [34]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

lr_y_pred = lr.predict(X_test)

lr_rmse = root_mean_squared_error(y_test, lr_y_pred)
lr_mae = mean_absolute_error(y_test, lr_y_pred)

rmse.append(lr_rmse)
mae.append(lr_mae)

print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression MAE:", lr_mae)

Linear Regression RMSE: 1416.278595163559
Linear Regression MAE: 876.2264104228138


In [35]:
from sklearn.linear_model import ElasticNet

en = ElasticNet(alpha = 0.05, l1_ratio = 0.95, random_state = 42)  # Adjust alpha and l1_ratio as needed
en.fit(X_train, y_train)

en_y_pred = en.predict(X_test)

en_rmse = root_mean_squared_error(y_test, en_y_pred)
en_mae = mean_absolute_error(y_test, en_y_pred)

rmse.append(en_rmse)
mae.append(en_mae)

print("ElasticNet Regression RMSE:", en_rmse)
print("ElasticNet Regression MAE:", en_mae)

ElasticNet Regression RMSE: 1414.893764360218
ElasticNet Regression MAE: 890.2595264478367


In [36]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

rf_y_pred = rf.predict(X_test)

rf_rmse = root_mean_squared_error(y_test, rf_y_pred)
rf_mae = mean_absolute_error(y_test, rf_y_pred)

rmse.append(rf_rmse)
mae.append(rf_mae)

print("Random Forest Regressor RMSE:", rf_rmse)
print("Random Forest Regressor MAE:", rf_mae)

Random Forest Regressor RMSE: 1176.5525703974206
Random Forest Regressor MAE: 640.900701468254


In [37]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.1, random_state = 42)
xgb.fit(X_train, y_train)

xgb_y_pred = xgb.predict(X_test)

xgb_rmse = root_mean_squared_error(y_test, xgb_y_pred)
xgb_mae = mean_absolute_error(y_test, xgb_y_pred)

rmse.append(xgb_rmse)
mae.append(xgb_mae)

print("XGB Regressor RMSE:", xgb_rmse)
print("XGB Regressor MAE:", xgb_mae)

XGB Regressor RMSE: 1156.8857044649399
XGB Regressor MAE: 634.9913528564454


In [38]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(n_estimators = 100, learning_rate = 0.1, random_state = 42, verbose = -1)
lgbm.fit(X_train, y_train)

lgbm_y_pred = lgbm.predict(X_test)

lgbm_rmse = root_mean_squared_error(y_test, lgbm_y_pred)
lgbm_mae = mean_absolute_error(y_test, lgbm_y_pred)

rmse.append(lgbm_rmse)
mae.append(lgbm_mae)

print("LGBM Regressor RMSE:", lgbm_rmse)
print("LGBM Regressor MAE:", lgbm_mae)

LGBM Regressor RMSE: 1160.9280039188463
LGBM Regressor MAE: 637.7579081363968


In [39]:
from catboost import CatBoostRegressor

catboost = CatBoostRegressor(n_estimators = 100, learning_rate = 0.1, random_state = 42, verbose = 0, allow_writing_files = False)
catboost.fit(X_train, y_train)

cat_y_pred = catboost.predict(X_test)

cat_rmse = root_mean_squared_error(y_test, cat_y_pred)
cat_mae = mean_absolute_error(y_test, cat_y_pred)

rmse.append(cat_rmse)
mae.append(cat_mae)

print("Catboost Regressor RMSE:", cat_rmse)
print("Catboost Regressor MAE:", cat_mae)

Catboost Regressor RMSE: 1173.1500958464221
Catboost Regressor MAE: 650.0360734607181


In [40]:
from sklearn.svm import SVR

svr = SVR(kernel = 'rbf', C = 1.0, epsilon = 0.1)  # Adjust C and epsilon as needed
svr.fit(X_train, y_train)

svm_y_pred = svr.predict(X_test)

svm_rmse = root_mean_squared_error(y_test, svm_y_pred)
svm_mae = mean_absolute_error(y_test, svm_y_pred)

rmse.append(svm_rmse)
mae.append(svm_mae)

print("SVM Regressor RMSE:", svm_rmse)
print("SVM Regressor MAE:", svm_mae)

SVM Regressor RMSE: 2832.140879482138
SVM Regressor MAE: 1401.9094673034397


In [41]:
metrics = pd.DataFrame({
    'METHOD': index,
    'RMSE': np.round(rmse, 2),
    'MAE': np.round(mae, 2)
})

metrics.set_index('METHOD', inplace = True)
metrics['RMSA to mean'] = metrics['RMSE'] / y_test.mean()

metrics

Unnamed: 0_level_0,RMSE,MAE,RMSA to mean
METHOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Linear,1416.28,876.23,0.358125
Elastic Net,1414.89,890.26,0.357774
Random Forest,1176.55,640.9,0.297506
XGBoost,1156.89,634.99,0.292535
LightGBM,1160.93,637.76,0.293557
CatBoost,1173.15,650.04,0.296647
SVM,2832.14,1401.91,0.716144


In [42]:
df['price'].mean()

3944.80544

INSIGHTS :
- significant difference between RMSE and MAE (implies some higher residuals)
- RMSE to mean ratio suggest models' performance far from optimal
- it is worth to try clustering the data first and next develop models dedicated to particular cluster