In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diamond-price-prediciton-2024/train.csv
/kaggle/input/diamond-price-prediciton-2024/test.csv


In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [3]:
# reading data
df=pd.read_csv("/kaggle/input/diamond-price-prediciton-2024/train.csv")
df.head()

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,1.06,Ideal,I,SI2,61.8,57.0,4270,6.57,6.6,4.07
1,2,1.51,Premium,G,VVS2,60.9,58.0,15164,7.38,7.42,4.51
2,3,0.32,Ideal,F,VS2,61.3,56.0,828,4.43,4.41,2.71
3,4,0.53,Ideal,G,VS2,61.2,56.0,1577,5.19,5.22,3.19
4,5,0.7,Premium,H,VVS2,61.0,57.0,2596,5.76,5.72,3.5


In [4]:
# checking the data content
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43152 entries, 0 to 43151
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Id       43152 non-null  int64  
 1   carat    43152 non-null  float64
 2   cut      43152 non-null  object 
 3   color    43152 non-null  object 
 4   clarity  43152 non-null  object 
 5   depth    43152 non-null  float64
 6   table    43152 non-null  float64
 7   price    43152 non-null  int64  
 8   x        43152 non-null  float64
 9   y        43152 non-null  float64
 10  z        43152 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 3.6+ MB


In [5]:
# checking the description of the data
df.describe()

Unnamed: 0,Id,carat,depth,table,price,x,y,z
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,21576.5,0.797855,61.747177,57.458347,3929.491912,5.731568,5.735018,3.538568
std,12457.053745,0.473594,1.435454,2.233904,3985.527795,1.121279,1.148809,0.708238
min,1.0,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,10788.75,0.4,61.0,56.0,947.75,4.71,4.72,2.91
50%,21576.5,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,32364.25,1.04,62.5,59.0,5312.0,6.54,6.54,4.04
max,43152.0,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [6]:
df.isna().sum()

Id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [7]:
# time to see the categorical cols (cut,clarity,color) and based on 
# the description of the data it seems that they are all ordinal 
# so we gotta order them
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['J', 'I', 'H', 'G', 'F', 'E', 'D'] 
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'] 

# will also change the type from obj to categories to take less space and enhance readability
df['cut'] = df['cut'].astype(pd.CategoricalDtype(categories=cut_categories, ordered=True))
df['color'] = df['color'].astype(pd.CategoricalDtype(categories=color_categories, ordered=True))
df['clarity'] = df['clarity'].astype(pd.CategoricalDtype(categories=clarity_categories, ordered=True))

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43152 entries, 0 to 43151
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   Id       43152 non-null  int64   
 1   carat    43152 non-null  float64 
 2   cut      43152 non-null  category
 3   color    43152 non-null  category
 4   clarity  43152 non-null  category
 5   depth    43152 non-null  float64 
 6   table    43152 non-null  float64 
 7   price    43152 non-null  int64   
 8   x        43152 non-null  float64 
 9   y        43152 non-null  float64 
 10  z        43152 non-null  float64 
dtypes: category(3), float64(6), int64(2)
memory usage: 2.8 MB


In [9]:
# we need to handle x,y,z 0 values 
print(df[df['x']==0].value_counts().sum())
print(df[df['y']==0].value_counts().sum())
print(df[df['z']==0].value_counts().sum())

5
4
17


In [10]:
df.drop(df[df['x']==0].index, inplace =True)
df.drop(df[df['y']==0].index, inplace =True)
df.drop(df[df['z']==0].index, inplace =True)

In [11]:
columns_of_interest = ['depth', 'table','x','y','z']
Q1 = df[columns_of_interest].quantile(0.25)
Q3 = df[columns_of_interest].quantile(0.75)
IQR = Q3 - Q1

In [12]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Find outliers using boolean indexing
outliers_mask = ((df[columns_of_interest] < lower_bound) | (df[columns_of_interest] > upper_bound)).any(axis=1)

# Remove outliers from the dataframe
df_final = df[~outliers_mask]
print("Number of outliers removed:", outliers_mask.sum())

Number of outliers removed: 2373


In [13]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40762 entries, 0 to 43151
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   Id       40762 non-null  int64   
 1   carat    40762 non-null  float64 
 2   cut      40762 non-null  category
 3   color    40762 non-null  category
 4   clarity  40762 non-null  category
 5   depth    40762 non-null  float64 
 6   table    40762 non-null  float64 
 7   price    40762 non-null  int64   
 8   x        40762 non-null  float64 
 9   y        40762 non-null  float64 
 10  z        40762 non-null  float64 
dtypes: category(3), float64(6), int64(2)
memory usage: 2.9 MB


In [14]:
df_modified = df_final.drop(['x','y','z','Id'],axis=1)
df_modified.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.06,Ideal,I,SI2,61.8,57.0,4270
1,1.51,Premium,G,VVS2,60.9,58.0,15164
2,0.32,Ideal,F,VS2,61.3,56.0,828
3,0.53,Ideal,G,VS2,61.2,56.0,1577
4,0.7,Premium,H,VVS2,61.0,57.0,2596


In [15]:
X = df_modified.drop('price', axis=1)
y = df_modified['price']

In [16]:
numeric_features = ['carat', 'depth', 'table']
categorical_features = ['cut', 'color', 'clarity']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor())
])

In [18]:
xgb_cv_scores = cross_val_score(xgb_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"XGBoost CV RMSE: {np.mean(np.sqrt(-xgb_cv_scores))}, Std: {np.std(xgb_cv_scores)}")

XGBoost CV RMSE: 537.9639897555738, Std: 12237.659746621066


In [19]:
dt_cv_scores = cross_val_score(dt_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Decision Tree CV RMSE: {np.mean(np.sqrt(-dt_cv_scores))}, Std: {np.std(dt_cv_scores)}")

Decision Tree CV RMSE: 712.3957610738782, Std: 13303.139536095103


In [20]:
rf_cv_scores = cross_val_score(rf_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Random Forest CV RMSE: {np.mean(np.sqrt(-rf_cv_scores))}, Std: {np.std(rf_cv_scores)}")

Random Forest CV RMSE: 544.1209318622035, Std: 10288.509584624315


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
param_grid_xgb = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 5, 7],
    'regressor__learning_rate': [0.01, 0.1, 0.2]
}

param_grid_rf = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

param_grid_dt = {
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

In [23]:
grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
grid_search_xgb.fit(X_train, y_train)
best_xgb_model = grid_search_xgb.best_estimator_

In [24]:
grid_search_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_

In [25]:
grid_search_dt = GridSearchCV(dt_pipeline, param_grid_dt, cv=5, scoring='neg_mean_squared_error')
grid_search_dt.fit(X_train, y_train)
best_dt_model = grid_search_dt.best_estimator_

In [26]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mae, mse, rmse, r2

xgb_metrics = evaluate_model(best_xgb_model, X_test, y_test)
rf_metrics = evaluate_model(best_rf_model, X_test, y_test)
dt_metrics = evaluate_model(best_dt_model, X_test, y_test)

In [27]:
print(f"XGBoost - MAE: {xgb_metrics[0]}, MSE: {xgb_metrics[1]}, RMSE: {xgb_metrics[2]}, R2: {xgb_metrics[3]}")
print(f"Random Forest - MAE: {rf_metrics[0]}, MSE: {rf_metrics[1]}, RMSE: {rf_metrics[2]}, R2: {rf_metrics[3]}")
print(f"Decision Tree - MAE: {dt_metrics[0]}, MSE: {dt_metrics[1]}, RMSE: {dt_metrics[2]}, R2: {dt_metrics[3]}")

XGBoost - MAE: 276.56378499478717, MSE: 288324.53453261283, RMSE: 536.9585966651553, R2: 0.9824124379813156
Random Forest - MAE: 283.4763775244536, MSE: 304248.07649347506, RMSE: 551.5868712120287, R2: 0.9814411148774813
Decision Tree - MAE: 329.5922013745143, MSE: 420033.9791566908, RMSE: 648.1002847991126, R2: 0.974378269021232


In [28]:
submission = pd.read_csv('/kaggle/input/diamond-price-prediciton-2024/test.csv')
submission.head()

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,x,y,z
0,1,0.34,Ideal,G,VVS2,61.1,57.0,4.52,4.48,2.75
1,2,0.71,Premium,E,VS2,62.7,58.0,5.74,5.68,3.58
2,3,0.44,Very Good,I,VS1,62.8,56.0,4.83,4.88,3.05
3,4,0.81,Premium,E,SI2,60.1,59.0,6.09,6.03,3.65
4,5,0.4,Ideal,G,VVS1,61.2,56.0,4.74,4.8,2.92


In [29]:
submission['cut'] = submission['cut'].astype(pd.CategoricalDtype(categories=cut_categories, ordered=True))
submission['color'] = submission['color'].astype(pd.CategoricalDtype(categories=color_categories, ordered=True))
submission['clarity'] = submission['clarity'].astype(pd.CategoricalDtype(categories=clarity_categories, ordered=True))

In [30]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10788 entries, 0 to 10787
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   Id       10788 non-null  int64   
 1   carat    10788 non-null  float64 
 2   cut      10788 non-null  category
 3   color    10788 non-null  category
 4   clarity  10788 non-null  category
 5   depth    10788 non-null  float64 
 6   table    10788 non-null  float64 
 7   x        10788 non-null  float64 
 8   y        10788 non-null  float64 
 9   z        10788 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 622.6 KB


In [31]:
submission_ready = submission.drop(columns=['Id','x','y','z'])

In [32]:
submission_preprocessed = best_xgb_model.named_steps['preprocessor'].transform(submission_ready)

In [33]:
predictions = best_xgb_model.named_steps['regressor'].predict(submission_preprocessed)

In [34]:
submission_results = pd.DataFrame({'id': submission['Id'], 'price': predictions})

In [35]:
submission_results

Unnamed: 0,id,price
0,1,790.679199
1,2,2899.890137
2,3,810.542480
3,4,2818.886230
4,5,1129.536255
...,...,...
10783,10784,1719.500488
10784,10785,6614.743652
10785,10786,3961.333008
10786,10787,4670.072754


In [36]:
submission_results.to_csv('submission.csv', index=False)