# Import necessary libraries

In [9]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [8]:
import requests
from io import StringIO
def read_gd(sharingurl):
    file_id = sharingurl.split('/')[-2]
    download_url='https://drive.google.com/uc?export=download&id=' + file_id
    url = requests.get(download_url).text
    csv_raw = StringIO(url)
    return csv_raw 

url = "https://drive.google.com/file/d/1rAYtC4NcMVKzQdevoyeTr3T0pNKicmlP/view?usp=sharing"
gdd = read_gd(url)

df = pd.read_csv(gdd)


df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## Step 2 - Exploratory Data Analysis

In [10]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [13]:
d = df.dtypes
d

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [14]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [20]:
sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x1a9acf974d0>

In [15]:
#Univariate analysis - categorical data

fig, axs = plt.subplots(1, 3, figsize=(12,3), constrained_layout=True)
fig.suptitle('Univariate plotting - categorical features')

axs[0].set_title('Cut')
sns.countplot(data=df, x='cut', ax=axs[0])

axs[1].set_title('Clarity')
sns.countplot(data=df, x='clarity', ax=axs[1])

axs[2].set_title('Color')
sns.countplot(data=df, x='color', ax=axs[2])

<Axes: title={'center': 'Color'}, xlabel='color', ylabel='count'>

In [16]:
#Cat vs Num analysis - Bivariate

fig, axs = plt.subplots(1,3, figsize=(10,3), constrained_layout=True)
fig.suptitle('Num vs Cat feature visualization')

axs[0].set_title('Cut vs Price')
sns.boxplot(data=df, x='price', y='cut', ax=axs[0])

axs[1].set_title('Clarity vs Price')
sns.boxplot(data=df, x='price', y='clarity', ax=axs[1])


axs[2].set_title('Color vs Price')
sns.boxplot(data=df, x='price', y='color', ax=axs[2])

<Axes: title={'center': 'Color vs Price'}, xlabel='price', ylabel='color'>

In [17]:
fig, axs =plt.subplots(2,3, figsize=(12,7), constrained_layout=True)
fig.suptitle('Univariate plot for numerical features')

axs[0,0].set_title('Carat')
sns.histplot(data=df, x='carat', ax=axs[0,0])

axs[0,1].set_title('Depth')
sns.histplot(data=df, x='depth', ax=axs[0,1])

axs[0,2].set_title('Table')
sns.histplot(data=df, x='table', ax=axs[0,2])


axs[1,0].set_title('X')
sns.histplot(data=df, x='x', ax=axs[1,0])

axs[1,1].set_title('Y')
sns.histplot(data=df, x='y', ax=axs[1,1])

axs[1,2].set_title('Z')
sns.histplot(data=df, x='z', ax=axs[1,2])


<Axes: title={'center': 'Z'}, xlabel='z', ylabel='Count'>

In [18]:
#Num vs Num 1

sns.pairplot(data=df,
            x_vars=['carat', 'depth', 'table'],
            y_vars='price')

<seaborn.axisgrid.PairGrid at 0x1a9a81eb620>

In [19]:
#Num vs Num 2

sns.pairplot(data=df,
            x_vars=['x', 'y', 'z'],
            y_vars='price')

<seaborn.axisgrid.PairGrid at 0x1a9acba2990>

## Separate the inputs and outputs

In [21]:


X = df.drop(columns=['price'])
y = df['price']

## Split the data into train and test

In [22]:
## Step 4 - split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(40455, 9) (13485, 9)
(40455,) (13485,)


## Apply Data Preparation on training data

In [23]:
#1. seperate numerical and categorical fatures

X_train_cat = X_train.select_dtypes(include=['object'])
X_train_num = X_train.select_dtypes(include=['number'])

In [24]:
X_train_cat.head()

Unnamed: 0,cut,color,clarity
441,Premium,H,SI2
50332,Very Good,D,SI1
35652,Ideal,G,VVS2
9439,Very Good,H,VS1
15824,Good,F,VS2


In [25]:
X_train_num.head()

Unnamed: 0,carat,depth,table,x,y,z
441,0.89,60.2,59.0,6.26,6.23,3.76
50332,0.7,64.0,53.0,5.57,5.61,3.58
35652,0.31,62.7,57.0,4.33,4.31,2.71
9439,0.9,62.3,59.0,6.12,6.17,3.83
15824,1.01,60.6,62.0,6.52,6.49,3.94


In [26]:
#2. Using standardization for rescaling

from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

#after rescaling, column names are lost since the dataframe is converted to a numpy ndarray
X_train_num_transformed = pd.DataFrame(std_scaler.fit_transform(X_train_num),
                                      columns=std_scaler.get_feature_names_out(),
                                      index=X_train_num.index)
X_train_num_transformed.describe().round(3)


Unnamed: 0,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,-0.0,0.0,-0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.26,-13.095,-6.484,-5.104,-4.978,-4.985
25%,-0.838,-0.457,-0.652,-0.909,-0.88,-0.885
50%,-0.206,0.101,-0.203,-0.036,-0.02,-0.026
75%,0.511,0.52,0.694,0.722,0.7,0.707
max,8.882,12.041,9.667,4.463,46.156,39.819


In [27]:
#defining the ordering for categorical columns - lowest to highest

cut_cat = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_cat = ['J', 'I', 'G', 'F', 'E', 'D']
clarity_cat = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

# creating the ordinal encoder
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(categories = [cut_cat, color_cat, clarity_cat],
                                handle_unknown='use_encoded_value',
                                unknown_value=-1,
                                encoded_missing_value=-5)

#applying the encoder
X_train_cat_transformed = pd.DataFrame(ordinal_encoder.fit_transform(X_train_cat),
                                columns=ordinal_encoder.get_feature_names_out(),
                                index = X_train_cat.index)
X_train_cat_transformed.head()

Unnamed: 0,cut,color,clarity
441,3.0,-1.0,1.0
50332,2.0,5.0,2.0
35652,4.0,2.0,5.0
9439,2.0,-1.0,4.0
15824,1.0,3.0,3.0


In [28]:
#concatenate X_train_num_transformed and X_train_cat_transformed

X_train_transformed = pd.concat([X_train_num_transformed, X_train_cat_transformed], axis=1)

X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
441,0.19492,-1.085697,0.694368,0.472201,0.430971,0.312528,3.0,-1.0,1.0
50332,-0.205679,1.567623,-1.997562,-0.142482,-0.107276,0.058918,2.0,5.0,2.0
35652,-1.027961,0.659908,-0.202942,-1.247129,-1.23586,-1.166861,4.0,2.0,5.0
9439,0.216004,0.380611,0.694368,0.347483,0.378883,0.411154,2.0,-1.0,4.0
15824,0.44793,-0.8064,2.040332,0.703821,0.656688,0.566137,1.0,3.0,3.0


## Apply data preparation on test data

In [29]:
#seperate cat and num features

X_test_cat = X_test.select_dtypes(include=['object'])
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

#applying transformation on numerical columns
X_test_num_transformed = pd.DataFrame(std_scaler.transform(X_test_num),
                                     columns=std_scaler.get_feature_names_out(),
                                     index=X_test_num.index)

#applying transformation on categorical columns
X_test_cat_transformed = pd.DataFrame(ordinal_encoder.transform(X_test_cat),
                                     columns=ordinal_encoder.get_feature_names_out(),
                                     index=X_test_cat.index)

#concatenating
X_test_transformed = pd.concat([X_test_num_transformed, X_test_cat_transformed], axis=1)
X_test_transformed.head()


Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
10176,0.637687,0.171139,-1.100252,0.783997,0.795591,0.805657,4.0,-1.0,1.0
16083,1.038286,0.590084,-0.651597,1.095792,1.03867,1.143803,4.0,-1.0,2.0
13420,0.848529,-0.457279,0.245713,1.024525,0.925812,0.904283,3.0,1.0,2.0
20407,1.481053,-0.596928,-0.651597,1.514489,1.411971,1.355144,4.0,3.0,2.0
8909,0.216004,-0.038334,-0.202942,0.392025,0.413609,0.397064,2.0,3.0,3.0


## Apply Linear Regression Model

In [30]:
%%time

from sklearn.linear_model import LinearRegression
from sklearn import metrics

LR_regressor = LinearRegression()
LR_regressor.fit(X_train_transformed, y_train)

y_test_pred = LR_regressor.predict(X_test_transformed)

LR_MAE = metrics.mean_absolute_error(y_test, y_test_pred)

print("Model's error: ", LR_MAE)

Model's error:  828.3725514326799
CPU times: total: 0 ns
Wall time: 28.5 ms


In [31]:
output_df = pd.DataFrame({'Actual': y_test})

output_df['LR predictions'] = y_test_pred
output_df.head()

Unnamed: 0,Actual,LR predictions
10176,4733,4834.76046
16083,6424,6956.116354
13420,5510,6484.393756
20407,8770,9847.956335
8909,4493,4711.351694


In [32]:
fig, ax = plt.subplots(figsize=(8,4))
sns.histplot(output_df['Actual'], color='blue', alpha=0.5, label='actual')
sns.histplot(output_df['LR predictions'], color='red', alpha=0.5, label='prediction')

plt.legend()

<matplotlib.legend.Legend at 0x1a9a81ea120>

## Apply KNN Regression Model

In [33]:
%%time

from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

KNN_regressor = KNeighborsRegressor()
KNN_regressor.fit(X_train_transformed, y_train)

y_test_pred = KNN_regressor.predict(X_test_transformed)

KNN_MAE = metrics.mean_absolute_error(y_test, y_test_pred)
print("Model's error: ", KNN_MAE)

Model's error:  385.01570634037824
CPU times: total: 1.33 s
Wall time: 2.53 s


In [34]:
output_df['KNN predictions'] = y_test_pred
output_df.head()

Unnamed: 0,Actual,LR predictions,KNN predictions
10176,4733,4834.76046,4664.4
16083,6424,6956.116354,6278.2
13420,5510,6484.393756,5180.8
20407,8770,9847.956335,12080.0
8909,4493,4711.351694,6015.2


In [35]:
fig, ax = plt.subplots(figsize=(8,3))

sns.histplot(output_df['Actual'], color='blue', alpha=0.5, label='actual')
sns.histplot(output_df['KNN predictions'], color='red', alpha=0.5, label='predictions')

plt.legend()

<matplotlib.legend.Legend at 0x1a9b6b54e10>

## Using Decision Tree Regression Model

In [36]:
%%time

from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

DT_regressor = DecisionTreeRegressor()
DT_regressor.fit(X_train_transformed, y_train)

y_test_pred = DT_regressor.predict(X_test_transformed)

DT_MAE = metrics.mean_absolute_error(y_test, y_test_pred)
print("Model's error:", DT_MAE)

Model's error: 354.4599555061179
CPU times: total: 422 ms
Wall time: 694 ms


In [37]:
output_df['DT predictions'] = y_test_pred
output_df.head()

Unnamed: 0,Actual,LR predictions,KNN predictions,DT predictions
10176,4733,4834.76046,4664.4,4620.0
16083,6424,6956.116354,6278.2,7109.0
13420,5510,6484.393756,5180.8,5098.0
20407,8770,9847.956335,12080.0,12477.0
8909,4493,4711.351694,6015.2,4253.0


In [38]:
fig, ax = plt.subplots(figsize=(8,3))

sns.histplot(output_df['Actual'], color='blue', alpha=0.5, label='actual')
sns.histplot(output_df['DT predictions'], color='red', alpha=0.5, label='predictions')
plt.legend()

<matplotlib.legend.Legend at 0x1a9b6c73d90>

## Apply Ensemble (Random Forest Regression Model) 


In [39]:
%%time

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

RF_regressor = RandomForestRegressor()
RF_regressor.fit(X_train_transformed, y_train)

y_test_pred = RF_regressor.predict(X_test_transformed)

RF_MAE = metrics.mean_absolute_error(y_test, y_test_pred)
print("Model's error: ", RF_MAE)

Model's error:  270.64114205848944
CPU times: total: 25.2 s
Wall time: 35 s


In [40]:
output_df['RF predictions'] = y_test_pred
output_df.head()

Unnamed: 0,Actual,LR predictions,KNN predictions,DT predictions,RF predictions
10176,4733,4834.76046,4664.4,4620.0,4645.39
16083,6424,6956.116354,6278.2,7109.0,7043.45
13420,5510,6484.393756,5180.8,5098.0,5157.53
20407,8770,9847.956335,12080.0,12477.0,11132.08
8909,4493,4711.351694,6015.2,4253.0,4714.26


In [41]:
fig, ax = plt.subplots(figsize=(8,3))

sns.histplot(output_df['Actual'], color='blue', alpha=0.5, label='actual')
sns.histplot(output_df['RF predictions'], color='red', alpha=0.5, label='predictions')

plt.legend()

<matplotlib.legend.Legend at 0x1a9b6cefed0>

In [42]:
fig, ax = plt.subplots(figsize=(8,3))

sns.histplot(output_df['Actual'], color='blue', alpha=0.5, label='actual')
sns.histplot(output_df['RF predictions'], color='red', alpha=0.5, label='predictions')

plt.legend()

<matplotlib.legend.Legend at 0x1a9b729df90>

## Comparing all the models

In [43]:
df_melted = pd.melt(output_df, var_name='Model', value_name='Prediction')

fig, ax = plt.subplots(figsize=(8,4))
sns.boxplot(x='Prediction', y='Model', data=df_melted)

plt.title('Comparison of actual vs predicted values')

plt.show()

  plt.show()


Each boxplot displays the distribution of predictions for each model relative to the actual values, showing key statistical features such as medians, interquartile ranges, and outliers. The actual values have a tighter distribution compared to the predictions, indicating that the models exhibit variability in their ability to approximate the actual data.

Among the models, Random Forest predictions appear to have the most consistency (narrower spread), while the others show greater variance and potential outliers, suggesting differences in predictive performance. This visualization helps highlight how closely each model aligns with the actual values and where they deviate.

In [44]:
#comparing the MAE values
# Comparing the MAE values
mae_data = {
    'Algorithm/Model': ['Linear regression', 'KNN', 'Decision Tree', 'Random Forest'],
    'Mean Absolute Value': [LR_MAE, KNN_MAE, DT_MAE, RF_MAE]
}

# Create the DataFrame
models = pd.DataFrame(data=mae_data)

# Display the DataFrame
models

Unnamed: 0,Algorithm/Model,Mean Absolute Value
0,Linear regression,828.372551
1,KNN,385.015706
2,Decision Tree,354.459956
3,Random Forest,270.641142


The Mean Absolute Error (MAE) scores indicate the average magnitude of error between the predicted diamond prices and the actual prices for each model. Linear Regression has the highest MAE (828.37), suggesting it struggles the most to accurately predict prices.

In contrast, Random Forest has the lowest MAE (270.68), making it the most accurate and reliable model for this task. KNN (385.02) and Decision Tree (357.92) also perform better than Linear Regression but are less accurate than Random Forest. Based on the MAE scores, Random Forest is the recommended model for predicting diamond prices, as it minimizes prediction errors more effectively than the other models.

To move on with the next part of this case study, we would save some relevant files like the scaler, encoder and model to be used to create our Stremlit app.

In [45]:
#importing required library
import joblib

# Save the trained model
joblib.dump(RF_regressor,  'Streamlit_files/random_forest_model.pkl')

# Save the scaler
joblib.dump(std_scaler, 'Streamlit_files/scaler.pkl')

# Save the encoder (if applicable, for categorical features)
joblib.dump(ordinal_encoder, 'Streamlit_files/encoder.pkl')

print("Model, scaler, and encoder saved successfully!")

FileNotFoundError: [Errno 2] No such file or directory: 'Streamlit_files/random_forest_model.pkl'

In [46]:
import os
import joblib

# Define the directory and file paths
output_dir = "Streamlit"
model_path = os.path.join(output_dir, "random_forest_model.pkl")
scaler_path = os.path.join(output_dir, "scaler.pkl")
encoder_path = os.path.join(output_dir, "encoder.pkl")

# Create directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save the model and scaler
joblib.dump(RF_regressor, model_path)
joblib.dump(std_scaler, scaler_path)
joblib.dump(ordinal_encoder, encoder_path)

print(f"Model saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}") 
print(f"Encoder saved to: {encoder_path}")

Model saved to: Streamlit\random_forest_model.pkl
Scaler saved to: Streamlit\scaler.pkl
Encoder saved to: Streamlit\encoder.pkl
