# Diamond's Cut Estimator

This Machine-Learning algorithm predicts a diamond's cut from the sample characteristics

In [46]:
import pandas as pd
import numpy as np

np.random.seed(47)

In [47]:
# Import the dataset and set into a DF
df = pd.read_csv('diamonds.csv', index_col=0)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [48]:
# check if are there any NaN
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [49]:
# move `cut` feature to the right
df = df[['carat', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z', 'cut']]
df

Unnamed: 0,carat,color,clarity,depth,table,price,x,y,z,cut
1,0.23,E,SI2,61.5,55.0,326,3.95,3.98,2.43,Ideal
2,0.21,E,SI1,59.8,61.0,326,3.89,3.84,2.31,Premium
3,0.23,E,VS1,56.9,65.0,327,4.05,4.07,2.31,Good
4,0.29,I,VS2,62.4,58.0,334,4.20,4.23,2.63,Premium
5,0.31,J,SI2,63.3,58.0,335,4.34,4.35,2.75,Good
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,Ideal
53937,0.72,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,Good
53938,0.70,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,Very Good
53939,0.86,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,Premium


In [50]:
df.replace(['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], range(0, 5), inplace=True)
df

Unnamed: 0,carat,color,clarity,depth,table,price,x,y,z,cut
1,0.23,E,SI2,61.5,55.0,326,3.95,3.98,2.43,4
2,0.21,E,SI1,59.8,61.0,326,3.89,3.84,2.31,3
3,0.23,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1
4,0.29,I,VS2,62.4,58.0,334,4.20,4.23,2.63,3
5,0.31,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,4
53937,0.72,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,1
53938,0.70,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,2
53939,0.86,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,3


In [51]:
df.dtypes

carat      float64
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
cut          int64
dtype: object

In [54]:
# Split into X and y
X = df.drop('cut', axis=1)
y = df['cut']

# Split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((43152, 9), (10788, 9), (43152,), (10788,))

In [80]:
# Turn the characteristics into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features_X = ['color', 'clarity']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features_X)], remainder="passthrough")

# fill the train and test values separately
transformed_X_train = transformer.fit_transform(X_train)
transformed_X_test = transformer.transform(X_test)

In [86]:
# Fit a model
from sklearn.ensemble import RandomForestClassifier

# Setup model
model = RandomForestClassifier()

model.fit(transformed_X_train, y_train)
model_score = model.score(transformed_X_test, y_test)

In [87]:
print(f'The model has a {"{:.2f}".format(model_score * 100)} % of accuracy')

The model has a 77.50 % of accuracy


In [85]:
# Tests with other models
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler

RandomForestCls_model = RandomForestClassifier()
svm_model = svm.SVC()
SGDClassifier_model = SGDClassifier()

estimators = [RandomForestCls_model, svm_model, SGDClassifier_model] 

for element in estimators:
    model.fit(transformed_X_train, y_train)
    model_score = model.score(transformed_X_test, y_test)
    print(f'This model has a {"{:.2f}".format(model_score * 100)} % of accuracy')

This model has a 77.35 % of accuracy
This model has a 77.69 % of accuracy
This model has a 77.54 % of accuracy
