In [39]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import matplotlib.pyplot as plt
import os

In [40]:
# Read the applicants_data.csv file from the Resources folder into a Pandas DataFrame
diamond_data_df = pd.read_csv("Diamonds_price_data.csv")

# Review the DataFrame
diamond_data_df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [41]:
# Review the data types associated with the columns
diamond_data_df.dtypes

Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object

In [42]:
diamond_data_df = diamond_data_df.drop(columns=["Unnamed: 0", "depth"])

# Review the DataFrame
diamond_data_df

Unnamed: 0,carat,cut,color,clarity,table,price,x,y,z
0,0.23,Ideal,E,SI2,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53938,0.86,Premium,H,SI2,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,55.0,2757,5.83,5.87,3.64
53940,0.71,Premium,E,SI1,55.0,2756,5.79,5.74,3.49
53941,0.71,Premium,F,SI1,62.0,2756,5.74,5.73,3.43


In [43]:
categorical_variables = list(diamond_data_df.dtypes[diamond_data_df.dtypes == "object"].index)

# Display the categorical variables list
categorical_variables

['cut', 'color', 'clarity']

In [44]:
data_features = diamond_data_df.drop(['price'], axis=1)
data_labels = diamond_data_df['price']

data_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53943 entries, 0 to 53942
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53943 non-null  float64
 1   cut      53943 non-null  object 
 2   color    53943 non-null  object 
 3   clarity  53943 non-null  object 
 4   table    53943 non-null  float64
 5   x        53943 non-null  float64
 6   y        53943 non-null  float64
 7   z        53943 non-null  float64
dtypes: float64(5), object(3)
memory usage: 3.3+ MB


In [45]:
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['J','I','H','G','F','E','D']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']
encod = OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])

data_features[['cut','color','clarity']] = encod.fit_transform(data_features[['cut','color','clarity']])

In [46]:
SEED = 123
#split data into train_validation_test set
x_train,x_vtest,y_train,y_vtest = train_test_split(data_features, data_labels, test_size=0.4, random_state=SEED)
x_val,x_test,y_val,y_test = train_test_split(x_vtest, y_vtest, test_size=0.5, random_state=SEED)

In [47]:
scaler = StandardScaler()
scaler.fit(x_train)
scaler.transform(x_train)
scaler.transform(x_val)
scaler.transform(x_test)

array([[ 0.46395377,  0.97846375, -0.23129574, ...,  0.69862261,
         0.7121326 ,  0.47249257],
       [ 0.48503852,  0.97846375,  0.93965478, ...,  0.63627638,
         0.64275323,  0.7167004 ],
       [ 0.67480128,  0.08429221, -1.40224626, ...,  0.91238112,
         0.83354648,  0.78852623],
       ...,
       [-0.61136853,  0.97846375,  0.93965478, ..., -0.46814258,
        -0.52802353, -0.53306906],
       [-0.84330079, -0.80987933, -0.23129574, ..., -0.87784639,
        -0.88359276, -0.79164205],
       [ 0.78022504,  0.97846375, -0.23129574, ...,  1.01035377,
         0.95496037,  0.87471722]])

In [48]:
#create FNN using Tensorflow Keras
model = tf.keras.Sequential()

model.add(tf.keras.Input(shape=(x_train.shape[-1],)))
model.add(tf.keras.layers.Dense(128,activation='elu', kernel_regularizer=tf.keras.regularizers.l2(l2=0.01)))
model.add(tf.keras.layers.Dense(64,activation='elu', kernel_regularizer=tf.keras.regularizers.l1(l1=0.05)))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(32,activation='elu', kernel_regularizer=tf.keras.regularizers.l1(l1=0.05)))
model.add(tf.keras.layers.Dense(16,activation='elu', kernel_regularizer=tf.keras.regularizers.l2(l2=0.01)))
model.add(tf.keras.layers.Dense(1))

In [49]:
#compile model
model.compile(optimizer='adam', loss='mse',metrics=['mse','mae'])

In [50]:
model.summary()
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, show_layer_activations=True)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 128)               1152      
                                                                 
 dense_11 (Dense)            (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 32)                2080      
                                                                 
 dense_13 (Dense)            (None, 16)                528       
                                                                 
 dense_14 (Dense)            (None, 1)                 17        
                                                                 
Total params: 12,033
Trainable params: 12,033
Non-trai

In [51]:
log_dir="logs/fit/"
tb_callback=tf.keras.callbacks.TensorBoard(log_dir=log_dir)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=3,verbose=1,mode='min')

history = model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=60, epochs=100, callbacks=[tb_callback, es_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 32: early stopping


In [52]:
test_result = model.evaluate(x_test,y_test,batch_size=40)
print(f"Test loss = {test_result[0]}")
print(f"Test mse = {test_result[1]}")
print(f"Test mae = {test_result[2]}")

Test loss = 418932.3125
Test mse = 418782.875
Test mae = 371.8443908691406
