In [24]:
import pandas as pd
from path import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [25]:
diamond_df = pd.read_csv(Path("Diamonds_price_data.csv"))
diamond_df.head

<bound method NDFrame.head of        Unnamed: 0  carat        cut color clarity  depth  table  price     x  \
0               1   0.23      Ideal     E     SI2   61.5   55.0    326  3.95   
1               2   0.21    Premium     E     SI1   59.8   61.0    326  3.89   
2               3   0.23       Good     E     VS1   56.9   65.0    327  4.05   
3               4   0.29    Premium     I     VS2   62.4   58.0    334  4.20   
4               5   0.31       Good     J     SI2   63.3   58.0    335  4.34   
...           ...    ...        ...   ...     ...    ...    ...    ...   ...   
53938       53939   0.86    Premium     H     SI2   61.0   58.0   2757  6.15   
53939       53940   0.75      Ideal     D     SI2   62.2   55.0   2757  5.83   
53940       53941   0.71    Premium     E     SI1   60.5   55.0   2756  5.79   
53941       53942   0.71    Premium     F     SI1   59.8   62.0   2756  5.74   
53942       53943   0.70  Very Good     E     VS2   60.5   59.0   2757  5.71   

         

In [26]:
# used label encoding for color, cut, and clarity features.
diamond_df['color'] = diamond_df['color'].replace(['J','I','H','G','F','E','D'],[1,2,3,4,5,6,7])
diamond_df['cut'] = diamond_df['cut'].replace(['Fair','Good','Very Good','Premium','Ideal'],[1,2,3,4,5])
diamond_df['clarity'] = diamond_df['clarity'].replace(['I3','I2','I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF'],[1,2,3,4,5,6,7,8,9,10])

# dropping unnecessary column.
diamond_df = diamond_df.drop(columns=["Unnamed: 0"])

# scaling diamond_df.
diamond_data_scaled = StandardScaler().fit_transform(diamond_df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']])

# turning scaled data into DataFrame.
scaled_diamond_df = pd.DataFrame(diamond_data_scaled, columns=['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z'])

features = scaled_diamond_df.drop(columns=["price"])

features.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,-1.198189,0.981506,0.937131,-1.245216,-0.174033,-1.099673,-1.587882,-1.536239,-1.571166
1,-1.240384,0.085903,0.937131,-0.638084,-1.360676,1.585457,-1.641372,-1.658821,-1.741217
2,-1.198189,-1.705304,0.937131,0.576181,-3.384949,3.375544,-1.498733,-1.457436,-1.741217
3,-1.071605,0.085903,-1.414328,-0.030951,0.454189,0.242892,-1.36501,-1.317342,-1.287749
4,-1.029411,-1.705304,-2.002193,-1.245216,1.082412,0.242892,-1.240202,-1.212272,-1.117699


In [27]:
# creating numpy array of price data (labels, the value we want to predict).
labels = np.array(scaled_diamond_df['price'])

# converting to numpy array.
features = np.array(features)

In [13]:
# splitting data into training and testing sets.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

# instantiate model.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# fit the model to the data.
rf.fit(train_features, train_labels);

In [14]:
predictions = rf.predict(test_features)

# calculate the absolute errors
errors = abs(predictions - test_labels)

# print out the mean absolute error.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'dollars')

# calculate mean absolute percentage error.
mape = 100 * (errors / test_labels)

# calculate accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.07 dollars
Accuracy: 99.72 %.


In [15]:
# numerical feature importances.
importances = list(rf.feature_importances_)

# list of tuples w/ features matched with their improtance.
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# sort most important features first.
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# print out feature and corresponding importances.
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];


Variable: carat                Importance: 0.62
Variable: y                    Importance: 0.27
Variable: clarity              Importance: 0.06
Variable: color                Importance: 0.03
Variable: x                    Importance: 0.01
Variable: cut                  Importance: 0.0
Variable: depth                Importance: 0.0
Variable: table                Importance: 0.0
Variable: z                    Importance: 0.0
