In [32]:
# Import Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [33]:
# Import data

df = pd.read_csv('../data/diamonds_train.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [34]:
# Check the different variables and types

df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [35]:
# Check the nan values

df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [36]:
# Scaling numeric columns

# Do not stratify the label
diamonds = df.drop("price", axis = 1)

# Set a new dataset label variable
diamond_labels = df["price"].copy()

# Drop all the category, so we could have only numeric
diamonds_num = diamonds.drop(["cut", "color", "clarity"], axis = 1)
diamonds_num.head()

Unnamed: 0,carat,depth,table,x,y,z
0,1.21,62.4,58.0,6.83,6.79,4.25
1,0.32,63.0,57.0,4.35,4.38,2.75
2,0.71,65.5,55.0,5.62,5.53,3.65
3,0.41,63.8,56.0,4.68,4.72,3.0
4,1.02,60.5,59.0,6.55,6.51,3.95


In [37]:
from sklearn.preprocessing import StandardScaler

# Perform the feature scaling on the numeric attributes of the dataset

num_scaler = StandardScaler()
diamonds_num_scaled = num_scaler.fit_transform(diamonds_num)

# Preview 
pd.DataFrame(diamonds_num_scaled).head()

Unnamed: 0,0,1,2,3,4,5
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274


In [38]:
# Processing 'object' data columns or transform string values into int

# We need only the category attributes to work with here
diamonds_cat = diamonds[["cut", "color", "clarity"]]
diamonds_cat.head()

Unnamed: 0,cut,color,clarity
0,Premium,J,VS2
1,Very Good,H,VS2
2,Fair,G,VS1
3,Good,D,SI1
4,Ideal,G,SI1


In [39]:
from sklearn.preprocessing import OneHotEncoder

# Perform the one-hot encoding on the category attributes of the dataset
cat_encoder = OneHotEncoder()
diamonds_cat_encoded = cat_encoder.fit_transform(diamonds_cat)

# Convert the encoded categories to arrays and Preview
pd.DataFrame(diamonds_cat_encoded.toarray()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [40]:
from sklearn.compose import ColumnTransformer

num_attribs = list(diamonds_num)
cat_attribs = ["cut", "color", "clarity"]

# Pipeline to transform our dataset
pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_attribs), # Perform feaured scaling on numeric attributes
    ("cat", OneHotEncoder(), cat_attribs) # Perform One-Hot encoding on the category attributes
])

In [41]:
# Transformed dataset to feed the ML Algorithm
diamonds_ready = pipeline.fit_transform(diamonds)

# Preview
pd.DataFrame(diamonds_ready).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Use test dataset to get Kaggle prices

df_test = pd.read_csv('../data/diamonds_test.csv')

In [43]:
df_test.drop(columns='id', inplace=True)

In [44]:
# Scaling numeric columns

# Drop all the category, so we could have only numeric
diamonds_num_test = df_test.drop(["cut", "color", "clarity"], axis = 1)

from sklearn.preprocessing import StandardScaler

# Perform the feature scaling on the numeric attributes of the dataset

num_scaler = StandardScaler()
diamonds_num_test_scaled = num_scaler.fit_transform(diamonds_num_test)

# Preview 
pd.DataFrame(diamonds_num_test_scaled).head()

diamonds_cat_test = diamonds[["cut", "color", "clarity"]]

from sklearn.preprocessing import OneHotEncoder

# Perform the one-hot encoding on the category attributes of the dataset
cat_encoder = OneHotEncoder()
diamonds_cat_test_encoded = cat_encoder.fit_transform(diamonds_cat_test)

# Convert the encoded categories to arrays and Preview
pd.DataFrame(diamonds_cat_test_encoded.toarray()).head()

from sklearn.compose import ColumnTransformer

num_attribs = list(diamonds_num_test)
cat_attribs = ["cut", "color", "clarity"]

# Pipeline to transform our dataset
pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_attribs), # Perform feaured scaling on numeric attributes
    ("cat", OneHotEncoder(), cat_attribs) # Perform One-Hot encoding on the category attributes
])

diamonds_test_ready = pipeline.fit_transform(df_test)

In [45]:
# Preview
pd.DataFrame(diamonds_test_ready).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,-0.018412,0.6695,1.121874,0.075022,0.133236,0.173091,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.855078,-0.514957,-0.219192,0.964007,1.019395,0.870787,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.643349,0.321131,1.568896,1.475847,1.400444,1.404319,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.215939,1.435914,-1.560258,0.317472,0.345914,0.487738,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.636246,0.808848,0.22783,-0.616411,-0.575691,-0.483564,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [46]:
# Assign the features and target columns
target_col = 'price'

X = diamonds_ready
y = df[target_col]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((32364, 26), (8091, 26), (32364,), (8091,))

In [50]:
# Linear Regression model

model = LinearRegression(fit_intercept=True)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error

mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)

1143.527025733809

In [54]:
# Gradient Booster Regressor model

from sklearn import ensemble

model = ensemble.GradientBoostingRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)

755.152662878769

In [None]:
# Gradient Booster Regressor model

from sklearn import ensemble

model = ensemble.GradientBoostingRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)

In [55]:
# Random Forest Regressor model

model = ensemble.RandomForestRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)

544.8088667639856

In [56]:
submission = model.predict(diamonds_test_ready)
submission_random_forest_regression = pd.DataFrame(submission)

In [57]:
submission_random_forest_regression.reset_index(inplace=True)

In [58]:
submission_random_forest_regression = submission_random_forest_regression.rename(columns={'index': 'id', 0: 'price'})

In [59]:
submission_random_forest_regression.to_csv('../data/submission_random_forest_regression.csv', index=False)