In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("./data/Diamonds_Prices.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [5]:
data = data.drop_duplicates()

In [6]:
data.duplicated().sum()

0

In [7]:
X = data.drop(["price"], axis=1)
y = data[["price"]]

In [8]:
cat_cols = X.select_dtypes(include="object").columns

In [9]:
num_cols = X.select_dtypes(exclude="object").columns

In [10]:
# Encode the categorical features(Ordinal)
# Cut quality is increasing order Fair, Good, Very Good, Premium, Ideal.
cut_mem = ["Ideal", "Premium", "Very Good", "Good", "Fair"]
color_mem = ["D", "E", "F", "G", "H", "I", "J"]
clarity_mem = ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"]

In [11]:
from sklearn.impute import SimpleImputer  # for handling missing value
# scating and transforming feature
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
num_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

In [13]:
cat_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder", OrdinalEncoder(
            categories=[cut_mem, color_mem, clarity_mem]))
    ]
)

In [14]:
preprocessor = ColumnTransformer(
    [
        ("numerical_pipeline", num_pipeline, num_cols),
        ("categorical_pipeline", cat_pipeline, cat_cols)
    ]
)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=30)

In [17]:
preprocessor.fit_transform(X_train)

array([[-1.05158383, -0.03328363,  0.24392371, ...,  0.        ,
         1.        ,  5.        ],
       [ 0.4299916 ,  1.43599539,  0.69144599, ...,  2.        ,
         1.        ,  4.        ],
       [ 0.91679495,  0.03668204, -0.65112086, ...,  0.        ,
         2.        ,  2.        ],
       ...,
       [ 0.23950333,  0.94623572, -0.20359857, ...,  3.        ,
         6.        ,  7.        ],
       [ 0.47232232,  0.31654471,  1.58649056, ...,  3.        ,
         6.        ,  6.        ],
       [-1.03041847,  0.52644171, -0.20359857, ...,  1.        ,
         3.        ,  1.        ]])

In [19]:
X_train_trans = pd.DataFrame(preprocessor.fit_transform(
    X_train), columns=preprocessor.get_feature_names_out())
X_train_trans

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-1.051584,-0.033284,0.243924,-1.284193,-1.278314,-1.277823,0.0,1.0,5.0
1,0.429992,1.435995,0.691446,0.465214,0.525171,0.681416,2.0,1.0,4.0
2,0.916795,0.036682,-0.651121,1.036449,0.989434,1.012758,0.0,2.0,2.0
3,-1.030418,0.876270,-0.203599,-1.284193,-1.260458,-1.191386,2.0,1.0,5.0
4,0.218338,1.016201,-0.651121,0.375959,0.346608,0.494136,2.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...
37650,-0.818765,1.226098,-1.546165,-0.918245,-0.876548,-0.773607,3.0,6.0,0.0
37651,-1.030418,0.876270,-0.203599,-1.230639,-1.260458,-1.162574,1.0,3.0,6.0
37652,0.239503,0.946236,-0.203599,0.295629,0.346608,0.436511,3.0,6.0,7.0
37653,0.472322,0.316545,1.586491,0.608023,0.668021,0.681416,3.0,6.0,6.0


In [20]:
X_test_trans = pd.DataFrame(preprocessor.transform(
    X_test), columns=preprocessor.get_feature_names_out())
X_test_trans

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-1.030418,1.785824,-1.098643,-1.284193,-1.305098,-1.133761,3.0,5.0,5.0
1,-0.183804,0.946236,-0.203599,-0.070319,-0.108727,0.018732,2.0,4.0,5.0
2,-0.522450,-0.313146,-0.651121,-0.391638,-0.349787,-0.399047,0.0,5.0,5.0
3,1.636417,-1.082769,1.138968,1.571982,1.516195,1.372913,1.0,2.0,6.0
4,0.556984,-0.033284,-0.651121,0.759757,0.730518,0.739041,0.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...
16134,1.678748,-1.642494,1.586491,1.688014,1.632261,1.401725,1.0,0.0,6.0
16135,-0.204969,1.086167,-0.651121,-0.079244,-0.028374,0.076357,3.0,2.0,1.0
16136,0.726307,-0.662975,0.243924,0.947194,0.900153,0.825478,1.0,2.0,6.0
16137,-0.522450,-1.152734,0.243924,-0.347010,-0.385500,-0.485484,1.0,2.0,5.0


In [21]:
# Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [22]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [23]:
# Train multiple models

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet()
}

In [24]:
trained_model_list = []
model_list = []
r2_list = []

In [25]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    print(model)

LinearRegression()
Lasso()
Ridge()
ElasticNet()


In [27]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_trans, y_train)

    # Make Predictions
    y_pred = model.predict(X_test_trans)

    # this is a validation(test) score
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1228.3374233355164
MAE: 805.925377319051
R2 score 90.51115842202947


Lasso
Model Training Performance
RMSE: 1227.1639854315526
MAE: 806.9289430823325
R2 score 90.52927925381849


Ridge
Model Training Performance
RMSE: 1228.286328774895
MAE: 806.0227428663089
R2 score 90.51194781116382


Elasticnet
Model Training Performance
RMSE: 1612.2441929890426
MAE: 1067.07424115762
R2 score 83.65294696906334




In [28]:
trained_model_list

[]

In [29]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']

In [30]:
r2_list

[0.9051115842202947,
 0.9052927925381848,
 0.9051194781116383,
 0.8365294696906334]