In [2]:
import pandas as pd
df = pd.DataFrame(pd.read_csv("./data/gemstone.csv"))
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
df.drop(columns=['id'],inplace=True)

In [6]:
x=df.drop(columns=['price'],axis=1)
y=df['price']
print(x)
print(y)

        carat        cut color clarity  depth  table     x     y     z
0        1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
1        2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
2        0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
3        0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
4        1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
...       ...        ...   ...     ...    ...    ...   ...   ...   ...
193568   0.31      Ideal     D    VVS2   61.1   56.0  4.35  4.39  2.67
193569   0.70    Premium     G    VVS2   60.3   58.0  5.75  5.77  3.47
193570   0.73  Very Good     F     SI1   63.1   57.0  5.72  5.75  3.62
193571   0.34  Very Good     D     SI1   62.9   55.0  4.45  4.49  2.81
193572   0.71       Good     E     SI2   60.8   64.0  5.73  5.71  3.48

[193573 rows x 9 columns]
0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569   

Define ordinal which columns should be ordinal encoded and which should be scaled

In [7]:
numerical_columns=x.select_dtypes(exclude='object').columns
categorical_columns=x.select_dtypes(include='object').columns

In [8]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [9]:
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [10]:
df['cut'].value_counts().sort_values()

cut
Fair          2021
Good         11622
Very Good    37566
Premium      49910
Ideal        92454
Name: count, dtype: int64

In [12]:
df['color'].value_counts().sort_values()

color
J     6456
I    17514
D    24286
H    30799
F    34258
E    35869
G    44391
Name: count, dtype: int64

In [13]:
df['clarity'].value_counts().sort_values()

clarity
I1        512
IF       4219
VVS1    10628
VVS2    15762
SI2     30484
VS1     30669
VS2     48027
SI1     53272
Name: count, dtype: int64

In [14]:
cut_categories=["Fair","Good","Very Good","Premium","Ideal"]
color_categories=["D","E","F","G","H","I","J"]
clarity_categories=["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]

How to work with the missing values?

We need simple imputer to fill the missing values and standard scaler to do feature scaling of numerical values

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

Creating Numerical and Categorical pipelines and Combine both the pipelines

In [16]:
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)
cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("Ordinalencoder",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ("scaler",StandardScaler())
    ]
)
preprocessor=ColumnTransformer(
    [
        ("numericalpipeline",num_pipeline,numerical_columns),
        ("categoricalpipeline",cat_pipeline,categorical_columns)
    ]
)

Train Test Split

In [17]:
#Train Test Split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=45)
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns=preprocessor.get_feature_names_out()) 
x_test = pd.DataFrame(preprocessor.transform(x_test), columns = preprocessor.get_feature_names_out())

In [18]:
x_train.head()

Unnamed: 0,numericalpipeline__carat,numericalpipeline__depth,numericalpipeline__table,numericalpipeline__x,numericalpipeline__y,numericalpipeline__z,categoricalpipeline__cut,categoricalpipeline__color,categoricalpipeline__clarity
0,-0.19541,0.72199,-0.640704,-0.049355,-0.081201,0.008907,0.873256,-0.317345,-1.315751
1,-0.173766,-0.203669,0.401246,-0.004238,0.018691,-0.005603,-0.133146,2.145303,0.017304
2,1.579392,0.72199,0.922221,1.457564,1.444428,1.518017,-0.133146,0.913979,-1.315751
3,3.267619,-0.481366,-1.161679,2.567452,2.516001,2.446701,0.873256,2.145303,0.017304
4,-0.455137,-0.111103,-0.640704,-0.34713,-0.308229,-0.324838,0.873256,1.529641,2.683414


In [19]:
x_test.head()

Unnamed: 0,numericalpipeline__carat,numericalpipeline__depth,numericalpipeline__table,numericalpipeline__x,numericalpipeline__y,numericalpipeline__z,categoricalpipeline__cut,categoricalpipeline__color,categoricalpipeline__clarity
0,0.475552,-0.759064,-1.161679,0.708616,0.7361,0.632866,0.873256,-0.317345,0.017304
1,-1.191031,-0.666498,0.922221,-1.565299,-1.552344,-1.587267,-1.139547,-1.548669,1.350359
2,-0.996236,-0.018537,-1.161679,-1.168266,-1.143693,-1.151947,-1.139547,0.913979,1.350359
3,-0.498425,-0.759064,1.443196,-0.356154,-0.389959,-0.426413,-0.133146,-0.317345,0.017304
4,0.237469,-3.443474,0.922221,0.474006,0.445504,0.139503,-0.133146,-1.548669,-0.649224


In [20]:
preprocessor.get_feature_names_out()

array(['numericalpipeline__carat', 'numericalpipeline__depth',
       'numericalpipeline__table', 'numericalpipeline__x',
       'numericalpipeline__y', 'numericalpipeline__z',
       'categoricalpipeline__cut', 'categoricalpipeline__color',
       'categoricalpipeline__clarity'], dtype=object)

In [16]:
x_train.head()

Unnamed: 0,numericalpipeline__carat,numericalpipeline__depth,numericalpipeline__table,numericalpipeline__x,numericalpipeline__y,numericalpipeline__z,categoricalpipeline__cut,categoricalpipeline__color,categoricalpipeline__clarity
0,-0.19541,0.72199,-0.640704,-0.049355,-0.081201,0.008907,0.873256,-0.317345,-1.315751
1,-0.173766,-0.203669,0.401246,-0.004238,0.018691,-0.005603,-0.133146,2.145303,0.017304
2,1.579392,0.72199,0.922221,1.457564,1.444428,1.518017,-0.133146,0.913979,-1.315751
3,3.267619,-0.481366,-1.161679,2.567452,2.516001,2.446701,0.873256,2.145303,0.017304
4,-0.455137,-0.111103,-0.640704,-0.34713,-0.308229,-0.324838,0.873256,1.529641,2.683414


In [17]:
x_test.head()

Unnamed: 0,numericalpipeline__carat,numericalpipeline__depth,numericalpipeline__table,numericalpipeline__x,numericalpipeline__y,numericalpipeline__z,categoricalpipeline__cut,categoricalpipeline__color,categoricalpipeline__clarity
0,0.475552,-0.759064,-1.161679,0.708616,0.7361,0.632866,0.873256,-0.317345,0.017304
1,-1.191031,-0.666498,0.922221,-1.565299,-1.552344,-1.587267,-1.139547,-1.548669,1.350359
2,-0.996236,-0.018537,-1.161679,-1.168266,-1.143693,-1.151947,-1.139547,0.913979,1.350359
3,-0.498425,-0.759064,1.443196,-0.356154,-0.389959,-0.426413,-0.133146,-0.317345,0.017304
4,0.237469,-3.443474,0.922221,0.474006,0.445504,0.139503,-0.133146,-1.548669,-0.649224


Apply the models

In [23]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error,root_mean_squared_error, mean_squared_error

In [24]:
regression = LinearRegression()
regression.fit(x_train, y_train)

In [25]:
regression.coef_

array([ 6410.30028966,  -133.18235413,   -69.85943892, -1758.49512798,
        -429.51055284,   -75.79135359,    70.97707844,  -465.44129638,
         650.80643035])

In [26]:
regression.intercept_


3964.713119666067

In [27]:
import numpy as np 
def model_evaluation(true, predicted): 
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    return mae, mse, r2_square, rmse 


Training multiple models

In [30]:

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    #Make Predictions
    y_pred=model.predict(x_test)

    mae, mse, r2_square, rmse=model_evaluation(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('*'*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1015.9917605432947
MAE: 679.7054095585776
R2 score 93.71745654465842
***********************************


Lasso
Model Training Performance
RMSE: 1016.1357781042951
MAE: 680.8409748615866
R2 score 93.71567530833693
***********************************


Ridge
Model Training Performance
RMSE: 1015.9949164269063
MAE: 679.7333205816706
R2 score 93.7174175148011
***********************************


Elasticnet
Model Training Performance
RMSE: 1540.009206276005
MAE: 1065.2324249765468
R2 score 85.565499379458
***********************************




In [31]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet']