In [1]:
import pandas as pd

In [2]:
# Load the raw gemstone dataset (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/gemstone.csv')

In [3]:
# Drop the 'id' column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: on a second execution 'id' is
# already gone from df, and the default errors='raise' would throw a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [4]:
# Preview the first two rows after dropping 'id'
df.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387


In [5]:
# Separate independent features (X) from the dependent target (Y)
X = df.drop(columns=['price'])
# Double brackets keep the target as a one-column DataFrame rather than a Series
Y = df[['price']]

In [6]:
# Split the feature columns by dtype: object columns get ordinal-encoded,
# every other column is treated as numeric and only scaled
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [7]:
# Rank order for each ordinal categorical column.
# A value's position in its list is the integer code it receives when the list
# is passed to OrdinalEncoder(categories=...).
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [8]:
#SimpleImputer = handling missing values with simple strategies(mean,median,mode)
#StandardScaler = for handling feature scaling (linear regression)
#OrdinalEncoder = for performing ordinal encoding

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

#for pipelines - it chains the steps in order (e.g. imputer -> encoder -> scaler)
from sklearn.pipeline import Pipeline

#after pipelines, we need to group the connections together
from sklearn.compose import ColumnTransformer 

In [9]:
## Numerical Pipeline: median-impute missing values, then standardize
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical Pipeline: mode-impute missing values, rank-encode, then standardize
# NOTE: OrdinalEncoder matches the `categories` lists to columns by position,
# so this list must follow the column order of categorical_cols (cut, color, clarity).
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])

# The encoded categories are scaled too because ordinal encoding produces
# integer ranks; with one-hot encoding this scaling step would not be needed.

In [10]:
# Combine the numerical and categorical pipelines into one preprocessing step;
# each pipeline is applied only to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [11]:
## Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=56,
)

In [12]:
# Fit the preprocessor on the training split only, apply it to both splits, and
# wrap the resulting arrays in DataFrames labelled with the generated feature names.
# NOTE: this cell overwrites x_train / x_test with their transformed versions, so
# re-running it requires re-running the train/test split cell first.
train_matrix = preprocessor.fit_transform(x_train)
test_matrix = preprocessor.transform(x_test)
feature_names = preprocessor.get_feature_names_out()

x_train = pd.DataFrame(train_matrix, columns=feature_names)
x_test = pd.DataFrame(test_matrix, columns=feature_names)

In [13]:
# Preview the first two preprocessed training rows
x_train.head(n=2)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,1.09952,0.25677,-0.118185,1.066987,1.132921,1.144427,0.872346,-0.319975,-1.316352
1,-0.845175,-0.205745,-0.638757,-0.834922,-0.890061,-0.866758,0.872346,0.911597,-0.648688


In [14]:
# Preview the first two preprocessed test rows
x_test.head(n=2)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,1.553283,1.551811,0.922958,1.373456,1.350641,1.508772,-1.137289,-1.551546,-0.648688
1,-0.996429,1.921822,-0.638757,-1.222514,-1.216641,-1.070791,-2.142106,-0.319975,-0.648688


In [15]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
# Baseline: ordinary least squares fitted on the preprocessed training split
regression = LinearRegression()
regression.fit(X=x_train, y=y_train)

In [17]:
# Fitted coefficients of the baseline model, one per column of the preprocessed x_train
regression.coef_

array([[ 6430.40550882,  -111.96533002,   -66.35775681, -1766.18980758,
         -248.34674026,  -260.64785382,    73.54510095,  -465.18062593,
          649.65632108]])

In [18]:
# Fitted intercept term of the baseline model
regression.intercept_

array([3972.43829935])

In [19]:
import numpy as np
def evaluate_model(true, predicted):
    """Score regression predictions against the observed target values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned row-for-row with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as the input to RMSE, so it is computed exactly once
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [20]:
## Train multiple models

# Candidate regressors, all with their default hyperparameters
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

model_list = []  # model names, in evaluation order
r2_list = []     # R2 score per model, aligned with model_list

# Iterate over name/estimator pairs directly instead of indexing into
# freshly rebuilt key and value lists on every pass
for model_name, model in models.items():
    model.fit(x_train, y_train)

    # make predictions
    y_pred = model.predict(x_test)

    # NOTE: these metrics are computed on the held-out test split,
    # even though the label printed below says "Training"
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model Training performance")
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score:", r2_square*100)

    r2_list.append(r2_square)

    print('='*40)
    print('\n')

LinearRegression
Model Training performance
RMSE: 1011.0462049658761
MAE: 673.5966414771332
R2 score: 93.69081572268352


Lasso
Model Training performance
RMSE: 1010.7604284725377
MAE: 674.6077948211764
R2 score: 93.69438185395352


Ridge
Model Training performance
RMSE: 1011.054560498365
MAE: 673.624473229305
R2 score: 93.69071144097656


ElasticNet
Model Training performance
RMSE: 1534.2674049057848
MAE: 1063.1434045621402
R2 score: 85.47108031643594




In [21]:
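#Lasso has the highest test R2 score (~93.694%), but only marginally ahead of LinearRegression and Ridge (both ~93.691%);
#ElasticNet is clearly worse (~85.47%), so Lasso is selected as the best of the four models compared here.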