In [1]:
import pandas as pd
import numpy as np

# Model Training

In [2]:
df = pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [3]:

df.drop(labels=['Unnamed: 0'], axis=1, inplace=True)

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
df.dropna(inplace=True)

In [6]:
# Independent and dependent features
x = df.drop(labels=['price'], axis=1)
y = df['price']

In [7]:

#segregating categorical and numerical features
categorical_columns = x.select_dtypes(include=['object']).columns
numerical_columns = x.select_dtypes(exclude=['object']).columns

In [8]:
# define a custom rank for each ordinal variable
cut_categories  = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [9]:
from sklearn.impute import SimpleImputer # Imputation transformer for completing missing values.
from sklearn.preprocessing import OrdinalEncoder # Encode categorical features as an integer array.
from sklearn.preprocessing import StandardScaler # handle feature scaling

#pipelines
from sklearn.pipeline import Pipeline # Pipeline of transforms with a final estimator.
from sklearn.compose import ColumnTransformer # Applies transformers to columns of an array or pandas DataFrame.


In [10]:
# Numerical pipeline
numerical_pipeline = Pipeline(steps = 
    [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
    ]
)

# Categorical pipeline
categorical_pipeline = Pipeline(steps = 
    [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
    ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', numerical_pipeline, numerical_columns),
    ('cat_pipeline', categorical_pipeline, categorical_columns)
])

In [11]:
# train test split

from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [12]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns = preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.transform(x_test), columns = preprocessor.get_feature_names_out())   

In [13]:
x_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.04175,-0.811959,0.691649,-1.212037,-1.223607,-1.30659,0.082334,-1.54051,-0.034968
1,-1.062732,0.669231,-1.094862,-1.336339,-1.291121,-1.278011,0.975985,-0.950576,1.789417
2,0.217192,-0.106631,-0.201606,0.3595,0.37141,0.365278,-0.811318,1.40916,-0.034968
3,0.427015,-1.940485,1.584905,0.616984,0.616148,0.379568,-1.70497,0.819226,-1.251225
4,0.385051,0.387099,-0.648234,0.634741,0.523316,0.636778,0.082334,-0.360642,-1.251225


In [14]:
# MODEL TRAINING

from sklearn.linear_model import LinearRegression, Lasso, Ridge,ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [15]:
regression = LinearRegression()
regression.fit(x_train, y_train)

In [16]:
regression.coef_

array([ 5307.71242549,  -118.95585682,   -71.48241431, -1048.30953115,
          16.64460585,   -94.6869617 ,   134.59484324,  -554.53418276,
         815.32289297])

In [17]:
regression.intercept_

3930.618132316906

In [18]:
# Evaluate model performance
def evaluate_model(true_value, predicted):
    mae = mean_absolute_error(true_value, predicted)
    mse = mean_squared_error(true_value, predicted)
    rmse = np.sqrt(mse)

    r2 = r2_score(true_value, predicted)
    return mae, mse, rmse, r2

In [19]:
# Model Evaluation

models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),    
    'ElasticNet':ElasticNet()
}
training_model_list = []
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(x_train, y_train)

    #make predictions
    y_pred = model.predict(x_test)

    mae, mse, rmse, r2 = evaluate_model(y_test, y_pred)

    print(model_name)
    print('-'*40)
    
    model_list.append(model_name)

    print('Model training performance')
    print('MAE: ', mae)
    print('MSE: ', mse)
    print('RMSE: ', rmse)
    print('R2 Score: ', r2*100)

    r2_list.append(r2*100)

    print('-'*40)   
    print('\n')

LinearRegression
----------------------------------------
Model training performance
MAE:  802.3192508863639
MSE:  1503897.8097831635
RMSE:  1226.3351131657137
R2 Score:  90.53490924798588
----------------------------------------


Lasso
----------------------------------------
Model training performance
MAE:  803.2748452028947
MSE:  1502834.85838915
RMSE:  1225.901651189503
R2 Score:  90.54159915161088
----------------------------------------


Ridge
----------------------------------------
Model training performance
MAE:  802.5231469350549
MSE:  1503659.7100233433
RMSE:  1226.2380315515186
R2 Score:  90.53640777788603
----------------------------------------


ElasticNet
----------------------------------------
Model training performance
MAE:  1063.0899988553433
MSE:  2491113.5964155677
RMSE:  1578.326200889907
R2 Score:  84.32166327375045
----------------------------------------


