## Model Training

In [37]:
import pandas as pd

In [38]:
df = pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [39]:
df = df.drop('id', axis =1)

In [40]:
X = df.drop('price', axis = 1)
y = df[['price']]

In [41]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [42]:
# segregating the numerical and categorical column
categorical_column = X.select_dtypes(include=['object']).columns
numerical_column = X.select_dtypes(exclude=['object']).columns

In [43]:
categorical_column

Index(['cut', 'color', 'clarity'], dtype='object')

In [44]:
numerical_column

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [45]:
## define the rank of the each  ordinal value
print(df.cut.unique())
print(df.color.unique())
print(df.clarity.unique())

['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
['F' 'J' 'G' 'E' 'D' 'H' 'I']
['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']


In [46]:
cut_category = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_category = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_category = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [47]:
from sklearn.impute import SimpleImputer ## Handling the missing value
from sklearn.preprocessing import OrdinalEncoder ## Encoding the ordinal value
from sklearn.preprocessing import StandardScaler ## Encoding the nominal value

## pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [48]:
## Numerical Pipeline
num_pipeline =  Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)
## Categorical Pipeline
cat_pipeline=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=[cut_category, color_category, clarity_category]))
    ]
)

preprocessor= ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_column),
    ('cat_pipeline', cat_pipeline, categorical_column)
])

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.30, random_state=30)

In [50]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
168192,0.34,Ideal,I,VVS2,60.9,57.0,4.56,4.53,2.76
35202,0.9,Good,E,SI1,63.8,57.0,6.07,6.03,3.87
41091,1.02,Premium,G,VS1,62.7,58.0,6.35,6.39,4.0
31239,0.32,Premium,G,VS2,62.1,59.0,4.37,4.35,2.71
45722,0.35,Ideal,J,VVS2,61.1,56.0,4.53,4.57,2.78


In [51]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
70432,0.53,Premium,E,VS2,60.8,56.0,5.24,5.21,3.19
64839,0.71,Very Good,H,SI1,62.9,57.0,5.67,5.69,3.56
185316,0.3,Ideal,H,IF,62.1,57.0,4.27,4.29,2.66
84658,1.24,Premium,G,VS2,61.6,61.0,6.88,6.82,4.21
31953,0.36,Premium,E,VS1,60.4,58.0,4.6,4.63,2.8


In [52]:
y_train

Unnamed: 0,price
168192,765
35202,4763
41091,6139
31239,720
45722,774
...,...
66455,544
46220,5694
98804,4563
48045,6998


In [53]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [54]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,4.0,1.0,5.0
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,1.0,5.0,2.0
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,3.0,3.0,4.0
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,3.0,3.0,3.0
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,4.0,0.0,5.0


In [55]:
## Model Trainig

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [56]:
reg = LinearRegression()
reg.fit(X_train, y_train)

In [57]:
y_pred=reg.predict(X_test)

In [58]:
print(mean_absolute_error(y_pred, y_test))
print(mean_squared_error(y_pred, y_test))
print(r2_score(y_pred, y_test))

674.0255115796862
1028002.7598132553
0.9325284792033968


In [59]:
reg.coef_

array([[ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
         -499.29302619,   -63.39317848,    72.89534931,   283.5924305 ,
          433.73822644]])

In [60]:
reg.intercept_

array([1466.68510901])

In [61]:
def evaluate_model(true, pred):
    mae= mean_absolute_error(true, pred)
    mse= mean_squared_error(true, pred)
    r2= r2_score(true, pred)
    return mae, mse, r2

In [71]:
model = {
    'LinearRegression': LinearRegression(),
    'Laso':Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for key in model.keys():
    model_instance = model[key]
    model_instance.fit(X_train, y_train)

    #Make prediciton
    y_pred = model_instance.predict(X_test)

    mae, mse, r2 = evaluate_model(y_test, y_pred)
    print(key)
    model_list.append(key)

    print("Model Training Performance")
    print('MSE', mse)
    print('MAE', mae)
    print('R2 score', r2)
    print('='*35)
    print('\n')




LinearRegression
Model Training Performance
MSE 1028002.7598132553
MAE 674.0255115796862
R2 score 0.9368908248567511


Laso
Model Training Performance
MSE 1027937.0713363611
MAE 675.2036880701619
R2 score 0.9368948574778251


Ridge
Model Training Performance
MSE 1028005.1736007958
MAE 674.0565132295918
R2 score 0.9368906766741327


ElasticNet
Model Training Performance
MSE 2291935.7054395494
MAE 1050.7468664314322
R2 score 0.8592978759337908




In [72]:
model_list

['LinearRegression', 'Laso', 'Ridge', 'ElasticNet']