In [1]:
import pandas as pd

In [2]:
# Data ingestion: read the gemstone dataset from the project-relative CSV file.
df = pd.read_csv("data/gemstone.csv")
# Keep the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70,984
2,3,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779
...,...,...,...,...,...,...,...,...,...,...,...
26962,26963,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09,5408
26963,26964,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74,1114
26964,26965,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17,1656
26965,26966,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60,682


In [3]:
# Remove the 'id' column before modelling (in the preview it is just a running row number).
df = df.drop(columns=['id'])

In [4]:
# Separate the features from the target.
# Y is selected with double brackets, so it stays a one-column DataFrame
# rather than a Series.
X = df.drop(columns=['price'])
Y = df[['price']]

In [5]:
# Display the target frame (a single 'price' column).
Y
 

Unnamed: 0,price
0,499
1,984
2,6289
3,1082
4,779
...,...
26962,5408
26963,1114
26964,1656
26965,682


In [6]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical (ordinal-encoded later); all remaining columns are treated as
# numerical (scaled later).
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

In [14]:
# Explicit category order for each ordinal variable. A value's position in its
# list is the integer code it receives once the list is handed to
# OrdinalEncoder(categories=...).
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [15]:
# Handling missing values
from sklearn.impute import SimpleImputer

#Handling feature Scaling
from sklearn.preprocessing import StandardScaler
# handling ordinal encoding
from sklearn.preprocessing import OrdinalEncoder


from sklearn.pipeline import Pipeline
# To combine the two separate pipelines into one preprocessor
from sklearn.compose import ColumnTransformer

In [16]:
## Numerical pipeline: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

## Categorical pipeline: fill missing values with the most frequent value,
## replace each category by its position in the custom order, then standardise.
## The `categories` list is positional: its entries must follow the same order
## as the columns routed to this pipeline (cut, color, clarity).
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'Ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories],
        ),
    ),
    ('scaler', StandardScaler()),
])

# Send each column group through its own pipeline; the ColumnTransformer
# concatenates the outputs (numerical block first, categorical block second).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [17]:
## Train/test split: hold out 30% of the rows, with a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [18]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split. Each result is re-wrapped as a DataFrame
# labelled with the transformer's output feature names.
# NOTE: X_train / X_test are overwritten with their transformed versions, so
# re-run the split cell before executing this cell a second time.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [19]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.561674,0.318936,-0.651005,0.704838,0.692859,0.723432,0.983583,0.226283,0.576148
1,-0.604087,-0.397691,1.589878,-0.533052,-0.486169,-0.533329,-0.812066,-0.945547,-0.639724
2,-0.520818,-0.254365,-1.099182,-0.409263,-0.376688,-0.410385,0.983583,0.812198,-1.247659
3,-0.999613,0.247274,-1.099182,-1.178523,-1.134635,-1.107067,0.983583,0.812198,0.576148
4,0.228599,0.963901,-0.651005,0.413049,0.330729,0.477544,-0.812066,0.812198,-0.031788


In [20]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [21]:
# Baseline model: ordinary least-squares regression on the preprocessed training split.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [22]:
# Fitted coefficients of the baseline model, one per preprocessed feature column.
regression.coef_

array([[4989.90118006, -117.54738915,  -71.5693016 , -811.65093156,
          23.86496768,  -22.98019608,  128.63802707, -551.41625636,
         835.50098553]])

In [23]:
# Fitted intercept term of the baseline model.
regression.intercept_

array([3944.254662])

In [29]:
    import numpy as np
    def evaluate_model(true,predicted):
        """Compute standard regression metrics for a set of predictions.

        Parameters
        ----------
        true : array-like
            Ground-truth target values.
        predicted : array-like
            Model predictions, aligned position-by-position with ``true``.

        Returns
        -------
        tuple
            ``(mae, mse, rmse, r2_square)``: mean absolute error, mean squared
            error, root mean squared error, and R² score (as a fraction, not a
            percentage).
        """
        mae = mean_absolute_error(true, predicted)
        mse = mean_squared_error(true, predicted)
        # RMSE is the square root of the MSE computed above; reuse it instead
        # of evaluating mean_squared_error a second time.
        rmse = np.sqrt(mse)
        r2_square = r2_score(true, predicted)
        return mae, mse, rmse, r2_square


In [30]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Candidate linear models, each with its default hyperparameters.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}

# Parallel result lists, filled in the loop below in the same order as `models`:
# the fitted estimators, their names, and their R² scores on the test split.
trained_model_list = []
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on the held-out test split and score those predictions.
    y_pred = model.predict(X_test)
    mae, mse, rmse, r2_square = evaluate_model(y_test, y_pred)

    # Record the results so later cells can compare or select models;
    # previously only the name was stored and the other lists stayed empty.
    model_list.append(model_name)
    trained_model_list.append(model)
    r2_list.append(r2_square)

    print(model_name)
    # NOTE(review): the heading says "training", but the metrics printed below
    # are computed from X_test / y_test. The text is left unchanged so that it
    # keeps matching previously saved outputs.
    print("Model training performance")
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R² score:", r2_square * 100)

    print('=' * 35)
    print('\n')


LinearRegression
Model training performance
MAE: 813.809226685926
MSE: 1391561.362208422
RMSE: 1179.644591480172
R² score: 91.4417386050815


Lasso
Model training performance
MAE: 815.4198090362225
MSE: 1394371.672450772
RMSE: 1180.8351588815317
R² score: 91.42445487594959


Ridge
Model training performance
MAE: 814.0750740594788
MSE: 1391982.7923832168
RMSE: 1179.823203867095
R² score: 91.43914676134858


ElasticNet
Model training performance
MAE: 1079.23871584633
MSE: 2654517.353624659
RMSE: 1629.2689629476954
R² score: 83.6744149366052


