Imported Libraries:



In [2]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

Import Dataset:

In [3]:
# Location of the gemstone CSV. The default is the original author's local path;
# set the GEMSTONE_CSV environment variable to point at the file on any other
# machine instead of editing this cell.
DATA_PATH=os.environ.get('GEMSTONE_CSV',r'E:\All project file\DimondPricePrediction\notebooks\data\gemstone.csv')
df=pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Remove the `id` column, then separate the features (x) from the target (y).
# `df` is re-bound rather than modified in place, and an already-missing `id`
# is tolerated, so this cell can be re-run safely (the in-place drop raised
# KeyError on a second run because `id` was already gone).
df=df.drop(columns=['id'],errors='ignore')
x=df.drop(columns=['price'])
y=df['price']

In [6]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, columns of every other dtype as numerical.
numerical_cols = x.select_dtypes(exclude=['object']).columns
categorical_cols = x.select_dtypes(include=['object']).columns

In [7]:
# Explicit category order for each ordinal variable; these lists are handed to
# the OrdinalEncoder, so a category's position in its list becomes its code.
## For Domain Purpose https://www.americangemsociety.org/ags-diamond-grading-system/
# NOTE(review): cut and clarity look like they are listed worst -> best, while
# color looks like best -> worst (D first) -- confirm this is intended; for the
# linear models used here it should only flip the sign of the color coefficient.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [8]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## numerical pipeline: fill gaps with the column median, then standardise
num_pipline=Pipeline(
    [
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler()),
    ]
)

## categorical pipeline: fill gaps with the most frequent value, map each
## category to its position in the custom order lists, then standardise.
## The order lists are passed as cut, color, clarity -- they are matched to the
## categorical columns by position.
cat_pipeliene=Pipeline(
    [
        ("imputer",SimpleImputer(strategy="most_frequent")),
        (
            "ordinalencoder",
            OrdinalEncoder(
                categories=[cut_categories,color_categories,clarity_categories]
            ),
        ),
        ("scaler",StandardScaler()),
    ]
)

## Combine both pipelines: each one is applied to its own group of columns.
## The labels below become the prefixes of the output feature names.
preprocessor=ColumnTransformer(
    transformers=[
        ("num_pipline",num_pipline,numerical_cols),
        ("cat_pipline",cat_pipeliene,categorical_cols),
    ]
)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test split; the fixed random_state makes the
# split reproducible across runs.
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split. Both results are wrapped back into
# DataFrames labelled with the preprocessor's output feature names.
# NOTE: x_train / x_test are overwritten here, so re-run the split cell before
# re-running this one.
train_matrix=preprocessor.fit_transform(x_train)
feature_names=preprocessor.get_feature_names_out()
x_train=pd.DataFrame(train_matrix,columns=feature_names)
x_test=pd.DataFrame(preprocessor.transform(x_test),columns=feature_names)

In [12]:
# Preview the first five rows of the transformed training features.
x_train.head(n=5)

Unnamed: 0,num_pipline__carat,num_pipline__depth,num_pipline__table,num_pipline__x,num_pipline__y,num_pipline__z,cat_pipline__cut,cat_pipline__color,cat_pipline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [13]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [14]:
# Baseline model: ordinary least-squares regression fitted on the
# preprocessed training features and the training prices.
regressor=LinearRegression()
regressor.fit(X=x_train,y=y_train)

In [16]:
# Learned coefficients of the fitted linear regression (one per column of x_train).
regressor.coef_

array([ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
        -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
         652.10059539])

In [17]:
# Learned intercept (bias term) of the fitted linear regression.
regressor.intercept_

np.float64(3976.878738902296)

In [18]:
def evaluate_model(y_test,y_pred):
    """Score predictions against the true target values.

    Parameters
    ----------
    y_test : true target values.
    y_pred : predicted values, in the same order as ``y_test``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: root mean squared error, mean absolute error and
        R^2 score, in that order.
    """
    # RMSE is the square root of the mean squared error.
    root_mean_sq_err=np.sqrt(mean_squared_error(y_test,y_pred))
    mean_abs_err=mean_absolute_error(y_test,y_pred)
    r_squared=r2_score(y_test,y_pred)
    return root_mean_sq_err,mean_abs_err,r_squared

In [23]:
## Train multiple models
## Model Evaluation
# Candidate regressors, keyed by display name. The dict is called `models` so
# that the loop variable `model` no longer overwrites it; after the loop
# `model` still refers to the last fitted estimator, as it did before.
models={
    "Linear Regression":LinearRegression(),
    "Ridge Regression":Ridge(),
    "Lasso Regression":Lasso(),
    "ElasticNet Regression":ElasticNet()
}

# Parallel result lists: entry i of each list belongs to the same model.
train_model_list=[]
model_list=[]
r2_list=[]

for model_name,model in models.items():
    # Fit on the training split, score on the held-out test split.
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    rmse,mae,r2=evaluate_model(y_test,y_pred)

    train_model_list.append(model_name)
    model_list.append(model)
    # Recorded exactly once per model so r2_list stays aligned with model_list
    # (it was previously appended twice per iteration).
    r2_list.append(r2)

    # The metrics below come from x_test / y_test, hence "Test" in the header.
    print(model_name)
    print('Model Test Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2*100)

    print('='*35)
    print('\n')

Model Training Performance
RMSE: 1014.6296630375463
MAE: 675.0758270067483
R2 score 93.62906819996049


Model Training Performance
RMSE: 1014.6343233534411
MAE: 675.1077629781329
R2 score 93.62900967491632


Model Training Performance
RMSE: 1014.659130275064
MAE: 676.2421173665508
R2 score 93.62869814082755


Model Training Performance
RMSE: 1533.3541245902313
MAE: 1060.9432977143008
R2 score 85.44967219374031




In [24]:
# Estimators collected by the model comparison loop, in the order they were fitted.
model_list

[LinearRegression(), Ridge(), Lasso(), ElasticNet()]