In [2]:
import pandas as pd 
import numpy as np 

In [3]:
df = pd.read_csv(r'E:\Diamond price prediction\notebook\data\gemstone.csv')

In [4]:
df.drop(columns='id',axis=1 , inplace=True)

In [5]:
X = df.drop(labels=['price'],axis=1)
y = df[['price']]

In [6]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [8]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [11]:
# from domain website  RANKED 
cut_map = ['Fair' ,'Good' ,'Very Good' ,'Premium' ,'Ideal']
clearity_map = ['I1' ,'SI2' ,'SI1' ,'VS2' ,  'VS1' ,'VVS2' , 'VVS1' , 'IF']
colour_map = ['D','E','F','G','H','I','J']

pipelining : we have to automate the handing for  categorical variables , standard scalar, missing values 

In [12]:
# for data 
from sklearn.impute import SimpleImputer  # impute null values to what we want 
from sklearn.preprocessing import StandardScaler # to scale 
from sklearn.preprocessing import OrdinalEncoder  #categories to rank 

In [15]:
# for automate pipelining 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer  # to join the pipelines 

we have to make two pipleines for num/cat data 

In [26]:
numerical_pipeline = Pipeline(
    steps = [('imputer',SimpleImputer(strategy='median')),
             ('scaler',StandardScaler())
    ]
)

categorical_pipeline =  Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=[cut_map,colour_map,clearity_map])),
        ('scaler',StandardScaler())
    ]
)

In [27]:
# join these pipelines 
preprocesser = ColumnTransformer(
    transformers=[('numerical_pipe',numerical_pipeline,numerical_cols),
                  ('categorical_pipe',categorical_pipeline,categorical_cols)
    ]

)



In [28]:
# train test split 
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=42)

In [29]:
X_train = pd.DataFrame(preprocesser.fit_transform(X_train),columns=preprocesser.get_feature_names_out())
X_test = pd.DataFrame(preprocesser.transform(X_test),columns=preprocesser.get_feature_names_out())

In [36]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [41]:
def evaluate_model(true, predicted):
    mae  = mean_squared_error(true,predicted)
    mse  = mean_absolute_error(true,predicted)
    r2_square  = r2_score(true, predicted)
    rmse = np.sqrt(mse)

    return mae, mse ,r2_square,rmse 

In [43]:
models = {
    'LinearRegression' : LinearRegression(),
    'Lasso,': Lasso(),
    'Ridge':Ridge(),
    'ElasticNet': ElasticNet()
}

train_model_list = []
model_list  = []
r2_list = []


for  i in range(len(list(models))):
     model = list(models.values())[i]
     model.fit(X_train,y_train)

     y_pred = model.predict(X_test)

     mae, mse , r2_square ,rmse = evaluate_model(y_test,y_pred)

     print(list(models.keys())[i])
     model_list.append(list(models.keys())[i])

     print('Model Training perfomance ')

     print('MSE : ',  mse )
     print('RMSE : ', rmse )
     print('MAE : ',  mae )
     print('r2_score : ',  r2_square)

     r2_list.append(r2_square)

     print('='*35)
     print('\n')


LinearRegression
Model Training perfomance 
MSE :  678.6599280042687
RMSE :  26.051102241637853
MAE :  1030398.1767863379
r2_score :  0.9362334489637524


Lasso,
Model Training perfomance 
MSE :  679.936239443818
RMSE :  26.075587039294398
MAE :  1030358.896587914
r2_score :  0.9362358798325504


Ridge
Model Training perfomance 
MSE :  678.6934635146449
RMSE :  26.05174588227524
MAE :  1030402.7644628948
r2_score :  0.9362331650537865


ElasticNet
Model Training perfomance 
MSE :  1065.7688041796291
RMSE :  32.646114687350305
MAE :  2329237.260977818
r2_score :  0.855854338619943




In [44]:
model_list

['LinearRegression', 'Lasso,', 'Ridge', 'ElasticNet']

In [47]:
r2_list


[0.9362334489637524, 0.9362358798325504, 0.9362331650537865, 0.855854338619943]