In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

# Read the dataset and overwrite its header with the ten column names used in this notebook.
DATA_PATH = 'Diamond Price Prediction.csv'
COLUMN_NAMES = ['carat','cut','color','clarity','depth','table','price','x','y','z']

df = pd.read_csv(DATA_PATH)
df.columns = COLUMN_NAMES

In [2]:
# Category orderings handed to the OrdinalEncoder in the categorical pipeline:
# a value's position in its list is the integer code it is encoded as.
# NOTE(review): the cut and clarity lists look like lowest-to-highest grade, while
# color simply runs D..J alphabetically - confirm the intended direction for color.
CutRankLst = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
ColorRankLst = list('DEFGHIJ')
ClarityRankLst = 'I1 SI2 SI1 VS2 VS1 VVS2 VVS1 IF'.split()

## DataFrame Split

In [3]:
# Separate the target (kept as a one-column DataFrame) from the predictor columns.
target_columns = ['price']
y = df[target_columns]
x = df.drop(columns=target_columns)

In [4]:
# Group predictor names by dtype so each group can be routed to its own preprocessing pipeline.
numercal_column_name = x.select_dtypes(include=['int64','float64']).columns
categorical_column_name = x.select_dtypes(include=['O']).columns

In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=29,
)

## Model Building

In [6]:
# Candidate regressors, each built with its default hyper-parameters and keyed by class name.
models = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (LinearRegression, Lasso, Ridge, ElasticNet)
}

## Pipeline Building

In [7]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_pipeline = Pipeline([
    ('imputation', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
])

In [8]:
# Categorical columns: fill gaps with the most frequent level, replace each level by its
# position in the matching rank list, then standardize the resulting codes.
# The rank lists are matched to columns by position, so they must follow the order of
# the categorical columns passed to this pipeline.
ordinal_categories = [CutRankLst, ColorRankLst, ClarityRankLst]
categorical_pipeline = Pipeline([
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordinal_categories)),
    ('scaler', StandardScaler()),
])

## Pipeline Combining

In [9]:
# Route each column group through its own pipeline and concatenate the results
# (numeric block first, categorical block second).
column_routes = [
    ('numeric_transformer', numeric_pipeline, numercal_column_name),
    ('categorical_transformer', categorical_pipeline, categorical_column_name),
]
preprocessor = ColumnTransformer(column_routes)

## Error and Score Checking

In [10]:
def ErrorScoreChecker(tv,pv):
    """Score predictions against the true target values.

    Parameters
    ----------
    tv : array-like
        True target values.
    pv : array-like
        Predicted target values, in the same row order as ``tv``.

    Returns
    -------
    tuple
        ``(MAE, MSE, R2, RMSE)`` in that order: mean absolute error,
        mean squared error, coefficient of determination, and the
        square root of the mean squared error.
    """
    MAE = mean_absolute_error(tv,pv)
    MSE = mean_squared_error(tv,pv)
    R2 = r2_score(tv,pv)
    # Derive RMSE from the MSE computed above instead of scoring the data a second time.
    RMSE = np.sqrt(MSE)
    return MAE, MSE, R2, RMSE

## Testing Models

In [11]:
# Fit the preprocessor on the training rows only, then apply the fitted transform to both splits.
# The original row index is carried over so the feature frames stay label-aligned with
# y_train / y_test (a fresh 0..n-1 index would silently misalign any later join or concat).
# NOTE: X_train / X_test are rebound to the transformed frames here, so re-run the
# train/test split cell before re-running this one.
train_features = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_features, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names, index=X_test.index)

In [20]:
# Fit every candidate on the training split and print its hold-out metrics.
# Each result block starts with the model's name so the blocks can be told apart.
for name, model in models.items():
    model.fit(X_train,y_train)
    yp = model.predict(X_test)
    mae, mse, r2, rmse = ErrorScoreChecker(y_test,yp)
    print(name)
    print('mae = ',mae)
    print('mse = ',mse)
    print('rmse = ',rmse)
    print('r2 = ',r2*100)  # R^2 reported as a percentage
    print('\n','='*35)

mae =  808.6124190183048
mse =  1491798.7519549378
rmse =  1221.3921368483334
r2 =  90.58932375073819

mae =  809.7112801181503
mse =  1491801.4941975847
rmse =  1221.3932594367732
r2 =  90.5893064519184

mae =  808.7147987793674
mse =  1491778.2917673967
rmse =  1221.3837610544022
r2 =  90.58945281922057

mae =  1074.8959508426833
mse =  2602022.97196191
rmse =  1613.0787246634648
r2 =  83.58572444829659

