In [2]:
from sklearn.linear_model import LinearRegression , Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd 
import numpy as np

# Load the gemstone data and separate the predictors from the target column.
df = pd.read_csv("data/gemstone.csv")
X = df.drop(columns=["price"])
Y = df[["price"]]  # double brackets keep the target as a one-column DataFrame

# Object-dtype columns are routed to the ordinal-encoding pipeline; every
# other column is treated as numeric and scaled.
categorical_col = X.select_dtypes(include="object").columns
numerical_col = X.select_dtypes(exclude="object").columns

# Custom category order for each ordinal variable. These lists are handed to
# OrdinalEncoder, which encodes each value as its position in the list.
cut_cat = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_cat = ["D", "E", "F", "G", "H", "I", "J"]
clarity_cat = [
    "I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF",
]

## Numerical pipeline: median-impute missing values, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

## Categorical pipeline: mode-impute, ordinal-encode, then standardize.
# NOTE(review): OrdinalEncoder matches each category list to a column by
# position, so this presumably relies on categorical_col being ordered
# cut, color, clarity -- confirm against the CSV column order.
ordinal_encoder = OrdinalEncoder(categories=[cut_cat, color_cat, clarity_cat])
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("OrdinalEncoder", ordinal_encoder),
    ("scalar", StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Apply each pipeline to its own column group and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_col),
        ("cat_pipeline", cat_pipeline, categorical_col),
    ]
)


# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=30
)

# Fit the preprocessor on the training rows only, then apply the already
# fitted transformer to the test rows.
train_features = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_features, columns=feature_names)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)

# Baseline: ordinary least squares on the transformed training data.
regression = LinearRegression()
regression.fit(X_train, y_train)

# Bare attribute lookups: they have no effect when run as a script and are
# only displayed when evaluated as the last expression of an interactive cell.
regression.coef_
regression.intercept_

def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted target values, aligned with ``true``.

    Returns:
        A ``(mae, rmse, r2_square)`` tuple holding the mean absolute error,
        the root mean squared error, and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and take its square root; the previous version
    # scored it twice and left the first result unused.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

## Train several linear models and compare them on the held-out split

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}

# NOTE(review): trained_model_list is never appended to in the visible code;
# it is kept only so the name stays defined for anything that reads it later.
trained_model_list = []
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Predictions are made on X_test, so the metrics printed below are
    # test-set metrics even though the label says "Training".
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(name)
    model_list.append(name)
    print('Model Training Performance')
    print('RMSE:', rmse)
    print('MAE:', mae)
    print('R2 score:', r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')
    

LinearRegression
Model Training Performance
RMSE: 1013.9048025658439
MAE: 674.0257224619528
R2 score: 93.68908132630604


Lasso
Model Training Performance
RMSE: 1013.8784227049384
MAE: 675.0716918421082
R2 score: 93.68940971806553


Ridge
Model Training Performance
RMSE: 1013.9060209095798
MAE: 674.0557936094762
R2 score: 93.68906615945242


ElasticNet
Model Training Performance
RMSE: 1533.4194703386765
MAE: 1060.742542700526
R2 score: 85.56488759855306


