In [9]:
import numpy as np
import pandas as pd

In [10]:
# Load the gemstone dataset. The original machine-specific location is kept as
# the default so existing behaviour is unchanged, but it can be overridden by
# setting the GEMSTONE_CSV environment variable, which lets the notebook run on
# machines that do not share this exact directory layout.
import os

DATA_PATH = os.environ.get("GEMSTONE_CSV") or r"D:\Github\House_price\notebooks\data\gemstone.csv"
df = pd.read_csv(DATA_PATH)

In [11]:
# Remove the `id` column so it does not end up in the feature matrix built from
# `df` further down. `errors="ignore"` keeps this cell safe to re-run: `df` is
# overwritten in place, so on a second execution `id` is already gone and a
# plain drop would raise KeyError.
df = df.drop(columns=["id"], errors="ignore")

In [12]:
# Preview the first rows of the frame as a sanity check on the loaded columns.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [13]:
# Print the column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   carat    193573 non-null  float64
 1   cut      193573 non-null  object 
 2   color    193573 non-null  object 
 3   clarity  193573 non-null  object 
 4   depth    193573 non-null  float64
 5   table    193573 non-null  float64
 6   x        193573 non-null  float64
 7   y        193573 non-null  float64
 8   z        193573 non-null  float64
 9   price    193573 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 14.8+ MB


In [14]:
# Separate the regression target from the predictors: `y` is the price column,
# `X` is every remaining column of `df`.
y = df['price']
X = df.drop(columns=['price'])

In [30]:
# Partition the feature columns by dtype family instead of comparing against
# the literal 'object' dtype. Text columns stored with a pandas string dtype or
# as `category` are not 'object', so the old comparison would have routed them
# into the numerical pipeline. `select_dtypes` preserves column order, which
# matters because the OrdinalEncoder receives its `categories` positionally.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

In [31]:
# Explicit level ordering for each categorical feature. The position of a level
# in its list is the integer the ordinal encoder assigns to it.
# NOTE(review): cut and clarity look like they run from lowest to highest
# grade, whereas color is listed D -> J — confirm the direction is intended.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [33]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, map
# each level to its position in the matching category list, then standardise.
# The three category lists are matched to the incoming columns by position.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scalar', StandardScaler()),
])

# Route each column group through its own pipeline; the transformer names
# become the prefixes of the output feature names.
preprocessing = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [38]:
# Fit the preprocessing on the training split only and wrap the result in a
# labelled DataFrame. The original row index is carried over so that X_train
# stays index-aligned with y_train; without `index=` the frame would get a
# fresh 0..n-1 index and any pandas-aligned operation between the two would
# silently pair up the wrong rows.
X_train = pd.DataFrame(
    preprocessing.fit_transform(X_train),
    columns=preprocessing.get_feature_names_out(),
    index=X_train.index,
)

In [39]:
# Apply the already-fitted preprocessing to the hold-out split (transform only,
# no refit) and wrap the result in a labelled DataFrame. The original row index
# is carried over so that X_test stays index-aligned with y_test.
X_test = pd.DataFrame(
    preprocessing.transform(X_test),
    columns=preprocessing.get_feature_names_out(),
    index=X_test.index,
)

In [40]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [42]:
# Baseline model: a linear regression fitted on the preprocessed training
# split, then used to predict the hold-out split.
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [43]:
def evaluate_model(y, y_pred):
    """Score a set of predictions against the true targets.

    Parameters
    ----------
    y : true target values, passed as the first argument to each metric.
    y_pred : predicted values, passed as the second argument to each metric.

    Returns
    -------
    tuple
        ``(mae, mse, r2)`` — mean absolute error, mean squared error and
        R² score, in that order.
    """
    return (
        mean_absolute_error(y, y_pred),
        mean_squared_error(y, y_pred),
        r2_score(y, y_pred),
    )


In [44]:
# Score the baseline predictions on the hold-out split and print MAE, MSE and
# R², one value per line in that order.
mae, mse, r2 = evaluate_model(y=y_test, y_pred=y_pred)
print(f"{mae}\n{mse}\n{r2}")

681.8252363645279
1065925.560543789
0.9340166937428288


In [47]:
# Candidate linear models, all constructed with their default hyper-parameters.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
}

# Evaluation order; identical to the dict's insertion order.
model_list = list(models)

# Per-model hold-out scores, index-aligned with `model_list`. These lists were
# previously declared but never filled.
r2_scores = []
mae_scores = []
mse_scores = []

for model_name in model_list:
    model = models[model_name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae, mse, r2 = evaluate_model(y_test, y_pred)

    r2_scores.append(r2)
    mae_scores.append(mae)
    mse_scores.append(mse)

    print(f"Model Name: {model_name}")
    # This value is the R² score; it used to be printed under the label "RMSE".
    print(f"R2 score: {r2}")
    print(f"MSE: {mse}")
    print(f"MAE: {mae}")
    print("=====================")


Model Name: LinearRegression
RMSE: 0.9340166937428288
MSE: 1065925.560543789
MAE: 681.8252363645279
Model Name: Lasso
RMSE: 0.934046829905509
MSE: 1065438.726707782
MAE: 682.8792433648141
Model Name: Ridge
RMSE: 0.9340170690836792
MSE: 1065919.4971100355
MAE: 681.8533031469099
Model Name: ElasticNet
RMSE: 0.854614919293956
MSE: 2348619.407495231
MAE: 1062.672056962228
